1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
4
5; FIXME: Broken on evergreen
6; FIXME: For some reason the 8 and 16 vectors are being stored as
7; individual elements instead of 128-bit stores.
8
; Constant-index insert of 5.0 (0x40a00000) into element 0 of <2 x float>:
; the immediate is materialized directly into v0 and the pair is written
; with a single buffer_store_dwordx2 (no movrel/cndmask needed).
define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
; SI-LABEL: insertelement_v2f32_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v2f32_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0
  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
  ret void
}
37
; Same as insertelement_v2f32_0 but inserting at element 1: the constant
; goes to v1 and the untouched element 0 (s6) is copied into v0.
define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
; SI-LABEL: insertelement_v2f32_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v2f32_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1
  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
  ret void
}
66
; Integer variant: insert 999 (0x3e7) at constant element 0 of <2 x i32>;
; codegen mirrors the f32 case with the immediate placed straight in v0.
define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
; SI-LABEL: insertelement_v2i32_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v2i32_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, 0x3e7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i32> %a, i32 999, i32 0
  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
  ret void
}
95
; Integer variant at element 1: 999 goes to v1, the preserved lane (s6)
; to v0, stored as one dwordx2.
define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
; SI-LABEL: insertelement_v2i32_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, 0x3e7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v2i32_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, 0x3e7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i32> %a, i32 999, i32 1
  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
  ret void
}
124
125; FIXME: Why is the constant moved into the intermediate register and
126; not just directly into the vector component?
; <4 x float> constant-index insert at element 0. Note the constant is
; first moved into SGPR s0 and only then copied to v0 (the FIXME above
; questions this intermediate hop) before the single dwordx4 store.
define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, 0x40a00000
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4f32_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, 0x40a00000
; VI-NEXT:    s_mov_b32 s7, 0x1100f000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
161
; <4 x float> constant-index insert at element 1: the SGPR holding lane 1
; (s1) is overwritten with the constant before the vector is copied to
; VGPRs and stored as one dwordx4.
define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s1, 0x40a00000
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4f32_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s1, 0x40a00000
; VI-NEXT:    s_mov_b32 s7, 0x1100f000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
196
; <4 x float> constant-index insert at element 2: s2 is overwritten with
; the constant; otherwise identical shape to the element-0/1 tests.
define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s2, 0x40a00000
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4f32_2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s2, 0x40a00000
; VI-NEXT:    s_mov_b32 s7, 0x1100f000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
231
; <4 x float> constant-index insert at the last element (3): s3 takes the
; constant; completes the per-index coverage for the v4f32 case.
define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_3:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s3, 0x40a00000
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4f32_3:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s3, 0x40a00000
; VI-NEXT:    s_mov_b32 s7, 0x1100f000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
266
; <4 x i32> constant-index insert at element 0: 999 fits a 16-bit signed
; immediate, so s_movk_i32 is used instead of s_mov_b32.
define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
; SI-LABEL: insertelement_v4i32_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_movk_i32 s0, 0x3e7
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4i32_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s0, 0x3e7
; VI-NEXT:    s_mov_b32 s7, 0x1100f000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x i32> %a, i32 999, i32 0
  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  ret void
}
301
; <3 x float> constant-index insert at element 1: the constant goes
; straight into v1, lanes 0 and 2 are copied from SGPRs, and the result
; is written with a single buffer_store_dwordx3.
define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
; SI-LABEL: insertelement_v3f32_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v3f32_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_mov_b32 s7, 0x1100f000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
332
; <3 x float> constant-index insert at element 2: the constant goes into
; v2 and the surviving lanes 0 and 1 are copied before the dwordx3 store.
define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
; SI-LABEL: insertelement_v3f32_2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v3f32_2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_mov_b32 s7, 0x1100f000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
363
; Index 3 is out of bounds for <3 x float>, so the insertelement result
; is poison and the store is deleted entirely — the kernel compiles to a
; bare s_endpgm on both subtargets (single GCN check prefix).
define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
; GCN-LABEL: insertelement_v3f32_3:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
372
; Non-kernel function: the insert target is a sampler resource operand of
; image_gather4_lz, which must stay in SGPRs, so inserting the constant 0
; is done with a scalar s_mov_b32 rather than any VGPR traffic.
define <4 x float> @insertelement_to_sgpr() nounwind {
; GCN-LABEL: insertelement_to_sgpr:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, 0
; GCN-NEXT:    image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
  %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
  %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
  ret <4 x float> %tmp2
}
388
; Dynamic (variable) index into <2 x float>: lowered as one compare +
; v_cndmask per lane (s_cmp_lg_u32 against each index, select between the
; constant in v0 and the original lane value) rather than movrel.
define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_lg_u32 s6, 1
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    s_cmp_lg_u32 s6, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s4
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s6, 1
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_cmp_lg_u32 s6, 0
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
  ret void
}
433
; Dynamic-index insert into <3 x float>: three compare/v_cndmask pairs
; (indices 2, 1, 0) select between the constant and each original lane,
; then a single dwordx3 store writes the result.
define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s8, s[4:5], 0x8
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_lg_u32 s8, 2
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v1, s6
; SI-NEXT:    s_cmp_lg_u32 s8, 1
; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_lg_u32 s8, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v3, s4
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s8, s[4:5], 0x20
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s8, 2
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_mov_b32_e32 v1, s6
; VI-NEXT:    s_cmp_lg_u32 s8, 1
; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_lg_u32 s8, 0
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s4
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
486
; Dynamic-index insert into <4 x float>: four compare/v_cndmask pairs
; (indices 3 down to 0) — still cheaper than movrel for a 4-element
; vector — followed by one dwordx4 store.
define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s8, s[4:5], 0x8
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_lg_u32 s8, 3
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    s_cmp_lg_u32 s8, 2
; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s6
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_lg_u32 s8, 1
; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_lg_u32 s8, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s8, s[4:5], 0x20
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s8, 3
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    s_cmp_lg_u32 s8, 2
; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s6
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_lg_u32 s8, 1
; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_lg_u32 s8, 0
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
547
; Dynamic-index insert into <8 x float>: at this width the backend
; switches strategy from per-lane cndmask to the indirect-register path —
; the index is copied to m0 and a single v_movreld_b32 writes the lane,
; then the vector is stored as two dwordx4 halves.
define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v8f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
; SI-NEXT:    v_mov_b32_e32 v8, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_mov_b32_e32 v3, s11
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_mov_b32_e32 v6, s14
; SI-NEXT:    v_mov_b32_e32 v7, s15
; SI-NEXT:    s_mov_b32 m0, s4
; SI-NEXT:    v_movreld_b32_e32 v0, v8
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
; VI-NEXT:    v_mov_b32_e32 v8, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_mov_b32_e32 v6, s14
; VI-NEXT:    v_mov_b32_e32 v7, s15
; VI-NEXT:    s_mov_b32 m0, s4
; VI-NEXT:    v_movreld_b32_e32 v0, v8
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
  store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
  ret void
}
598
; Dynamic-index insert into <16 x float>: same m0 + v_movreld_b32
; indirect-write strategy as the v8 case, with the full vector expanded
; into v0-v15 and stored as four dwordx4 chunks.
define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
; SI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_mov_b32_e32 v3, s11
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_mov_b32_e32 v6, s14
; SI-NEXT:    v_mov_b32_e32 v7, s15
; SI-NEXT:    v_mov_b32_e32 v8, s16
; SI-NEXT:    v_mov_b32_e32 v9, s17
; SI-NEXT:    v_mov_b32_e32 v10, s18
; SI-NEXT:    v_mov_b32_e32 v11, s19
; SI-NEXT:    v_mov_b32_e32 v12, s20
; SI-NEXT:    v_mov_b32_e32 v13, s21
; SI-NEXT:    v_mov_b32_e32 v14, s22
; SI-NEXT:    v_mov_b32_e32 v15, s23
; SI-NEXT:    s_mov_b32 m0, s4
; SI-NEXT:    v_movreld_b32_e32 v0, v16
; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v16f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
; VI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_mov_b32_e32 v6, s14
; VI-NEXT:    v_mov_b32_e32 v7, s15
; VI-NEXT:    v_mov_b32_e32 v8, s16
; VI-NEXT:    v_mov_b32_e32 v9, s17
; VI-NEXT:    v_mov_b32_e32 v10, s18
; VI-NEXT:    v_mov_b32_e32 v11, s19
; VI-NEXT:    v_mov_b32_e32 v12, s20
; VI-NEXT:    v_mov_b32_e32 v13, s21
; VI-NEXT:    v_mov_b32_e32 v14, s22
; VI-NEXT:    v_mov_b32_e32 v15, s23
; VI-NEXT:    s_mov_b32 m0, s4
; VI-NEXT:    v_movreld_b32_e32 v0, v16
; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
  store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
  ret void
}
669
; Dynamic-index insert of the inline-constant 5 into <2 x i32>. SI uses
; v_cndmask with 5 as an inline operand; VI instead resolves the selects
; entirely on the scalar unit with s_cselect_b32 — the two subtargets
; diverge here, which is what the split check prefixes capture.
define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s8, s[4:5], 0x4
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_lg_u32 s8, 1
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_lg_u32 s8, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s8, s[4:5], 0x10
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s8, 1
; VI-NEXT:    s_cselect_b32 s4, s7, 5
; VI-NEXT:    s_cmp_lg_u32 s8, 0
; VI-NEXT:    s_cselect_b32 s5, s6, 5
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
  ret void
}
710
; Dynamic-index insert of 5 into <3 x i32>: like the v2i32 case, SI keeps
; the per-lane selects in VALU (v_cndmask with the inline constant 5)
; while VI folds all three selects into scalar s_cselect_b32 ops before
; the dwordx3 store.
define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s8, s[4:5], 0x8
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_lg_u32 s8, 2
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_lg_u32 s8, 1
; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_lg_u32 s8, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s8, s[4:5], 0x20
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_mov_b32 s7, 0x1100f000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s8, 2
; VI-NEXT:    s_cselect_b32 s2, s2, 5
; VI-NEXT:    s_cmp_lg_u32 s8, 1
; VI-NEXT:    s_cselect_b32 s1, s1, 5
; VI-NEXT:    s_cmp_lg_u32 s8, 0
; VI-NEXT:    s_cselect_b32 s0, s0, 5
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
  store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
  ret void
}
758
; Dynamic-index insert of a non-constant SGPR value (%val, loaded from a
; far kernarg offset past the [8 x i32] padding: 0x11 on SI, 0x44 on VI).
; SI selects per lane with s_cmp_eq + v_cndmask against v4 (= %val); VI
; again does it with scalar s_cselect_b32 before the dwordx4 store.
define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
; SI-LABEL: dynamic_insertelement_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s4, s[4:5], 0x11
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s6, 3
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    s_cmp_eq_u32 s6, 2
; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_eq_u32 s6, 1
; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_eq_u32 s6, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT:    s_load_dword s8, s[4:5], 0x20
; VI-NEXT:    s_load_dword s9, s[4:5], 0x44
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_mov_b32 s7, 0x1100f000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s8, 3
; VI-NEXT:    s_cselect_b32 s3, s9, s3
; VI-NEXT:    s_cmp_eq_u32 s8, 2
; VI-NEXT:    s_cselect_b32 s2, s9, s2
; VI-NEXT:    s_cmp_eq_u32 s8, 1
; VI-NEXT:    s_cselect_b32 s1, s9, s1
; VI-NEXT:    s_cmp_eq_u32 s8, 0
; VI-NEXT:    s_cselect_b32 s0, s9, s0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  ret void
}
816
; Insert constant i32 5 into <8 x i32> %a at runtime index %b. Both targets
; materialize the vector in v0-v7 and use m0 + v_movreld_b32 for the indexed
; register write, then store with two dwordx4 buffer stores.
; CHECK lines are autogenerated by update_llc_test_checks.py; do not hand-edit.
817define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
818; SI-LABEL: dynamic_insertelement_v8i32:
819; SI:       ; %bb.0:
820; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
821; SI-NEXT:    s_load_dword s6, s[4:5], 0x10
822; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
823; SI-NEXT:    s_mov_b32 s3, 0x100f000
824; SI-NEXT:    s_mov_b32 s2, -1
825; SI-NEXT:    s_waitcnt lgkmcnt(0)
826; SI-NEXT:    v_mov_b32_e32 v0, s8
827; SI-NEXT:    v_mov_b32_e32 v1, s9
828; SI-NEXT:    v_mov_b32_e32 v2, s10
829; SI-NEXT:    v_mov_b32_e32 v3, s11
830; SI-NEXT:    v_mov_b32_e32 v4, s12
831; SI-NEXT:    v_mov_b32_e32 v5, s13
832; SI-NEXT:    v_mov_b32_e32 v6, s14
833; SI-NEXT:    v_mov_b32_e32 v7, s15
834; SI-NEXT:    s_mov_b32 m0, s6
835; SI-NEXT:    v_movreld_b32_e32 v0, 5
836; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
837; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
838; SI-NEXT:    s_endpgm
839;
840; VI-LABEL: dynamic_insertelement_v8i32:
841; VI:       ; %bb.0:
842; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
843; VI-NEXT:    s_load_dword s6, s[4:5], 0x40
844; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
845; VI-NEXT:    s_mov_b32 s3, 0x1100f000
846; VI-NEXT:    s_mov_b32 s2, -1
847; VI-NEXT:    s_waitcnt lgkmcnt(0)
848; VI-NEXT:    v_mov_b32_e32 v0, s8
849; VI-NEXT:    v_mov_b32_e32 v1, s9
850; VI-NEXT:    v_mov_b32_e32 v2, s10
851; VI-NEXT:    v_mov_b32_e32 v3, s11
852; VI-NEXT:    v_mov_b32_e32 v4, s12
853; VI-NEXT:    v_mov_b32_e32 v5, s13
854; VI-NEXT:    v_mov_b32_e32 v6, s14
855; VI-NEXT:    v_mov_b32_e32 v7, s15
856; VI-NEXT:    s_mov_b32 m0, s6
857; VI-NEXT:    v_movreld_b32_e32 v0, 5
858; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
859; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
860; VI-NEXT:    s_endpgm
861  %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
862  store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
863  ret void
864}
865
; Insert constant i32 5 into <16 x i32> %a at runtime index %b. Same
; m0 + v_movreld_b32 indexed-write strategy as the v8i32 case, scaled up to
; v0-v15 and four dwordx4 buffer stores.
; CHECK lines are autogenerated by update_llc_test_checks.py; do not hand-edit.
866define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
867; SI-LABEL: dynamic_insertelement_v16i32:
868; SI:       ; %bb.0:
869; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
870; SI-NEXT:    s_load_dword s6, s[4:5], 0x20
871; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
872; SI-NEXT:    s_mov_b32 s3, 0x100f000
873; SI-NEXT:    s_mov_b32 s2, -1
874; SI-NEXT:    s_waitcnt lgkmcnt(0)
875; SI-NEXT:    v_mov_b32_e32 v0, s8
876; SI-NEXT:    v_mov_b32_e32 v1, s9
877; SI-NEXT:    v_mov_b32_e32 v2, s10
878; SI-NEXT:    v_mov_b32_e32 v3, s11
879; SI-NEXT:    v_mov_b32_e32 v4, s12
880; SI-NEXT:    v_mov_b32_e32 v5, s13
881; SI-NEXT:    v_mov_b32_e32 v6, s14
882; SI-NEXT:    v_mov_b32_e32 v7, s15
883; SI-NEXT:    v_mov_b32_e32 v8, s16
884; SI-NEXT:    v_mov_b32_e32 v9, s17
885; SI-NEXT:    v_mov_b32_e32 v10, s18
886; SI-NEXT:    v_mov_b32_e32 v11, s19
887; SI-NEXT:    v_mov_b32_e32 v12, s20
888; SI-NEXT:    v_mov_b32_e32 v13, s21
889; SI-NEXT:    v_mov_b32_e32 v14, s22
890; SI-NEXT:    v_mov_b32_e32 v15, s23
891; SI-NEXT:    s_mov_b32 m0, s6
892; SI-NEXT:    v_movreld_b32_e32 v0, 5
893; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
894; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
895; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
896; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
897; SI-NEXT:    s_endpgm
898;
899; VI-LABEL: dynamic_insertelement_v16i32:
900; VI:       ; %bb.0:
901; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
902; VI-NEXT:    s_load_dword s6, s[4:5], 0x80
903; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
904; VI-NEXT:    s_mov_b32 s3, 0x1100f000
905; VI-NEXT:    s_mov_b32 s2, -1
906; VI-NEXT:    s_waitcnt lgkmcnt(0)
907; VI-NEXT:    v_mov_b32_e32 v0, s8
908; VI-NEXT:    v_mov_b32_e32 v1, s9
909; VI-NEXT:    v_mov_b32_e32 v2, s10
910; VI-NEXT:    v_mov_b32_e32 v3, s11
911; VI-NEXT:    v_mov_b32_e32 v4, s12
912; VI-NEXT:    v_mov_b32_e32 v5, s13
913; VI-NEXT:    v_mov_b32_e32 v6, s14
914; VI-NEXT:    v_mov_b32_e32 v7, s15
915; VI-NEXT:    v_mov_b32_e32 v8, s16
916; VI-NEXT:    v_mov_b32_e32 v9, s17
917; VI-NEXT:    v_mov_b32_e32 v10, s18
918; VI-NEXT:    v_mov_b32_e32 v11, s19
919; VI-NEXT:    v_mov_b32_e32 v12, s20
920; VI-NEXT:    v_mov_b32_e32 v13, s21
921; VI-NEXT:    v_mov_b32_e32 v14, s22
922; VI-NEXT:    v_mov_b32_e32 v15, s23
923; VI-NEXT:    s_mov_b32 m0, s6
924; VI-NEXT:    v_movreld_b32_e32 v0, 5
925; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
926; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
927; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
928; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
929; VI-NEXT:    s_endpgm
930  %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
931  store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
932  ret void
933}
934
; Insert constant i16 5 into <2 x i16> %a at runtime index %b. Lowered on
; both targets as scalar bit manipulation on the packed 32-bit value: build a
; 16-bit lane mask (0xffff << (b*16)), clear the lane with s_andn2, and OR in
; the replicated element constant 0x50005 masked to that lane.
; CHECK lines are autogenerated by update_llc_test_checks.py; do not hand-edit.
935define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
936; SI-LABEL: dynamic_insertelement_v2i16:
937; SI:       ; %bb.0:
938; SI-NEXT:    s_load_dword s6, s[4:5], 0x3
939; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
940; SI-NEXT:    s_load_dword s4, s[4:5], 0x2
941; SI-NEXT:    s_mov_b32 s3, 0x100f000
942; SI-NEXT:    s_mov_b32 s2, -1
943; SI-NEXT:    s_waitcnt lgkmcnt(0)
944; SI-NEXT:    s_lshl_b32 s5, s6, 4
945; SI-NEXT:    s_lshl_b32 s5, 0xffff, s5
946; SI-NEXT:    s_andn2_b32 s4, s4, s5
947; SI-NEXT:    s_and_b32 s5, s5, 0x50005
948; SI-NEXT:    s_or_b32 s4, s5, s4
949; SI-NEXT:    v_mov_b32_e32 v0, s4
950; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
951; SI-NEXT:    s_endpgm
952;
953; VI-LABEL: dynamic_insertelement_v2i16:
954; VI:       ; %bb.0:
955; VI-NEXT:    s_load_dword s6, s[4:5], 0xc
956; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
957; VI-NEXT:    s_load_dword s4, s[4:5], 0x8
958; VI-NEXT:    s_mov_b32 s3, 0x1100f000
959; VI-NEXT:    s_mov_b32 s2, -1
960; VI-NEXT:    s_waitcnt lgkmcnt(0)
961; VI-NEXT:    s_lshl_b32 s5, s6, 4
962; VI-NEXT:    s_lshl_b32 s5, 0xffff, s5
963; VI-NEXT:    s_andn2_b32 s4, s4, s5
964; VI-NEXT:    s_and_b32 s5, s5, 0x50005
965; VI-NEXT:    s_or_b32 s4, s5, s4
966; VI-NEXT:    v_mov_b32_e32 v0, s4
967; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
968; VI-NEXT:    s_endpgm
969  %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
970  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
971  ret void
972}
973
; Insert constant i16 5 into <3 x i16> %a at runtime index %b. The packed
; value lives in a 64-bit SGPR pair, so the lane mask/clear/OR sequence uses
; the 64-bit forms (s_lshl_b64 / s_andn2_b64 / s_or_b64); the result is split
; into a dword store plus a short store for the third element.
; CHECK lines are autogenerated by update_llc_test_checks.py; do not hand-edit.
974define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
975; SI-LABEL: dynamic_insertelement_v3i16:
976; SI:       ; %bb.0:
977; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
978; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
979; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
980; SI-NEXT:    s_mov_b32 s3, 0x100f000
981; SI-NEXT:    s_mov_b32 s2, -1
982; SI-NEXT:    s_waitcnt lgkmcnt(0)
983; SI-NEXT:    s_lshl_b32 s8, s6, 4
984; SI-NEXT:    s_mov_b64 s[6:7], 0xffff
985; SI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
986; SI-NEXT:    s_and_b32 s9, s7, 0x50005
987; SI-NEXT:    s_and_b32 s8, s6, 0x50005
988; SI-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
989; SI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
990; SI-NEXT:    v_mov_b32_e32 v0, s5
991; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
992; SI-NEXT:    v_mov_b32_e32 v0, s4
993; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
994; SI-NEXT:    s_endpgm
995;
996; VI-LABEL: dynamic_insertelement_v3i16:
997; VI:       ; %bb.0:
998; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
999; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1000; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
1001; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1002; VI-NEXT:    s_mov_b32 s2, -1
1003; VI-NEXT:    s_waitcnt lgkmcnt(0)
1004; VI-NEXT:    s_lshl_b32 s8, s6, 4
1005; VI-NEXT:    s_mov_b64 s[6:7], 0xffff
1006; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
1007; VI-NEXT:    s_mov_b32 s8, 0x50005
1008; VI-NEXT:    s_mov_b32 s9, s8
1009; VI-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
1010; VI-NEXT:    s_and_b64 s[6:7], s[6:7], s[8:9]
1011; VI-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
1012; VI-NEXT:    v_mov_b32_e32 v0, s5
1013; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
1014; VI-NEXT:    v_mov_b32_e32 v0, s4
1015; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1016; VI-NEXT:    s_endpgm
1017  %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
1018  store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
1019  ret void
1020}
1021
; Insert constant i8 5 into <2 x i8> %a at runtime index %b, using an 8-bit
; lane mask over the packed 16-bit value (replicated constant 0x505). SI does
; the masking entirely in SGPRs; VI goes through v_lshlrev_b16/v_not/v_and in
; VGPRs. The [8 x i32] filler arguments only pad the kernarg offsets.
; CHECK lines are autogenerated by update_llc_test_checks.py; do not hand-edit.
1022define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
1023; SI-LABEL: dynamic_insertelement_v2i8:
1024; SI:       ; %bb.0:
1025; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
1026; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1027; SI-NEXT:    s_load_dword s4, s[4:5], 0xa
1028; SI-NEXT:    s_mov_b32 s3, 0x100f000
1029; SI-NEXT:    s_mov_b32 s2, -1
1030; SI-NEXT:    s_waitcnt lgkmcnt(0)
1031; SI-NEXT:    s_lshl_b32 s5, s6, 3
1032; SI-NEXT:    s_lshl_b32 s5, -1, s5
1033; SI-NEXT:    s_andn2_b32 s4, s4, s5
1034; SI-NEXT:    s_and_b32 s5, s5, 0x505
1035; SI-NEXT:    s_or_b32 s4, s5, s4
1036; SI-NEXT:    v_mov_b32_e32 v0, s4
1037; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1038; SI-NEXT:    s_endpgm
1039;
1040; VI-LABEL: dynamic_insertelement_v2i8:
1041; VI:       ; %bb.0:
1042; VI-NEXT:    s_load_dword s6, s[4:5], 0x4c
1043; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1044; VI-NEXT:    s_load_dword s4, s[4:5], 0x28
1045; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1046; VI-NEXT:    s_mov_b32 s2, -1
1047; VI-NEXT:    s_waitcnt lgkmcnt(0)
1048; VI-NEXT:    s_lshl_b32 s5, s6, 3
1049; VI-NEXT:    v_lshlrev_b16_e64 v0, s5, -1
1050; VI-NEXT:    v_not_b32_e32 v1, v0
1051; VI-NEXT:    v_and_b32_e32 v1, s4, v1
1052; VI-NEXT:    v_and_b32_e32 v0, 0x505, v0
1053; VI-NEXT:    v_or_b32_e32 v0, v0, v1
1054; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1055; VI-NEXT:    s_endpgm
1056  %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
1057  store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
1058  ret void
1059}
1060
1061; FIXME: post legalize i16 and i32 shifts aren't merged because of
1062; isTypeDesirableForOp in SimplifyDemandedBits
; Insert constant i8 5 into <3 x i8> %a at runtime index %b. Scalar lane
; mask/clear/OR on the packed 32-bit value (replicated constant 0x5050505),
; then stored as a short plus a byte at offset 2. See the FIXME above about
; unmerged i16/i32 shifts.
; CHECK lines are autogenerated by update_llc_test_checks.py; do not hand-edit.
1063define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
1064; SI-LABEL: dynamic_insertelement_v3i8:
1065; SI:       ; %bb.0:
1066; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
1067; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1068; SI-NEXT:    s_load_dword s4, s[4:5], 0xa
1069; SI-NEXT:    s_mov_b32 s3, 0x100f000
1070; SI-NEXT:    s_mov_b32 s2, -1
1071; SI-NEXT:    s_waitcnt lgkmcnt(0)
1072; SI-NEXT:    s_lshl_b32 s5, s6, 3
1073; SI-NEXT:    s_lshl_b32 s5, 0xffff, s5
1074; SI-NEXT:    s_andn2_b32 s4, s4, s5
1075; SI-NEXT:    s_and_b32 s5, s5, 0x5050505
1076; SI-NEXT:    s_or_b32 s4, s5, s4
1077; SI-NEXT:    s_lshr_b32 s5, s4, 16
1078; SI-NEXT:    v_mov_b32_e32 v0, s4
1079; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1080; SI-NEXT:    v_mov_b32_e32 v0, s5
1081; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
1082; SI-NEXT:    s_endpgm
1083;
1084; VI-LABEL: dynamic_insertelement_v3i8:
1085; VI:       ; %bb.0:
1086; VI-NEXT:    s_load_dword s6, s[4:5], 0x4c
1087; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1088; VI-NEXT:    s_load_dword s4, s[4:5], 0x28
1089; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1090; VI-NEXT:    s_mov_b32 s2, -1
1091; VI-NEXT:    s_waitcnt lgkmcnt(0)
1092; VI-NEXT:    s_lshl_b32 s5, s6, 3
1093; VI-NEXT:    s_lshl_b32 s5, 0xffff, s5
1094; VI-NEXT:    s_andn2_b32 s4, s4, s5
1095; VI-NEXT:    s_and_b32 s5, s5, 0x5050505
1096; VI-NEXT:    s_or_b32 s4, s5, s4
1097; VI-NEXT:    s_lshr_b32 s5, s4, 16
1098; VI-NEXT:    v_mov_b32_e32 v0, s4
1099; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1100; VI-NEXT:    v_mov_b32_e32 v0, s5
1101; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
1102; VI-NEXT:    s_endpgm
1103  %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
1104  store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
1105  ret void
1106}
1107
; Insert constant i8 5 into <4 x i8> %a at runtime index %b. Identical scalar
; mask/clear/OR pattern to the v3i8 case, but the packed result fits exactly
; in one dword store.
; CHECK lines are autogenerated by update_llc_test_checks.py; do not hand-edit.
1108define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
1109; SI-LABEL: dynamic_insertelement_v4i8:
1110; SI:       ; %bb.0:
1111; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
1112; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1113; SI-NEXT:    s_load_dword s4, s[4:5], 0xa
1114; SI-NEXT:    s_mov_b32 s3, 0x100f000
1115; SI-NEXT:    s_mov_b32 s2, -1
1116; SI-NEXT:    s_waitcnt lgkmcnt(0)
1117; SI-NEXT:    s_lshl_b32 s5, s6, 3
1118; SI-NEXT:    s_lshl_b32 s5, 0xffff, s5
1119; SI-NEXT:    s_andn2_b32 s4, s4, s5
1120; SI-NEXT:    s_and_b32 s5, s5, 0x5050505
1121; SI-NEXT:    s_or_b32 s4, s5, s4
1122; SI-NEXT:    v_mov_b32_e32 v0, s4
1123; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1124; SI-NEXT:    s_endpgm
1125;
1126; VI-LABEL: dynamic_insertelement_v4i8:
1127; VI:       ; %bb.0:
1128; VI-NEXT:    s_load_dword s6, s[4:5], 0x4c
1129; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1130; VI-NEXT:    s_load_dword s4, s[4:5], 0x28
1131; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1132; VI-NEXT:    s_mov_b32 s2, -1
1133; VI-NEXT:    s_waitcnt lgkmcnt(0)
1134; VI-NEXT:    s_lshl_b32 s5, s6, 3
1135; VI-NEXT:    s_lshl_b32 s5, 0xffff, s5
1136; VI-NEXT:    s_andn2_b32 s4, s4, s5
1137; VI-NEXT:    s_and_b32 s5, s5, 0x5050505
1138; VI-NEXT:    s_or_b32 s4, s5, s4
1139; VI-NEXT:    v_mov_b32_e32 v0, s4
1140; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1141; VI-NEXT:    s_endpgm
1142  %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
1143  store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
1144  ret void
1145}
1146
; Load a <8 x i8> from a constant (addrspace 4) pointer, insert constant i8 5
; at runtime index %b, and store to global memory. The 64-bit packed value is
; updated with the s_lshl_b64 / s_andn2_b64 / s_or_b64 lane-mask pattern
; (replicated constant 0x5050505 in each half).
; CHECK lines are autogenerated by update_llc_test_checks.py; do not hand-edit.
1147define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
1148; SI-LABEL: s_dynamic_insertelement_v8i8:
1149; SI:       ; %bb.0:
1150; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1151; SI-NEXT:    s_load_dword s8, s[4:5], 0x4
1152; SI-NEXT:    s_mov_b32 s7, 0x100f000
1153; SI-NEXT:    s_mov_b32 s6, -1
1154; SI-NEXT:    s_waitcnt lgkmcnt(0)
1155; SI-NEXT:    s_mov_b32 s4, s0
1156; SI-NEXT:    s_mov_b32 s5, s1
1157; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1158; SI-NEXT:    s_lshl_b32 s8, s8, 3
1159; SI-NEXT:    s_mov_b64 s[2:3], 0xffff
1160; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
1161; SI-NEXT:    s_and_b32 s9, s3, 0x5050505
1162; SI-NEXT:    s_and_b32 s8, s2, 0x5050505
1163; SI-NEXT:    s_waitcnt lgkmcnt(0)
1164; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
1165; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
1166; SI-NEXT:    v_mov_b32_e32 v0, s0
1167; SI-NEXT:    v_mov_b32_e32 v1, s1
1168; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1169; SI-NEXT:    s_endpgm
1170;
1171; VI-LABEL: s_dynamic_insertelement_v8i8:
1172; VI:       ; %bb.0:
1173; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1174; VI-NEXT:    s_load_dword s8, s[4:5], 0x10
1175; VI-NEXT:    s_mov_b32 s7, 0x1100f000
1176; VI-NEXT:    s_mov_b32 s6, -1
1177; VI-NEXT:    s_waitcnt lgkmcnt(0)
1178; VI-NEXT:    s_mov_b32 s4, s0
1179; VI-NEXT:    s_mov_b32 s5, s1
1180; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1181; VI-NEXT:    s_lshl_b32 s8, s8, 3
1182; VI-NEXT:    s_mov_b64 s[2:3], 0xffff
1183; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
1184; VI-NEXT:    s_and_b32 s9, s3, 0x5050505
1185; VI-NEXT:    s_and_b32 s8, s2, 0x5050505
1186; VI-NEXT:    s_waitcnt lgkmcnt(0)
1187; VI-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
1188; VI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
1189; VI-NEXT:    v_mov_b32_e32 v0, s0
1190; VI-NEXT:    v_mov_b32_e32 v1, s1
1191; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1192; VI-NEXT:    s_endpgm
1193  %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4
1194  %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
1195  store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
1196  ret void
1197}
1198
; Insert constant i8 5 into <16 x i8> %a at runtime index %b. The vector is
; fully scalarized: each byte is extracted (s_lshr), conditionally replaced
; with 5 via s_cmp_lg_u32 + v_cndmask against index 15..0, then the bytes are
; repacked (shift/and/or on SI, v_or_b32_sdwa on VI) into v0-v3 for one
; dwordx4 store. This per-element expansion is what the FIXME near the top of
; the file refers to.
; CHECK lines are autogenerated by update_llc_test_checks.py; do not hand-edit.
1199define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
1200; SI-LABEL: dynamic_insertelement_v16i8:
1201; SI:       ; %bb.0:
1202; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
1203; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
1204; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1205; SI-NEXT:    s_mov_b32 s3, 0x100f000
1206; SI-NEXT:    s_mov_b32 s2, -1
1207; SI-NEXT:    s_waitcnt lgkmcnt(0)
1208; SI-NEXT:    s_lshr_b32 s4, s11, 24
1209; SI-NEXT:    s_cmp_lg_u32 s6, 15
1210; SI-NEXT:    v_mov_b32_e32 v0, s4
1211; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1212; SI-NEXT:    s_lshr_b32 s4, s11, 16
1213; SI-NEXT:    s_cmp_lg_u32 s6, 14
1214; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1215; SI-NEXT:    v_mov_b32_e32 v1, s4
1216; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1217; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1218; SI-NEXT:    s_lshr_b32 s4, s11, 8
1219; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1220; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
1221; SI-NEXT:    s_cmp_lg_u32 s6, 13
1222; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1223; SI-NEXT:    v_mov_b32_e32 v1, s4
1224; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1225; SI-NEXT:    s_cmp_lg_u32 s6, 12
1226; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1227; SI-NEXT:    v_mov_b32_e32 v2, s11
1228; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1229; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
1230; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1231; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
1232; SI-NEXT:    v_or_b32_e32 v1, v2, v1
1233; SI-NEXT:    s_lshr_b32 s4, s10, 24
1234; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1235; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1236; SI-NEXT:    s_cmp_lg_u32 s6, 11
1237; SI-NEXT:    v_or_b32_e32 v3, v1, v0
1238; SI-NEXT:    v_mov_b32_e32 v0, s4
1239; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1240; SI-NEXT:    s_lshr_b32 s4, s10, 16
1241; SI-NEXT:    s_cmp_lg_u32 s6, 10
1242; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1243; SI-NEXT:    v_mov_b32_e32 v1, s4
1244; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1245; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1246; SI-NEXT:    s_lshr_b32 s4, s10, 8
1247; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1248; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
1249; SI-NEXT:    s_cmp_lg_u32 s6, 9
1250; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1251; SI-NEXT:    v_mov_b32_e32 v1, s4
1252; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1253; SI-NEXT:    s_cmp_lg_u32 s6, 8
1254; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1255; SI-NEXT:    v_mov_b32_e32 v2, s10
1256; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1257; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
1258; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1259; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
1260; SI-NEXT:    v_or_b32_e32 v1, v2, v1
1261; SI-NEXT:    s_lshr_b32 s4, s9, 24
1262; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1263; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1264; SI-NEXT:    s_cmp_lg_u32 s6, 7
1265; SI-NEXT:    v_or_b32_e32 v2, v1, v0
1266; SI-NEXT:    v_mov_b32_e32 v0, s4
1267; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1268; SI-NEXT:    s_lshr_b32 s4, s9, 16
1269; SI-NEXT:    s_cmp_lg_u32 s6, 6
1270; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1271; SI-NEXT:    v_mov_b32_e32 v1, s4
1272; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1273; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1274; SI-NEXT:    s_lshr_b32 s4, s9, 8
1275; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1276; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
1277; SI-NEXT:    s_cmp_lg_u32 s6, 5
1278; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1279; SI-NEXT:    v_mov_b32_e32 v1, s4
1280; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1281; SI-NEXT:    s_cmp_lg_u32 s6, 4
1282; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1283; SI-NEXT:    v_mov_b32_e32 v4, s9
1284; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1285; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1286; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1287; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
1288; SI-NEXT:    v_or_b32_e32 v1, v4, v1
1289; SI-NEXT:    s_lshr_b32 s4, s8, 24
1290; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1291; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1292; SI-NEXT:    s_cmp_lg_u32 s6, 3
1293; SI-NEXT:    v_or_b32_e32 v1, v1, v0
1294; SI-NEXT:    v_mov_b32_e32 v0, s4
1295; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1296; SI-NEXT:    s_lshr_b32 s4, s8, 16
1297; SI-NEXT:    s_cmp_lg_u32 s6, 2
1298; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1299; SI-NEXT:    v_mov_b32_e32 v4, s4
1300; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1301; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1302; SI-NEXT:    s_lshr_b32 s4, s8, 8
1303; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1304; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
1305; SI-NEXT:    s_cmp_lg_u32 s6, 1
1306; SI-NEXT:    v_or_b32_e32 v0, v4, v0
1307; SI-NEXT:    v_mov_b32_e32 v4, s4
1308; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1309; SI-NEXT:    s_cmp_lg_u32 s6, 0
1310; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1311; SI-NEXT:    v_mov_b32_e32 v5, s8
1312; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1313; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
1314; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
1315; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
1316; SI-NEXT:    v_or_b32_e32 v4, v5, v4
1317; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1318; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
1319; SI-NEXT:    v_or_b32_e32 v0, v4, v0
1320; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1321; SI-NEXT:    s_endpgm
1322;
1323; VI-LABEL: dynamic_insertelement_v16i8:
1324; VI:       ; %bb.0:
1325; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
1326; VI-NEXT:    s_load_dword s6, s[4:5], 0x20
1327; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1328; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1329; VI-NEXT:    s_mov_b32 s2, -1
1330; VI-NEXT:    s_waitcnt lgkmcnt(0)
1331; VI-NEXT:    s_lshr_b32 s4, s11, 24
1332; VI-NEXT:    s_cmp_lg_u32 s6, 15
1333; VI-NEXT:    v_mov_b32_e32 v0, s4
1334; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1335; VI-NEXT:    s_lshr_b32 s4, s11, 16
1336; VI-NEXT:    s_cmp_lg_u32 s6, 14
1337; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1338; VI-NEXT:    v_mov_b32_e32 v1, s4
1339; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1340; VI-NEXT:    s_lshr_b32 s4, s11, 8
1341; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
1342; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1343; VI-NEXT:    s_cmp_lg_u32 s6, 13
1344; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1345; VI-NEXT:    v_mov_b32_e32 v1, s4
1346; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1347; VI-NEXT:    s_cmp_lg_u32 s6, 12
1348; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1349; VI-NEXT:    v_mov_b32_e32 v2, s11
1350; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1351; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
1352; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
1353; VI-NEXT:    s_lshr_b32 s4, s10, 24
1354; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1355; VI-NEXT:    s_cmp_lg_u32 s6, 11
1356; VI-NEXT:    v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1357; VI-NEXT:    v_mov_b32_e32 v0, s4
1358; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1359; VI-NEXT:    s_lshr_b32 s4, s10, 16
1360; VI-NEXT:    s_cmp_lg_u32 s6, 10
1361; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1362; VI-NEXT:    v_mov_b32_e32 v1, s4
1363; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1364; VI-NEXT:    s_lshr_b32 s4, s10, 8
1365; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
1366; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1367; VI-NEXT:    s_cmp_lg_u32 s6, 9
1368; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1369; VI-NEXT:    v_mov_b32_e32 v1, s4
1370; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1371; VI-NEXT:    s_cmp_lg_u32 s6, 8
1372; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1373; VI-NEXT:    v_mov_b32_e32 v2, s10
1374; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1375; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
1376; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
1377; VI-NEXT:    s_lshr_b32 s4, s9, 24
1378; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1379; VI-NEXT:    s_cmp_lg_u32 s6, 7
1380; VI-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1381; VI-NEXT:    v_mov_b32_e32 v0, s4
1382; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1383; VI-NEXT:    s_lshr_b32 s4, s9, 16
1384; VI-NEXT:    s_cmp_lg_u32 s6, 6
1385; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1386; VI-NEXT:    v_mov_b32_e32 v1, s4
1387; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1388; VI-NEXT:    s_lshr_b32 s4, s9, 8
1389; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
1390; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1391; VI-NEXT:    s_cmp_lg_u32 s6, 5
1392; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1393; VI-NEXT:    v_mov_b32_e32 v1, s4
1394; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1395; VI-NEXT:    s_cmp_lg_u32 s6, 4
1396; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1397; VI-NEXT:    v_mov_b32_e32 v4, s9
1398; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1399; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
1400; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1401; VI-NEXT:    s_lshr_b32 s4, s8, 24
1402; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1403; VI-NEXT:    s_cmp_lg_u32 s6, 3
1404; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1405; VI-NEXT:    v_mov_b32_e32 v0, s4
1406; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1407; VI-NEXT:    s_lshr_b32 s4, s8, 16
1408; VI-NEXT:    s_cmp_lg_u32 s6, 2
1409; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1410; VI-NEXT:    v_mov_b32_e32 v4, s4
1411; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1412; VI-NEXT:    s_lshr_b32 s4, s8, 8
1413; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
1414; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1415; VI-NEXT:    s_cmp_lg_u32 s6, 1
1416; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1417; VI-NEXT:    v_mov_b32_e32 v4, s4
1418; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1419; VI-NEXT:    s_cmp_lg_u32 s6, 0
1420; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1421; VI-NEXT:    v_mov_b32_e32 v5, s8
1422; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1423; VI-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
1424; VI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
1425; VI-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1426; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1427; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1428; VI-NEXT:    s_endpgm
1429  %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
1430  store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
1431  ret void
1432}
1433
1434; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
1435; the compiler doesn't crash.
; Inserts across a control-flow split: element 0 is written in the entry
; block, element 1 in either the %if or %else block, and the vector is
; reassembled at the phi in %endif. Per the comment above, this exercises
; INSERT_SUBREG handling in SIFixSGPRCopies; the test primarily checks the
; compiler doesn't crash.
; CHECK lines are autogenerated by update_llc_test_checks.py; do not hand-edit.
1436define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
1437; SI-LABEL: insert_split_bb:
1438; SI:       ; %bb.0: ; %entry
1439; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
1440; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1441; SI-NEXT:    s_waitcnt lgkmcnt(0)
1442; SI-NEXT:    s_cmp_lg_u32 s6, 0
1443; SI-NEXT:    s_cbranch_scc0 .LBB30_4
1444; SI-NEXT:  ; %bb.1: ; %else
1445; SI-NEXT:    s_load_dword s7, s[2:3], 0x1
1446; SI-NEXT:    s_mov_b64 s[4:5], 0
1447; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
1448; SI-NEXT:    s_waitcnt lgkmcnt(0)
1449; SI-NEXT:    s_mov_b64 vcc, vcc
1450; SI-NEXT:    s_cbranch_vccnz .LBB30_3
1451; SI-NEXT:  .LBB30_2: ; %if
1452; SI-NEXT:    s_load_dword s7, s[2:3], 0x0
1453; SI-NEXT:  .LBB30_3: ; %endif
1454; SI-NEXT:    s_waitcnt lgkmcnt(0)
1455; SI-NEXT:    v_mov_b32_e32 v0, s6
1456; SI-NEXT:    s_mov_b32 s3, 0x100f000
1457; SI-NEXT:    s_mov_b32 s2, -1
1458; SI-NEXT:    v_mov_b32_e32 v1, s7
1459; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1460; SI-NEXT:    s_endpgm
1461; SI-NEXT:  .LBB30_4:
1462; SI-NEXT:    s_branch .LBB30_2
1463;
1464; VI-LABEL: insert_split_bb:
1465; VI:       ; %bb.0: ; %entry
1466; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
1467; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1468; VI-NEXT:    s_waitcnt lgkmcnt(0)
1469; VI-NEXT:    s_cmp_lg_u32 s6, 0
1470; VI-NEXT:    s_cbranch_scc0 .LBB30_4
1471; VI-NEXT:  ; %bb.1: ; %else
1472; VI-NEXT:    s_load_dword s7, s[2:3], 0x4
1473; VI-NEXT:    s_cbranch_execnz .LBB30_3
1474; VI-NEXT:  .LBB30_2: ; %if
1475; VI-NEXT:    s_waitcnt lgkmcnt(0)
1476; VI-NEXT:    s_load_dword s7, s[2:3], 0x0
1477; VI-NEXT:  .LBB30_3: ; %endif
1478; VI-NEXT:    s_waitcnt lgkmcnt(0)
1479; VI-NEXT:    v_mov_b32_e32 v0, s6
1480; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1481; VI-NEXT:    s_mov_b32 s2, -1
1482; VI-NEXT:    v_mov_b32_e32 v1, s7
1483; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1484; VI-NEXT:    s_endpgm
1485; VI-NEXT:  .LBB30_4:
1486; VI-NEXT:    s_branch .LBB30_2
1487entry:
1488  %0 = insertelement <2 x i32> undef, i32 %a, i32 0
1489  %1 = icmp eq i32 %a, 0
1490  br i1 %1, label %if, label %else
1491
1492if:
1493  %2 = load i32, i32 addrspace(1)* %in
1494  %3 = insertelement <2 x i32> %0, i32 %2, i32 1
1495  br label %endif
1496
1497else:
1498  %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
1499  %5 = load i32, i32 addrspace(1)* %4
1500  %6 = insertelement <2 x i32> %0, i32 %5, i32 1
1501  br label %endif
1502
1503endif:
1504  %7 = phi <2 x i32> [%3, %if], [%6, %else]
1505  store <2 x i32> %7, <2 x i32> addrspace(1)* %out
1506  ret void
1507}
1508
; Insert constant double 8.0 into <2 x double> %a at runtime index %b. Each
; 64-bit element is handled as two 32-bit halves: the high half selects
; 0x40200000 (the upper dword of 8.0) and the low half selects 0 via
; v_cndmask, keyed on s_cmp_eq_u32 against indices 1 and 0.
; CHECK lines are autogenerated by update_llc_test_checks.py; do not hand-edit.
1509define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
1510; SI-LABEL: dynamic_insertelement_v2f64:
1511; SI:       ; %bb.0:
1512; SI-NEXT:    s_load_dword s8, s[4:5], 0x18
1513; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xc
1514; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1515; SI-NEXT:    v_mov_b32_e32 v1, 0x40200000
1516; SI-NEXT:    s_mov_b32 s7, 0x100f000
1517; SI-NEXT:    s_waitcnt lgkmcnt(0)
1518; SI-NEXT:    s_cmp_eq_u32 s8, 1
1519; SI-NEXT:    v_mov_b32_e32 v0, s3
1520; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1521; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
1522; SI-NEXT:    v_mov_b32_e32 v0, s2
1523; SI-NEXT:    s_cmp_eq_u32 s8, 0
1524; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
1525; SI-NEXT:    v_mov_b32_e32 v0, s1
1526; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1527; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
1528; SI-NEXT:    v_mov_b32_e32 v0, s0
1529; SI-NEXT:    s_mov_b32 s6, -1
1530; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
1531; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1532; SI-NEXT:    s_endpgm
1533;
1534; VI-LABEL: dynamic_insertelement_v2f64:
1535; VI:       ; %bb.0:
1536; VI-NEXT:    s_load_dword s8, s[4:5], 0x60
1537; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x30
1538; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1539; VI-NEXT:    v_mov_b32_e32 v1, 0x40200000
1540; VI-NEXT:    s_mov_b32 s7, 0x1100f000
1541; VI-NEXT:    s_waitcnt lgkmcnt(0)
1542; VI-NEXT:    s_cmp_eq_u32 s8, 1
1543; VI-NEXT:    v_mov_b32_e32 v0, s3
1544; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1545; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
1546; VI-NEXT:    v_mov_b32_e32 v0, s2
1547; VI-NEXT:    s_cmp_eq_u32 s8, 0
1548; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
1549; VI-NEXT:    v_mov_b32_e32 v0, s1
1550; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1551; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
1552; VI-NEXT:    v_mov_b32_e32 v0, s0
1553; VI-NEXT:    s_mov_b32 s6, -1
1554; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
1555; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1556; VI-NEXT:    s_endpgm
1557  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
1558  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
1559  ret void
1560}
1561
; Insert i64 5 into a <2 x i64> at a run-time index %b.  Same cndmask-based
; lowering as the v2f64 case, but the inserted constant is an inline
; immediate (low word 5, high word 0) so no v_mov of a literal is needed,
; and the cselect results go to SGPR pairs instead of vcc.  Despite the
; align 8 on the store, the result is still written with one dwordx4.
define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s10, s[4:5], 0x8
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s10, 1
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[8:9]
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    s_cmp_eq_u32 s10, 0
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[8:9]
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[2:3]
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[2:3]
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s10, s[4:5], 0x20
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_mov_b32 s7, 0x1100f000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s10, 1
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    s_cselect_b64 s[8:9], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[8:9]
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    s_cmp_eq_u32 s10, 0
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[8:9]
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[2:3]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
  ret void
}
1612
; Insert i64 5 into a <3 x i64> at a run-time index %b.  Exercises the
; non-power-of-two vector path: three s_cmp_eq_u32 / cndmask pairs (indices
; 1, 0, 2), and the 24-byte result is stored as a dwordx2 at offset 16 plus
; a dwordx4 at offset 0.
define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s12, s[4:5], 0x10
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xc
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s12, 1
; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[6:7]
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    s_cmp_eq_u32 s12, 0
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[6:7]
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[6:7]
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_cmp_eq_u32 s12, 2
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[6:7]
; SI-NEXT:    v_mov_b32_e32 v4, s5
; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v5, v4, 0, s[6:7]
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 5, s[6:7]
; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s12, s[4:5], 0x40
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x30
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s12, 1
; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[6:7]
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    s_cmp_eq_u32 s12, 0
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[6:7]
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[6:7]
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    s_cmp_eq_u32 s12, 2
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[6:7]
; VI-NEXT:    v_mov_b32_e32 v4, s5
; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v5, v4, 0, s[6:7]
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 5, s[6:7]
; VI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
  store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
  ret void
}
1679
; Insert 8.0 into a <4 x double> at a run-time index %b.  Four
; s_cmp_eq_u32 / s_cselect_b64 / v_cndmask pairs (one per element; the high
; 32-bit half selects the 0x40200000 word of 8.0 kept in v4, the low half
; selects 0), stored with two dwordx4 operations.
define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0x10
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT:    v_mov_b32_e32 v4, 0x40200000
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s6, 1
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    s_cmp_eq_u32 s6, 0
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_cmp_eq_u32 s6, 3
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s15
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s14
; SI-NEXT:    s_cmp_eq_u32 s6, 2
; SI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x40
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT:    v_mov_b32_e32 v4, 0x40200000
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s6, 1
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    s_cmp_eq_u32 s6, 0
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    s_cmp_eq_u32 s6, 3
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s15
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s14
; VI-NEXT:    s_cmp_eq_u32 s6, 2
; VI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
  ret void
}
1758
; Insert 8.0 into a <8 x double> at a run-time index %b.  At this width the
; lowering switches strategy: instead of a cndmask per element, the whole
; vector is copied into v0-v15 and two v_movreld_b32 writes patch the
; selected element through m0 (index doubled via s_lshl_b32 because each
; f64 spans two dwords: low word 0, high word 0x40200000).
define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
; SI-LABEL: dynamic_insertelement_v8f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0x20
; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    v_mov_b32_e32 v16, 0x40200000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s4, s6, 1
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_mov_b32_e32 v3, s11
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_mov_b32_e32 v6, s14
; SI-NEXT:    v_mov_b32_e32 v7, s15
; SI-NEXT:    v_mov_b32_e32 v8, s16
; SI-NEXT:    v_mov_b32_e32 v9, s17
; SI-NEXT:    v_mov_b32_e32 v10, s18
; SI-NEXT:    v_mov_b32_e32 v11, s19
; SI-NEXT:    v_mov_b32_e32 v12, s20
; SI-NEXT:    v_mov_b32_e32 v13, s21
; SI-NEXT:    v_mov_b32_e32 v14, s22
; SI-NEXT:    v_mov_b32_e32 v15, s23
; SI-NEXT:    s_mov_b32 m0, s4
; SI-NEXT:    v_movreld_b32_e32 v0, 0
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_movreld_b32_e32 v1, v16
; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x80
; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    v_mov_b32_e32 v16, 0x40200000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s4, s6, 1
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_mov_b32_e32 v6, s14
; VI-NEXT:    v_mov_b32_e32 v7, s15
; VI-NEXT:    v_mov_b32_e32 v8, s16
; VI-NEXT:    v_mov_b32_e32 v9, s17
; VI-NEXT:    v_mov_b32_e32 v10, s18
; VI-NEXT:    v_mov_b32_e32 v11, s19
; VI-NEXT:    v_mov_b32_e32 v12, s20
; VI-NEXT:    v_mov_b32_e32 v13, s21
; VI-NEXT:    v_mov_b32_e32 v14, s22
; VI-NEXT:    v_mov_b32_e32 v15, s23
; VI-NEXT:    s_mov_b32 m0, s4
; VI-NEXT:    v_movreld_b32_e32 v0, 0
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_movreld_b32_e32 v1, v16
; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
  ret void
}
1833
; Intrinsic declaration — not referenced by any function visible in this
; chunk; presumably used by a test earlier in the file (verify before
; removing).
declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

; #0: plain nounwind kernels; #1: nounwind readnone (the intrinsic above).
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
1838