1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
4
5; FIXME: Broken on evergreen
6; FIXME: For some reason the 8 and 16 vectors are being stored as
7; individual elements instead of 128-bit stores.
8
9define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
10; SI-LABEL: insertelement_v2f32_0:
11; SI:       ; %bb.0:
12; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
13; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14; SI-NEXT:    s_mov_b32 s3, 0x100f000
15; SI-NEXT:    s_mov_b32 s2, -1
16; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
17; SI-NEXT:    s_waitcnt lgkmcnt(0)
18; SI-NEXT:    v_mov_b32_e32 v1, s7
19; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
20; SI-NEXT:    s_endpgm
21;
22; VI-LABEL: insertelement_v2f32_0:
23; VI:       ; %bb.0:
24; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
25; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
26; VI-NEXT:    s_mov_b32 s3, 0x1100f000
27; VI-NEXT:    s_mov_b32 s2, -1
28; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
29; VI-NEXT:    s_waitcnt lgkmcnt(0)
30; VI-NEXT:    v_mov_b32_e32 v1, s7
31; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
32; VI-NEXT:    s_endpgm
33  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0
34  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
35  ret void
36}
37
38define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
39; SI-LABEL: insertelement_v2f32_1:
40; SI:       ; %bb.0:
41; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
42; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
43; SI-NEXT:    s_mov_b32 s3, 0x100f000
44; SI-NEXT:    s_mov_b32 s2, -1
45; SI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
46; SI-NEXT:    s_waitcnt lgkmcnt(0)
47; SI-NEXT:    v_mov_b32_e32 v0, s6
48; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
49; SI-NEXT:    s_endpgm
50;
51; VI-LABEL: insertelement_v2f32_1:
52; VI:       ; %bb.0:
53; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
54; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
55; VI-NEXT:    s_mov_b32 s3, 0x1100f000
56; VI-NEXT:    s_mov_b32 s2, -1
57; VI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
58; VI-NEXT:    s_waitcnt lgkmcnt(0)
59; VI-NEXT:    v_mov_b32_e32 v0, s6
60; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
61; VI-NEXT:    s_endpgm
62  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1
63  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
64  ret void
65}
66
67define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
68; SI-LABEL: insertelement_v2i32_0:
69; SI:       ; %bb.0:
70; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
71; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
72; SI-NEXT:    s_mov_b32 s3, 0x100f000
73; SI-NEXT:    s_mov_b32 s2, -1
74; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
75; SI-NEXT:    s_waitcnt lgkmcnt(0)
76; SI-NEXT:    v_mov_b32_e32 v1, s7
77; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
78; SI-NEXT:    s_endpgm
79;
80; VI-LABEL: insertelement_v2i32_0:
81; VI:       ; %bb.0:
82; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
83; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
84; VI-NEXT:    s_mov_b32 s3, 0x1100f000
85; VI-NEXT:    s_mov_b32 s2, -1
86; VI-NEXT:    v_mov_b32_e32 v0, 0x3e7
87; VI-NEXT:    s_waitcnt lgkmcnt(0)
88; VI-NEXT:    v_mov_b32_e32 v1, s7
89; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
90; VI-NEXT:    s_endpgm
91  %vecins = insertelement <2 x i32> %a, i32 999, i32 0
92  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
93  ret void
94}
95
96define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
97; SI-LABEL: insertelement_v2i32_1:
98; SI:       ; %bb.0:
99; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
100; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
101; SI-NEXT:    s_mov_b32 s3, 0x100f000
102; SI-NEXT:    s_mov_b32 s2, -1
103; SI-NEXT:    v_mov_b32_e32 v1, 0x3e7
104; SI-NEXT:    s_waitcnt lgkmcnt(0)
105; SI-NEXT:    v_mov_b32_e32 v0, s6
106; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
107; SI-NEXT:    s_endpgm
108;
109; VI-LABEL: insertelement_v2i32_1:
110; VI:       ; %bb.0:
111; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
112; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
113; VI-NEXT:    s_mov_b32 s3, 0x1100f000
114; VI-NEXT:    s_mov_b32 s2, -1
115; VI-NEXT:    v_mov_b32_e32 v1, 0x3e7
116; VI-NEXT:    s_waitcnt lgkmcnt(0)
117; VI-NEXT:    v_mov_b32_e32 v0, s6
118; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
119; VI-NEXT:    s_endpgm
120  %vecins = insertelement <2 x i32> %a, i32 999, i32 1
121  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
122  ret void
123}
124
125; FIXME: Why is the constant moved into the intermediate register and
126; not just directly into the vector component?
127define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
128; SI-LABEL: insertelement_v4f32_0:
129; SI:       ; %bb.0:
130; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
131; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
132; SI-NEXT:    s_waitcnt lgkmcnt(0)
133; SI-NEXT:    s_mov_b32 s0, 0x40a00000
134; SI-NEXT:    s_mov_b32 s7, 0x100f000
135; SI-NEXT:    s_mov_b32 s6, -1
136; SI-NEXT:    v_mov_b32_e32 v0, s0
137; SI-NEXT:    v_mov_b32_e32 v1, s1
138; SI-NEXT:    v_mov_b32_e32 v2, s2
139; SI-NEXT:    v_mov_b32_e32 v3, s3
140; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
141; SI-NEXT:    s_endpgm
142;
143; VI-LABEL: insertelement_v4f32_0:
144; VI:       ; %bb.0:
145; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
146; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
147; VI-NEXT:    s_waitcnt lgkmcnt(0)
148; VI-NEXT:    s_mov_b32 s0, 0x40a00000
149; VI-NEXT:    s_mov_b32 s7, 0x1100f000
150; VI-NEXT:    s_mov_b32 s6, -1
151; VI-NEXT:    v_mov_b32_e32 v0, s0
152; VI-NEXT:    v_mov_b32_e32 v1, s1
153; VI-NEXT:    v_mov_b32_e32 v2, s2
154; VI-NEXT:    v_mov_b32_e32 v3, s3
155; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
156; VI-NEXT:    s_endpgm
157  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
158  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
159  ret void
160}
161
162define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
163; SI-LABEL: insertelement_v4f32_1:
164; SI:       ; %bb.0:
165; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
166; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
167; SI-NEXT:    s_waitcnt lgkmcnt(0)
168; SI-NEXT:    s_mov_b32 s1, 0x40a00000
169; SI-NEXT:    s_mov_b32 s7, 0x100f000
170; SI-NEXT:    s_mov_b32 s6, -1
171; SI-NEXT:    v_mov_b32_e32 v0, s0
172; SI-NEXT:    v_mov_b32_e32 v1, s1
173; SI-NEXT:    v_mov_b32_e32 v2, s2
174; SI-NEXT:    v_mov_b32_e32 v3, s3
175; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
176; SI-NEXT:    s_endpgm
177;
178; VI-LABEL: insertelement_v4f32_1:
179; VI:       ; %bb.0:
180; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
181; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
182; VI-NEXT:    s_waitcnt lgkmcnt(0)
183; VI-NEXT:    s_mov_b32 s1, 0x40a00000
184; VI-NEXT:    s_mov_b32 s7, 0x1100f000
185; VI-NEXT:    s_mov_b32 s6, -1
186; VI-NEXT:    v_mov_b32_e32 v0, s0
187; VI-NEXT:    v_mov_b32_e32 v1, s1
188; VI-NEXT:    v_mov_b32_e32 v2, s2
189; VI-NEXT:    v_mov_b32_e32 v3, s3
190; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
191; VI-NEXT:    s_endpgm
192  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
193  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
194  ret void
195}
196
197define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
198; SI-LABEL: insertelement_v4f32_2:
199; SI:       ; %bb.0:
200; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
201; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
202; SI-NEXT:    s_waitcnt lgkmcnt(0)
203; SI-NEXT:    s_mov_b32 s2, 0x40a00000
204; SI-NEXT:    s_mov_b32 s7, 0x100f000
205; SI-NEXT:    s_mov_b32 s6, -1
206; SI-NEXT:    v_mov_b32_e32 v0, s0
207; SI-NEXT:    v_mov_b32_e32 v1, s1
208; SI-NEXT:    v_mov_b32_e32 v2, s2
209; SI-NEXT:    v_mov_b32_e32 v3, s3
210; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
211; SI-NEXT:    s_endpgm
212;
213; VI-LABEL: insertelement_v4f32_2:
214; VI:       ; %bb.0:
215; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
216; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
217; VI-NEXT:    s_waitcnt lgkmcnt(0)
218; VI-NEXT:    s_mov_b32 s2, 0x40a00000
219; VI-NEXT:    s_mov_b32 s7, 0x1100f000
220; VI-NEXT:    s_mov_b32 s6, -1
221; VI-NEXT:    v_mov_b32_e32 v0, s0
222; VI-NEXT:    v_mov_b32_e32 v1, s1
223; VI-NEXT:    v_mov_b32_e32 v2, s2
224; VI-NEXT:    v_mov_b32_e32 v3, s3
225; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
226; VI-NEXT:    s_endpgm
227  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
228  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
229  ret void
230}
231
232define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
233; SI-LABEL: insertelement_v4f32_3:
234; SI:       ; %bb.0:
235; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
236; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
237; SI-NEXT:    s_waitcnt lgkmcnt(0)
238; SI-NEXT:    s_mov_b32 s3, 0x40a00000
239; SI-NEXT:    s_mov_b32 s7, 0x100f000
240; SI-NEXT:    s_mov_b32 s6, -1
241; SI-NEXT:    v_mov_b32_e32 v0, s0
242; SI-NEXT:    v_mov_b32_e32 v1, s1
243; SI-NEXT:    v_mov_b32_e32 v2, s2
244; SI-NEXT:    v_mov_b32_e32 v3, s3
245; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
246; SI-NEXT:    s_endpgm
247;
248; VI-LABEL: insertelement_v4f32_3:
249; VI:       ; %bb.0:
250; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
251; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
252; VI-NEXT:    s_waitcnt lgkmcnt(0)
253; VI-NEXT:    s_mov_b32 s3, 0x40a00000
254; VI-NEXT:    s_mov_b32 s7, 0x1100f000
255; VI-NEXT:    s_mov_b32 s6, -1
256; VI-NEXT:    v_mov_b32_e32 v0, s0
257; VI-NEXT:    v_mov_b32_e32 v1, s1
258; VI-NEXT:    v_mov_b32_e32 v2, s2
259; VI-NEXT:    v_mov_b32_e32 v3, s3
260; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
261; VI-NEXT:    s_endpgm
262  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
263  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
264  ret void
265}
266
267define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
268; SI-LABEL: insertelement_v4i32_0:
269; SI:       ; %bb.0:
270; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
271; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
272; SI-NEXT:    s_waitcnt lgkmcnt(0)
273; SI-NEXT:    s_movk_i32 s0, 0x3e7
274; SI-NEXT:    s_mov_b32 s7, 0x100f000
275; SI-NEXT:    s_mov_b32 s6, -1
276; SI-NEXT:    v_mov_b32_e32 v0, s0
277; SI-NEXT:    v_mov_b32_e32 v1, s1
278; SI-NEXT:    v_mov_b32_e32 v2, s2
279; SI-NEXT:    v_mov_b32_e32 v3, s3
280; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
281; SI-NEXT:    s_endpgm
282;
283; VI-LABEL: insertelement_v4i32_0:
284; VI:       ; %bb.0:
285; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
286; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
287; VI-NEXT:    s_waitcnt lgkmcnt(0)
288; VI-NEXT:    s_movk_i32 s0, 0x3e7
289; VI-NEXT:    s_mov_b32 s7, 0x1100f000
290; VI-NEXT:    s_mov_b32 s6, -1
291; VI-NEXT:    v_mov_b32_e32 v0, s0
292; VI-NEXT:    v_mov_b32_e32 v1, s1
293; VI-NEXT:    v_mov_b32_e32 v2, s2
294; VI-NEXT:    v_mov_b32_e32 v3, s3
295; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
296; VI-NEXT:    s_endpgm
297  %vecins = insertelement <4 x i32> %a, i32 999, i32 0
298  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
299  ret void
300}
301
302define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
303; SI-LABEL: insertelement_v3f32_1:
304; SI:       ; %bb.0:
305; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
306; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
307; SI-NEXT:    s_mov_b32 s7, 0x100f000
308; SI-NEXT:    s_mov_b32 s6, -1
309; SI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
310; SI-NEXT:    s_waitcnt lgkmcnt(0)
311; SI-NEXT:    v_mov_b32_e32 v0, s0
312; SI-NEXT:    v_mov_b32_e32 v2, s2
313; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
314; SI-NEXT:    s_endpgm
315;
316; VI-LABEL: insertelement_v3f32_1:
317; VI:       ; %bb.0:
318; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
319; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
320; VI-NEXT:    s_mov_b32 s7, 0x1100f000
321; VI-NEXT:    s_mov_b32 s6, -1
322; VI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
323; VI-NEXT:    s_waitcnt lgkmcnt(0)
324; VI-NEXT:    v_mov_b32_e32 v0, s0
325; VI-NEXT:    v_mov_b32_e32 v2, s2
326; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
327; VI-NEXT:    s_endpgm
328  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
329  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
330  ret void
331}
332
333define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
334; SI-LABEL: insertelement_v3f32_2:
335; SI:       ; %bb.0:
336; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
337; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
338; SI-NEXT:    s_mov_b32 s7, 0x100f000
339; SI-NEXT:    s_mov_b32 s6, -1
340; SI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
341; SI-NEXT:    s_waitcnt lgkmcnt(0)
342; SI-NEXT:    v_mov_b32_e32 v0, s0
343; SI-NEXT:    v_mov_b32_e32 v1, s1
344; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
345; SI-NEXT:    s_endpgm
346;
347; VI-LABEL: insertelement_v3f32_2:
348; VI:       ; %bb.0:
349; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
350; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
351; VI-NEXT:    s_mov_b32 s7, 0x1100f000
352; VI-NEXT:    s_mov_b32 s6, -1
353; VI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
354; VI-NEXT:    s_waitcnt lgkmcnt(0)
355; VI-NEXT:    v_mov_b32_e32 v0, s0
356; VI-NEXT:    v_mov_b32_e32 v1, s1
357; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
358; VI-NEXT:    s_endpgm
359  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
360  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
361  ret void
362}
363
364define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
365; GCN-LABEL: insertelement_v3f32_3:
366; GCN:       ; %bb.0:
367; GCN-NEXT:    s_endpgm
368  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
369  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
370  ret void
371}
372
373define <4 x float> @insertelement_to_sgpr() nounwind {
374; GCN-LABEL: insertelement_to_sgpr:
375; GCN:       ; %bb.0:
376; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377; GCN-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
378; GCN-NEXT:    s_waitcnt lgkmcnt(0)
379; GCN-NEXT:    s_mov_b32 s12, 0
380; GCN-NEXT:    s_mov_b32 s4, s12
381; GCN-NEXT:    s_mov_b32 s5, s12
382; GCN-NEXT:    s_mov_b32 s6, s12
383; GCN-NEXT:    s_mov_b32 s7, s12
384; GCN-NEXT:    s_mov_b32 s8, s12
385; GCN-NEXT:    s_mov_b32 s9, s12
386; GCN-NEXT:    s_mov_b32 s10, s12
387; GCN-NEXT:    s_mov_b32 s11, s12
388; GCN-NEXT:    image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1
389; GCN-NEXT:    s_waitcnt vmcnt(0)
390; GCN-NEXT:    s_setpc_b64 s[30:31]
391  %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
392  %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
393  %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
394  ret <4 x float> %tmp2
395}
396
397define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
398; SI-LABEL: dynamic_insertelement_v2f32:
399; SI:       ; %bb.0:
400; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
401; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
402; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
403; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
404; SI-NEXT:    s_mov_b32 s3, 0x100f000
405; SI-NEXT:    s_waitcnt lgkmcnt(0)
406; SI-NEXT:    s_cmp_lg_u32 s6, 1
407; SI-NEXT:    s_cselect_b64 vcc, -1, 0
408; SI-NEXT:    v_mov_b32_e32 v1, s5
409; SI-NEXT:    s_cmp_lg_u32 s6, 0
410; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
411; SI-NEXT:    v_mov_b32_e32 v2, s4
412; SI-NEXT:    s_cselect_b64 vcc, -1, 0
413; SI-NEXT:    s_mov_b32 s2, -1
414; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
415; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
416; SI-NEXT:    s_endpgm
417;
418; VI-LABEL: dynamic_insertelement_v2f32:
419; VI:       ; %bb.0:
420; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
421; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
422; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
423; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
424; VI-NEXT:    s_mov_b32 s3, 0x1100f000
425; VI-NEXT:    s_waitcnt lgkmcnt(0)
426; VI-NEXT:    s_cmp_lg_u32 s6, 1
427; VI-NEXT:    s_cselect_b64 vcc, -1, 0
428; VI-NEXT:    v_mov_b32_e32 v1, s5
429; VI-NEXT:    s_cmp_lg_u32 s6, 0
430; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
431; VI-NEXT:    v_mov_b32_e32 v2, s4
432; VI-NEXT:    s_cselect_b64 vcc, -1, 0
433; VI-NEXT:    s_mov_b32 s2, -1
434; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
435; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
436; VI-NEXT:    s_endpgm
437  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
438  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
439  ret void
440}
441
442define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
443; SI-LABEL: dynamic_insertelement_v3f32:
444; SI:       ; %bb.0:
445; SI-NEXT:    s_load_dword s8, s[4:5], 0x8
446; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
447; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
448; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
449; SI-NEXT:    s_mov_b32 s3, 0x100f000
450; SI-NEXT:    s_waitcnt lgkmcnt(0)
451; SI-NEXT:    s_cmp_lg_u32 s8, 2
452; SI-NEXT:    s_cselect_b64 vcc, -1, 0
453; SI-NEXT:    v_mov_b32_e32 v1, s6
454; SI-NEXT:    s_cmp_lg_u32 s8, 1
455; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
456; SI-NEXT:    v_mov_b32_e32 v1, s5
457; SI-NEXT:    s_cselect_b64 vcc, -1, 0
458; SI-NEXT:    s_cmp_lg_u32 s8, 0
459; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
460; SI-NEXT:    v_mov_b32_e32 v3, s4
461; SI-NEXT:    s_cselect_b64 vcc, -1, 0
462; SI-NEXT:    s_mov_b32 s2, -1
463; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
464; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
465; SI-NEXT:    s_endpgm
466;
467; VI-LABEL: dynamic_insertelement_v3f32:
468; VI:       ; %bb.0:
469; VI-NEXT:    s_load_dword s8, s[4:5], 0x20
470; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
471; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
472; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
473; VI-NEXT:    s_mov_b32 s3, 0x1100f000
474; VI-NEXT:    s_waitcnt lgkmcnt(0)
475; VI-NEXT:    s_cmp_lg_u32 s8, 2
476; VI-NEXT:    s_cselect_b64 vcc, -1, 0
477; VI-NEXT:    v_mov_b32_e32 v1, s6
478; VI-NEXT:    s_cmp_lg_u32 s8, 1
479; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
480; VI-NEXT:    v_mov_b32_e32 v1, s5
481; VI-NEXT:    s_cselect_b64 vcc, -1, 0
482; VI-NEXT:    s_cmp_lg_u32 s8, 0
483; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
484; VI-NEXT:    v_mov_b32_e32 v3, s4
485; VI-NEXT:    s_cselect_b64 vcc, -1, 0
486; VI-NEXT:    s_mov_b32 s2, -1
487; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
488; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
489; VI-NEXT:    s_endpgm
490  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
491  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
492  ret void
493}
494
495define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
496; SI-LABEL: dynamic_insertelement_v4f32:
497; SI:       ; %bb.0:
498; SI-NEXT:    s_load_dword s8, s[4:5], 0x8
499; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
500; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
501; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
502; SI-NEXT:    s_mov_b32 s3, 0x100f000
503; SI-NEXT:    s_waitcnt lgkmcnt(0)
504; SI-NEXT:    s_cmp_lg_u32 s8, 3
505; SI-NEXT:    s_cselect_b64 vcc, -1, 0
506; SI-NEXT:    v_mov_b32_e32 v1, s7
507; SI-NEXT:    s_cmp_lg_u32 s8, 2
508; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
509; SI-NEXT:    v_mov_b32_e32 v1, s6
510; SI-NEXT:    s_cselect_b64 vcc, -1, 0
511; SI-NEXT:    s_cmp_lg_u32 s8, 1
512; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
513; SI-NEXT:    v_mov_b32_e32 v1, s5
514; SI-NEXT:    s_cselect_b64 vcc, -1, 0
515; SI-NEXT:    s_cmp_lg_u32 s8, 0
516; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
517; SI-NEXT:    v_mov_b32_e32 v4, s4
518; SI-NEXT:    s_cselect_b64 vcc, -1, 0
519; SI-NEXT:    s_mov_b32 s2, -1
520; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
521; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
522; SI-NEXT:    s_endpgm
523;
524; VI-LABEL: dynamic_insertelement_v4f32:
525; VI:       ; %bb.0:
526; VI-NEXT:    s_load_dword s8, s[4:5], 0x20
527; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
528; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
529; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
530; VI-NEXT:    s_mov_b32 s3, 0x1100f000
531; VI-NEXT:    s_waitcnt lgkmcnt(0)
532; VI-NEXT:    s_cmp_lg_u32 s8, 3
533; VI-NEXT:    s_cselect_b64 vcc, -1, 0
534; VI-NEXT:    v_mov_b32_e32 v1, s7
535; VI-NEXT:    s_cmp_lg_u32 s8, 2
536; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
537; VI-NEXT:    v_mov_b32_e32 v1, s6
538; VI-NEXT:    s_cselect_b64 vcc, -1, 0
539; VI-NEXT:    s_cmp_lg_u32 s8, 1
540; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
541; VI-NEXT:    v_mov_b32_e32 v1, s5
542; VI-NEXT:    s_cselect_b64 vcc, -1, 0
543; VI-NEXT:    s_cmp_lg_u32 s8, 0
544; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
545; VI-NEXT:    v_mov_b32_e32 v4, s4
546; VI-NEXT:    s_cselect_b64 vcc, -1, 0
547; VI-NEXT:    s_mov_b32 s2, -1
548; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
549; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
550; VI-NEXT:    s_endpgm
551  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
552  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
553  ret void
554}
555
556define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
557; SI-LABEL: dynamic_insertelement_v8f32:
558; SI:       ; %bb.0:
559; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
560; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
561; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
562; SI-NEXT:    v_mov_b32_e32 v8, 0x40a00000
563; SI-NEXT:    s_mov_b32 s3, 0x100f000
564; SI-NEXT:    s_mov_b32 s2, -1
565; SI-NEXT:    s_waitcnt lgkmcnt(0)
566; SI-NEXT:    v_mov_b32_e32 v0, s8
567; SI-NEXT:    v_mov_b32_e32 v1, s9
568; SI-NEXT:    v_mov_b32_e32 v2, s10
569; SI-NEXT:    v_mov_b32_e32 v3, s11
570; SI-NEXT:    v_mov_b32_e32 v4, s12
571; SI-NEXT:    v_mov_b32_e32 v5, s13
572; SI-NEXT:    v_mov_b32_e32 v6, s14
573; SI-NEXT:    v_mov_b32_e32 v7, s15
574; SI-NEXT:    s_mov_b32 m0, s4
575; SI-NEXT:    v_movreld_b32_e32 v0, v8
576; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
577; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
578; SI-NEXT:    s_endpgm
579;
580; VI-LABEL: dynamic_insertelement_v8f32:
581; VI:       ; %bb.0:
582; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
583; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
584; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
585; VI-NEXT:    v_mov_b32_e32 v8, 0x40a00000
586; VI-NEXT:    s_mov_b32 s3, 0x1100f000
587; VI-NEXT:    s_mov_b32 s2, -1
588; VI-NEXT:    s_waitcnt lgkmcnt(0)
589; VI-NEXT:    v_mov_b32_e32 v0, s8
590; VI-NEXT:    v_mov_b32_e32 v1, s9
591; VI-NEXT:    v_mov_b32_e32 v2, s10
592; VI-NEXT:    v_mov_b32_e32 v3, s11
593; VI-NEXT:    v_mov_b32_e32 v4, s12
594; VI-NEXT:    v_mov_b32_e32 v5, s13
595; VI-NEXT:    v_mov_b32_e32 v6, s14
596; VI-NEXT:    v_mov_b32_e32 v7, s15
597; VI-NEXT:    s_mov_b32 m0, s4
598; VI-NEXT:    v_movreld_b32_e32 v0, v8
599; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
600; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
601; VI-NEXT:    s_endpgm
602  %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
603  store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
604  ret void
605}
606
607define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
608; SI-LABEL: dynamic_insertelement_v16f32:
609; SI:       ; %bb.0:
610; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
611; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
612; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
613; SI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
614; SI-NEXT:    s_mov_b32 s3, 0x100f000
615; SI-NEXT:    s_mov_b32 s2, -1
616; SI-NEXT:    s_waitcnt lgkmcnt(0)
617; SI-NEXT:    v_mov_b32_e32 v0, s8
618; SI-NEXT:    v_mov_b32_e32 v1, s9
619; SI-NEXT:    v_mov_b32_e32 v2, s10
620; SI-NEXT:    v_mov_b32_e32 v3, s11
621; SI-NEXT:    v_mov_b32_e32 v4, s12
622; SI-NEXT:    v_mov_b32_e32 v5, s13
623; SI-NEXT:    v_mov_b32_e32 v6, s14
624; SI-NEXT:    v_mov_b32_e32 v7, s15
625; SI-NEXT:    v_mov_b32_e32 v8, s16
626; SI-NEXT:    v_mov_b32_e32 v9, s17
627; SI-NEXT:    v_mov_b32_e32 v10, s18
628; SI-NEXT:    v_mov_b32_e32 v11, s19
629; SI-NEXT:    v_mov_b32_e32 v12, s20
630; SI-NEXT:    v_mov_b32_e32 v13, s21
631; SI-NEXT:    v_mov_b32_e32 v14, s22
632; SI-NEXT:    v_mov_b32_e32 v15, s23
633; SI-NEXT:    s_mov_b32 m0, s4
634; SI-NEXT:    v_movreld_b32_e32 v0, v16
635; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
636; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
637; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
638; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
639; SI-NEXT:    s_endpgm
640;
641; VI-LABEL: dynamic_insertelement_v16f32:
642; VI:       ; %bb.0:
643; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
644; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
645; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
646; VI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
647; VI-NEXT:    s_mov_b32 s3, 0x1100f000
648; VI-NEXT:    s_mov_b32 s2, -1
649; VI-NEXT:    s_waitcnt lgkmcnt(0)
650; VI-NEXT:    v_mov_b32_e32 v0, s8
651; VI-NEXT:    v_mov_b32_e32 v1, s9
652; VI-NEXT:    v_mov_b32_e32 v2, s10
653; VI-NEXT:    v_mov_b32_e32 v3, s11
654; VI-NEXT:    v_mov_b32_e32 v4, s12
655; VI-NEXT:    v_mov_b32_e32 v5, s13
656; VI-NEXT:    v_mov_b32_e32 v6, s14
657; VI-NEXT:    v_mov_b32_e32 v7, s15
658; VI-NEXT:    v_mov_b32_e32 v8, s16
659; VI-NEXT:    v_mov_b32_e32 v9, s17
660; VI-NEXT:    v_mov_b32_e32 v10, s18
661; VI-NEXT:    v_mov_b32_e32 v11, s19
662; VI-NEXT:    v_mov_b32_e32 v12, s20
663; VI-NEXT:    v_mov_b32_e32 v13, s21
664; VI-NEXT:    v_mov_b32_e32 v14, s22
665; VI-NEXT:    v_mov_b32_e32 v15, s23
666; VI-NEXT:    s_mov_b32 m0, s4
667; VI-NEXT:    v_movreld_b32_e32 v0, v16
668; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
669; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
670; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
671; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
672; VI-NEXT:    s_endpgm
673  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
674  store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
675  ret void
676}
677
678define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
679; SI-LABEL: dynamic_insertelement_v2i32:
680; SI:       ; %bb.0:
681; SI-NEXT:    s_load_dword s8, s[4:5], 0x4
682; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
683; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
684; SI-NEXT:    s_mov_b32 s3, 0x100f000
685; SI-NEXT:    s_mov_b32 s2, -1
686; SI-NEXT:    s_waitcnt lgkmcnt(0)
687; SI-NEXT:    s_cmp_lg_u32 s8, 1
688; SI-NEXT:    v_mov_b32_e32 v0, s7
689; SI-NEXT:    s_cselect_b64 vcc, -1, 0
690; SI-NEXT:    s_cmp_lg_u32 s8, 0
691; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
692; SI-NEXT:    v_mov_b32_e32 v0, s6
693; SI-NEXT:    s_cselect_b64 vcc, -1, 0
694; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
695; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
696; SI-NEXT:    s_endpgm
697;
698; VI-LABEL: dynamic_insertelement_v2i32:
699; VI:       ; %bb.0:
700; VI-NEXT:    s_load_dword s8, s[4:5], 0x10
701; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
702; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
703; VI-NEXT:    s_mov_b32 s3, 0x1100f000
704; VI-NEXT:    s_mov_b32 s2, -1
705; VI-NEXT:    s_waitcnt lgkmcnt(0)
706; VI-NEXT:    s_cmp_lg_u32 s8, 1
707; VI-NEXT:    s_cselect_b32 s4, s7, 5
708; VI-NEXT:    s_cmp_lg_u32 s8, 0
709; VI-NEXT:    s_cselect_b32 s5, s6, 5
710; VI-NEXT:    v_mov_b32_e32 v0, s5
711; VI-NEXT:    v_mov_b32_e32 v1, s4
712; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
713; VI-NEXT:    s_endpgm
714  %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
715  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
716  ret void
717}
718
719define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
720; SI-LABEL: dynamic_insertelement_v3i32:
721; SI:       ; %bb.0:
722; SI-NEXT:    s_load_dword s8, s[4:5], 0x8
723; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
724; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
725; SI-NEXT:    s_mov_b32 s7, 0x100f000
726; SI-NEXT:    s_mov_b32 s6, -1
727; SI-NEXT:    s_waitcnt lgkmcnt(0)
728; SI-NEXT:    s_cmp_lg_u32 s8, 2
729; SI-NEXT:    v_mov_b32_e32 v0, s2
730; SI-NEXT:    s_cselect_b64 vcc, -1, 0
731; SI-NEXT:    s_cmp_lg_u32 s8, 1
732; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
733; SI-NEXT:    v_mov_b32_e32 v0, s1
734; SI-NEXT:    s_cselect_b64 vcc, -1, 0
735; SI-NEXT:    s_cmp_lg_u32 s8, 0
736; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
737; SI-NEXT:    v_mov_b32_e32 v0, s0
738; SI-NEXT:    s_cselect_b64 vcc, -1, 0
739; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
740; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
741; SI-NEXT:    s_endpgm
742;
743; VI-LABEL: dynamic_insertelement_v3i32:
744; VI:       ; %bb.0:
745; VI-NEXT:    s_load_dword s8, s[4:5], 0x20
746; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
747; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
748; VI-NEXT:    s_mov_b32 s7, 0x1100f000
749; VI-NEXT:    s_mov_b32 s6, -1
750; VI-NEXT:    s_waitcnt lgkmcnt(0)
751; VI-NEXT:    s_cmp_lg_u32 s8, 2
752; VI-NEXT:    s_cselect_b32 s2, s2, 5
753; VI-NEXT:    s_cmp_lg_u32 s8, 1
754; VI-NEXT:    s_cselect_b32 s1, s1, 5
755; VI-NEXT:    s_cmp_lg_u32 s8, 0
756; VI-NEXT:    s_cselect_b32 s0, s0, 5
757; VI-NEXT:    v_mov_b32_e32 v0, s0
758; VI-NEXT:    v_mov_b32_e32 v1, s1
759; VI-NEXT:    v_mov_b32_e32 v2, s2
760; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
761; VI-NEXT:    s_endpgm
762  %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
763  store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
764  ret void
765}
766
767define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
768; SI-LABEL: dynamic_insertelement_v4i32:
769; SI:       ; %bb.0:
770; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
771; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
772; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
773; SI-NEXT:    s_load_dword s4, s[4:5], 0x11
774; SI-NEXT:    s_mov_b32 s3, 0x100f000
775; SI-NEXT:    s_waitcnt lgkmcnt(0)
776; SI-NEXT:    s_cmp_eq_u32 s6, 3
777; SI-NEXT:    s_cselect_b64 vcc, -1, 0
778; SI-NEXT:    v_mov_b32_e32 v0, s11
779; SI-NEXT:    v_mov_b32_e32 v4, s4
780; SI-NEXT:    s_cmp_eq_u32 s6, 2
781; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
782; SI-NEXT:    v_mov_b32_e32 v0, s10
783; SI-NEXT:    s_cselect_b64 vcc, -1, 0
784; SI-NEXT:    s_cmp_eq_u32 s6, 1
785; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
786; SI-NEXT:    v_mov_b32_e32 v0, s9
787; SI-NEXT:    s_cselect_b64 vcc, -1, 0
788; SI-NEXT:    s_cmp_eq_u32 s6, 0
789; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
790; SI-NEXT:    v_mov_b32_e32 v0, s8
791; SI-NEXT:    s_cselect_b64 vcc, -1, 0
792; SI-NEXT:    s_mov_b32 s2, -1
793; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
794; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
795; SI-NEXT:    s_endpgm
796;
797; VI-LABEL: dynamic_insertelement_v4i32:
798; VI:       ; %bb.0:
799; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
800; VI-NEXT:    s_load_dword s8, s[4:5], 0x20
801; VI-NEXT:    s_load_dword s9, s[4:5], 0x44
802; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
803; VI-NEXT:    s_mov_b32 s7, 0x1100f000
804; VI-NEXT:    s_mov_b32 s6, -1
805; VI-NEXT:    s_waitcnt lgkmcnt(0)
806; VI-NEXT:    s_cmp_eq_u32 s8, 3
807; VI-NEXT:    s_cselect_b32 s3, s9, s3
808; VI-NEXT:    s_cmp_eq_u32 s8, 2
809; VI-NEXT:    s_cselect_b32 s2, s9, s2
810; VI-NEXT:    s_cmp_eq_u32 s8, 1
811; VI-NEXT:    s_cselect_b32 s1, s9, s1
812; VI-NEXT:    s_cmp_eq_u32 s8, 0
813; VI-NEXT:    s_cselect_b32 s0, s9, s0
814; VI-NEXT:    v_mov_b32_e32 v0, s0
815; VI-NEXT:    v_mov_b32_e32 v1, s1
816; VI-NEXT:    v_mov_b32_e32 v2, s2
817; VI-NEXT:    v_mov_b32_e32 v3, s3
818; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
819; VI-NEXT:    s_endpgm
820  %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
821  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
822  ret void
823}
824
825define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
826; SI-LABEL: dynamic_insertelement_v8i32:
827; SI:       ; %bb.0:
828; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
829; SI-NEXT:    s_load_dword s6, s[4:5], 0x10
830; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
831; SI-NEXT:    s_mov_b32 s3, 0x100f000
832; SI-NEXT:    s_mov_b32 s2, -1
833; SI-NEXT:    s_waitcnt lgkmcnt(0)
834; SI-NEXT:    v_mov_b32_e32 v0, s8
835; SI-NEXT:    v_mov_b32_e32 v1, s9
836; SI-NEXT:    v_mov_b32_e32 v2, s10
837; SI-NEXT:    v_mov_b32_e32 v3, s11
838; SI-NEXT:    v_mov_b32_e32 v4, s12
839; SI-NEXT:    v_mov_b32_e32 v5, s13
840; SI-NEXT:    v_mov_b32_e32 v6, s14
841; SI-NEXT:    v_mov_b32_e32 v7, s15
842; SI-NEXT:    s_mov_b32 m0, s6
843; SI-NEXT:    v_movreld_b32_e32 v0, 5
844; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
845; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
846; SI-NEXT:    s_endpgm
847;
848; VI-LABEL: dynamic_insertelement_v8i32:
849; VI:       ; %bb.0:
850; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
851; VI-NEXT:    s_load_dword s6, s[4:5], 0x40
852; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
853; VI-NEXT:    s_mov_b32 s3, 0x1100f000
854; VI-NEXT:    s_mov_b32 s2, -1
855; VI-NEXT:    s_waitcnt lgkmcnt(0)
856; VI-NEXT:    v_mov_b32_e32 v0, s8
857; VI-NEXT:    v_mov_b32_e32 v1, s9
858; VI-NEXT:    v_mov_b32_e32 v2, s10
859; VI-NEXT:    v_mov_b32_e32 v3, s11
860; VI-NEXT:    v_mov_b32_e32 v4, s12
861; VI-NEXT:    v_mov_b32_e32 v5, s13
862; VI-NEXT:    v_mov_b32_e32 v6, s14
863; VI-NEXT:    v_mov_b32_e32 v7, s15
864; VI-NEXT:    s_mov_b32 m0, s6
865; VI-NEXT:    v_movreld_b32_e32 v0, 5
866; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
867; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
868; VI-NEXT:    s_endpgm
869  %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
870  store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
871  ret void
872}
873
874define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
875; SI-LABEL: dynamic_insertelement_v16i32:
876; SI:       ; %bb.0:
877; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
878; SI-NEXT:    s_load_dword s6, s[4:5], 0x20
879; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
880; SI-NEXT:    s_mov_b32 s3, 0x100f000
881; SI-NEXT:    s_mov_b32 s2, -1
882; SI-NEXT:    s_waitcnt lgkmcnt(0)
883; SI-NEXT:    v_mov_b32_e32 v0, s8
884; SI-NEXT:    v_mov_b32_e32 v1, s9
885; SI-NEXT:    v_mov_b32_e32 v2, s10
886; SI-NEXT:    v_mov_b32_e32 v3, s11
887; SI-NEXT:    v_mov_b32_e32 v4, s12
888; SI-NEXT:    v_mov_b32_e32 v5, s13
889; SI-NEXT:    v_mov_b32_e32 v6, s14
890; SI-NEXT:    v_mov_b32_e32 v7, s15
891; SI-NEXT:    v_mov_b32_e32 v8, s16
892; SI-NEXT:    v_mov_b32_e32 v9, s17
893; SI-NEXT:    v_mov_b32_e32 v10, s18
894; SI-NEXT:    v_mov_b32_e32 v11, s19
895; SI-NEXT:    v_mov_b32_e32 v12, s20
896; SI-NEXT:    v_mov_b32_e32 v13, s21
897; SI-NEXT:    v_mov_b32_e32 v14, s22
898; SI-NEXT:    v_mov_b32_e32 v15, s23
899; SI-NEXT:    s_mov_b32 m0, s6
900; SI-NEXT:    v_movreld_b32_e32 v0, 5
901; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
902; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
903; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
904; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
905; SI-NEXT:    s_endpgm
906;
907; VI-LABEL: dynamic_insertelement_v16i32:
908; VI:       ; %bb.0:
909; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
910; VI-NEXT:    s_load_dword s6, s[4:5], 0x80
911; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
912; VI-NEXT:    s_mov_b32 s3, 0x1100f000
913; VI-NEXT:    s_mov_b32 s2, -1
914; VI-NEXT:    s_waitcnt lgkmcnt(0)
915; VI-NEXT:    v_mov_b32_e32 v0, s8
916; VI-NEXT:    v_mov_b32_e32 v1, s9
917; VI-NEXT:    v_mov_b32_e32 v2, s10
918; VI-NEXT:    v_mov_b32_e32 v3, s11
919; VI-NEXT:    v_mov_b32_e32 v4, s12
920; VI-NEXT:    v_mov_b32_e32 v5, s13
921; VI-NEXT:    v_mov_b32_e32 v6, s14
922; VI-NEXT:    v_mov_b32_e32 v7, s15
923; VI-NEXT:    v_mov_b32_e32 v8, s16
924; VI-NEXT:    v_mov_b32_e32 v9, s17
925; VI-NEXT:    v_mov_b32_e32 v10, s18
926; VI-NEXT:    v_mov_b32_e32 v11, s19
927; VI-NEXT:    v_mov_b32_e32 v12, s20
928; VI-NEXT:    v_mov_b32_e32 v13, s21
929; VI-NEXT:    v_mov_b32_e32 v14, s22
930; VI-NEXT:    v_mov_b32_e32 v15, s23
931; VI-NEXT:    s_mov_b32 m0, s6
932; VI-NEXT:    v_movreld_b32_e32 v0, 5
933; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
934; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
935; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
936; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
937; VI-NEXT:    s_endpgm
938  %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
939  store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
940  ret void
941}
942
943define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
944; SI-LABEL: dynamic_insertelement_v2i16:
945; SI:       ; %bb.0:
946; SI-NEXT:    s_load_dword s6, s[4:5], 0x3
947; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
948; SI-NEXT:    s_load_dword s4, s[4:5], 0x2
949; SI-NEXT:    s_mov_b32 s3, 0x100f000
950; SI-NEXT:    s_mov_b32 s2, -1
951; SI-NEXT:    s_waitcnt lgkmcnt(0)
952; SI-NEXT:    s_lshl_b32 s5, s6, 4
953; SI-NEXT:    s_lshl_b32 s5, 0xffff, s5
954; SI-NEXT:    s_andn2_b32 s4, s4, s5
955; SI-NEXT:    s_and_b32 s5, s5, 0x50005
956; SI-NEXT:    s_or_b32 s4, s5, s4
957; SI-NEXT:    v_mov_b32_e32 v0, s4
958; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
959; SI-NEXT:    s_endpgm
960;
961; VI-LABEL: dynamic_insertelement_v2i16:
962; VI:       ; %bb.0:
963; VI-NEXT:    s_load_dword s6, s[4:5], 0xc
964; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
965; VI-NEXT:    s_load_dword s4, s[4:5], 0x8
966; VI-NEXT:    s_mov_b32 s3, 0x1100f000
967; VI-NEXT:    s_mov_b32 s2, -1
968; VI-NEXT:    s_waitcnt lgkmcnt(0)
969; VI-NEXT:    s_lshl_b32 s5, s6, 4
970; VI-NEXT:    s_lshl_b32 s5, 0xffff, s5
971; VI-NEXT:    s_andn2_b32 s4, s4, s5
972; VI-NEXT:    s_and_b32 s5, s5, 0x50005
973; VI-NEXT:    s_or_b32 s4, s5, s4
974; VI-NEXT:    v_mov_b32_e32 v0, s4
975; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
976; VI-NEXT:    s_endpgm
977  %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
978  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
979  ret void
980}
981
982define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
983; SI-LABEL: dynamic_insertelement_v3i16:
984; SI:       ; %bb.0:
985; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
986; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
987; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
988; SI-NEXT:    s_mov_b32 s3, 0x100f000
989; SI-NEXT:    s_mov_b32 s2, -1
990; SI-NEXT:    s_waitcnt lgkmcnt(0)
991; SI-NEXT:    s_lshl_b32 s8, s6, 4
992; SI-NEXT:    s_mov_b64 s[6:7], 0xffff
993; SI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
994; SI-NEXT:    s_and_b32 s9, s7, 0x50005
995; SI-NEXT:    s_and_b32 s8, s6, 0x50005
996; SI-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
997; SI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
998; SI-NEXT:    v_mov_b32_e32 v0, s5
999; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
1000; SI-NEXT:    v_mov_b32_e32 v0, s4
1001; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1002; SI-NEXT:    s_endpgm
1003;
1004; VI-LABEL: dynamic_insertelement_v3i16:
1005; VI:       ; %bb.0:
1006; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
1007; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1008; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
1009; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1010; VI-NEXT:    s_mov_b32 s2, -1
1011; VI-NEXT:    s_waitcnt lgkmcnt(0)
1012; VI-NEXT:    s_lshl_b32 s8, s6, 4
1013; VI-NEXT:    s_mov_b64 s[6:7], 0xffff
1014; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
1015; VI-NEXT:    s_mov_b32 s8, 0x50005
1016; VI-NEXT:    s_mov_b32 s9, s8
1017; VI-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
1018; VI-NEXT:    s_and_b64 s[6:7], s[6:7], s[8:9]
1019; VI-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
1020; VI-NEXT:    v_mov_b32_e32 v0, s5
1021; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
1022; VI-NEXT:    v_mov_b32_e32 v0, s4
1023; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1024; VI-NEXT:    s_endpgm
1025  %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
1026  store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
1027  ret void
1028}
1029
1030define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
1031; SI-LABEL: dynamic_insertelement_v2i8:
1032; SI:       ; %bb.0:
1033; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
1034; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1035; SI-NEXT:    s_load_dword s4, s[4:5], 0xa
1036; SI-NEXT:    s_mov_b32 s3, 0x100f000
1037; SI-NEXT:    s_mov_b32 s2, -1
1038; SI-NEXT:    s_waitcnt lgkmcnt(0)
1039; SI-NEXT:    s_lshl_b32 s5, s6, 3
1040; SI-NEXT:    s_lshl_b32 s5, -1, s5
1041; SI-NEXT:    s_andn2_b32 s4, s4, s5
1042; SI-NEXT:    s_and_b32 s5, s5, 0x505
1043; SI-NEXT:    s_or_b32 s4, s5, s4
1044; SI-NEXT:    v_mov_b32_e32 v0, s4
1045; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1046; SI-NEXT:    s_endpgm
1047;
1048; VI-LABEL: dynamic_insertelement_v2i8:
1049; VI:       ; %bb.0:
1050; VI-NEXT:    s_load_dword s6, s[4:5], 0x4c
1051; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1052; VI-NEXT:    s_load_dword s4, s[4:5], 0x28
1053; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1054; VI-NEXT:    s_mov_b32 s2, -1
1055; VI-NEXT:    s_waitcnt lgkmcnt(0)
1056; VI-NEXT:    s_lshl_b32 s5, s6, 3
1057; VI-NEXT:    v_lshlrev_b16_e64 v0, s5, -1
1058; VI-NEXT:    v_not_b32_e32 v1, v0
1059; VI-NEXT:    v_and_b32_e32 v1, s4, v1
1060; VI-NEXT:    v_and_b32_e32 v0, 0x505, v0
1061; VI-NEXT:    v_or_b32_e32 v0, v0, v1
1062; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1063; VI-NEXT:    s_endpgm
1064  %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
1065  store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
1066  ret void
1067}
1068
1069; FIXME: post legalize i16 and i32 shifts aren't merged because of
1070; isTypeDesirableForOp in SimplifyDemandedBits
1071define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
1072; SI-LABEL: dynamic_insertelement_v3i8:
1073; SI:       ; %bb.0:
1074; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
1075; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1076; SI-NEXT:    s_load_dword s4, s[4:5], 0xa
1077; SI-NEXT:    s_mov_b32 s3, 0x100f000
1078; SI-NEXT:    s_mov_b32 s2, -1
1079; SI-NEXT:    s_waitcnt lgkmcnt(0)
1080; SI-NEXT:    s_lshl_b32 s5, s6, 3
1081; SI-NEXT:    s_lshl_b32 s5, 0xffff, s5
1082; SI-NEXT:    s_andn2_b32 s4, s4, s5
1083; SI-NEXT:    s_and_b32 s5, s5, 0x5050505
1084; SI-NEXT:    s_or_b32 s4, s5, s4
1085; SI-NEXT:    s_lshr_b32 s5, s4, 16
1086; SI-NEXT:    v_mov_b32_e32 v0, s4
1087; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1088; SI-NEXT:    v_mov_b32_e32 v0, s5
1089; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
1090; SI-NEXT:    s_endpgm
1091;
1092; VI-LABEL: dynamic_insertelement_v3i8:
1093; VI:       ; %bb.0:
1094; VI-NEXT:    s_load_dword s6, s[4:5], 0x4c
1095; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1096; VI-NEXT:    s_load_dword s4, s[4:5], 0x28
1097; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1098; VI-NEXT:    s_mov_b32 s2, -1
1099; VI-NEXT:    s_waitcnt lgkmcnt(0)
1100; VI-NEXT:    s_lshl_b32 s5, s6, 3
1101; VI-NEXT:    s_lshl_b32 s5, 0xffff, s5
1102; VI-NEXT:    s_andn2_b32 s4, s4, s5
1103; VI-NEXT:    s_and_b32 s5, s5, 0x5050505
1104; VI-NEXT:    s_or_b32 s4, s5, s4
1105; VI-NEXT:    s_lshr_b32 s5, s4, 16
1106; VI-NEXT:    v_mov_b32_e32 v0, s4
1107; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1108; VI-NEXT:    v_mov_b32_e32 v0, s5
1109; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
1110; VI-NEXT:    s_endpgm
1111  %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
1112  store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
1113  ret void
1114}
1115
1116define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
1117; SI-LABEL: dynamic_insertelement_v4i8:
1118; SI:       ; %bb.0:
1119; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
1120; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1121; SI-NEXT:    s_load_dword s4, s[4:5], 0xa
1122; SI-NEXT:    s_mov_b32 s3, 0x100f000
1123; SI-NEXT:    s_mov_b32 s2, -1
1124; SI-NEXT:    s_waitcnt lgkmcnt(0)
1125; SI-NEXT:    s_lshl_b32 s5, s6, 3
1126; SI-NEXT:    s_lshl_b32 s5, 0xffff, s5
1127; SI-NEXT:    s_andn2_b32 s4, s4, s5
1128; SI-NEXT:    s_and_b32 s5, s5, 0x5050505
1129; SI-NEXT:    s_or_b32 s4, s5, s4
1130; SI-NEXT:    v_mov_b32_e32 v0, s4
1131; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1132; SI-NEXT:    s_endpgm
1133;
1134; VI-LABEL: dynamic_insertelement_v4i8:
1135; VI:       ; %bb.0:
1136; VI-NEXT:    s_load_dword s6, s[4:5], 0x4c
1137; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1138; VI-NEXT:    s_load_dword s4, s[4:5], 0x28
1139; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1140; VI-NEXT:    s_mov_b32 s2, -1
1141; VI-NEXT:    s_waitcnt lgkmcnt(0)
1142; VI-NEXT:    s_lshl_b32 s5, s6, 3
1143; VI-NEXT:    s_lshl_b32 s5, 0xffff, s5
1144; VI-NEXT:    s_andn2_b32 s4, s4, s5
1145; VI-NEXT:    s_and_b32 s5, s5, 0x5050505
1146; VI-NEXT:    s_or_b32 s4, s5, s4
1147; VI-NEXT:    v_mov_b32_e32 v0, s4
1148; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1149; VI-NEXT:    s_endpgm
1150  %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
1151  store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
1152  ret void
1153}
1154
1155define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
1156; SI-LABEL: s_dynamic_insertelement_v8i8:
1157; SI:       ; %bb.0:
1158; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1159; SI-NEXT:    s_load_dword s8, s[4:5], 0x4
1160; SI-NEXT:    s_mov_b32 s7, 0x100f000
1161; SI-NEXT:    s_mov_b32 s6, -1
1162; SI-NEXT:    s_waitcnt lgkmcnt(0)
1163; SI-NEXT:    s_mov_b32 s4, s0
1164; SI-NEXT:    s_mov_b32 s5, s1
1165; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1166; SI-NEXT:    s_lshl_b32 s8, s8, 3
1167; SI-NEXT:    s_mov_b64 s[2:3], 0xffff
1168; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
1169; SI-NEXT:    s_and_b32 s9, s3, 0x5050505
1170; SI-NEXT:    s_and_b32 s8, s2, 0x5050505
1171; SI-NEXT:    s_waitcnt lgkmcnt(0)
1172; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
1173; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
1174; SI-NEXT:    v_mov_b32_e32 v0, s0
1175; SI-NEXT:    v_mov_b32_e32 v1, s1
1176; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1177; SI-NEXT:    s_endpgm
1178;
1179; VI-LABEL: s_dynamic_insertelement_v8i8:
1180; VI:       ; %bb.0:
1181; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1182; VI-NEXT:    s_load_dword s8, s[4:5], 0x10
1183; VI-NEXT:    s_mov_b32 s7, 0x1100f000
1184; VI-NEXT:    s_mov_b32 s6, -1
1185; VI-NEXT:    s_waitcnt lgkmcnt(0)
1186; VI-NEXT:    s_mov_b32 s4, s0
1187; VI-NEXT:    s_mov_b32 s5, s1
1188; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1189; VI-NEXT:    s_lshl_b32 s8, s8, 3
1190; VI-NEXT:    s_mov_b64 s[2:3], 0xffff
1191; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
1192; VI-NEXT:    s_and_b32 s9, s3, 0x5050505
1193; VI-NEXT:    s_and_b32 s8, s2, 0x5050505
1194; VI-NEXT:    s_waitcnt lgkmcnt(0)
1195; VI-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
1196; VI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
1197; VI-NEXT:    v_mov_b32_e32 v0, s0
1198; VI-NEXT:    v_mov_b32_e32 v1, s1
1199; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1200; VI-NEXT:    s_endpgm
1201  %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4
1202  %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
1203  store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
1204  ret void
1205}
1206
1207define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
1208; SI-LABEL: dynamic_insertelement_v16i8:
1209; SI:       ; %bb.0:
1210; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
1211; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
1212; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1213; SI-NEXT:    s_mov_b32 s3, 0x100f000
1214; SI-NEXT:    s_mov_b32 s2, -1
1215; SI-NEXT:    s_waitcnt lgkmcnt(0)
1216; SI-NEXT:    s_lshr_b32 s4, s11, 24
1217; SI-NEXT:    s_cmp_lg_u32 s6, 15
1218; SI-NEXT:    v_mov_b32_e32 v0, s4
1219; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1220; SI-NEXT:    s_lshr_b32 s4, s11, 16
1221; SI-NEXT:    s_cmp_lg_u32 s6, 14
1222; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1223; SI-NEXT:    v_mov_b32_e32 v1, s4
1224; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1225; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1226; SI-NEXT:    s_lshr_b32 s4, s11, 8
1227; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1228; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
1229; SI-NEXT:    s_cmp_lg_u32 s6, 13
1230; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1231; SI-NEXT:    v_mov_b32_e32 v1, s4
1232; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1233; SI-NEXT:    s_cmp_lg_u32 s6, 12
1234; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1235; SI-NEXT:    v_mov_b32_e32 v2, s11
1236; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1237; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
1238; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1239; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
1240; SI-NEXT:    v_or_b32_e32 v1, v2, v1
1241; SI-NEXT:    s_lshr_b32 s4, s10, 24
1242; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1243; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1244; SI-NEXT:    s_cmp_lg_u32 s6, 11
1245; SI-NEXT:    v_or_b32_e32 v3, v1, v0
1246; SI-NEXT:    v_mov_b32_e32 v0, s4
1247; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1248; SI-NEXT:    s_lshr_b32 s4, s10, 16
1249; SI-NEXT:    s_cmp_lg_u32 s6, 10
1250; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1251; SI-NEXT:    v_mov_b32_e32 v1, s4
1252; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1253; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1254; SI-NEXT:    s_lshr_b32 s4, s10, 8
1255; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1256; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
1257; SI-NEXT:    s_cmp_lg_u32 s6, 9
1258; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1259; SI-NEXT:    v_mov_b32_e32 v1, s4
1260; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1261; SI-NEXT:    s_cmp_lg_u32 s6, 8
1262; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1263; SI-NEXT:    v_mov_b32_e32 v2, s10
1264; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1265; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
1266; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1267; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
1268; SI-NEXT:    v_or_b32_e32 v1, v2, v1
1269; SI-NEXT:    s_lshr_b32 s4, s9, 24
1270; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1271; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1272; SI-NEXT:    s_cmp_lg_u32 s6, 7
1273; SI-NEXT:    v_or_b32_e32 v2, v1, v0
1274; SI-NEXT:    v_mov_b32_e32 v0, s4
1275; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1276; SI-NEXT:    s_lshr_b32 s4, s9, 16
1277; SI-NEXT:    s_cmp_lg_u32 s6, 6
1278; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1279; SI-NEXT:    v_mov_b32_e32 v1, s4
1280; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1281; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1282; SI-NEXT:    s_lshr_b32 s4, s9, 8
1283; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1284; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
1285; SI-NEXT:    s_cmp_lg_u32 s6, 5
1286; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1287; SI-NEXT:    v_mov_b32_e32 v1, s4
1288; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1289; SI-NEXT:    s_cmp_lg_u32 s6, 4
1290; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1291; SI-NEXT:    v_mov_b32_e32 v4, s9
1292; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1293; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1294; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1295; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
1296; SI-NEXT:    v_or_b32_e32 v1, v4, v1
1297; SI-NEXT:    s_lshr_b32 s4, s8, 24
1298; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1299; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1300; SI-NEXT:    s_cmp_lg_u32 s6, 3
1301; SI-NEXT:    v_or_b32_e32 v1, v1, v0
1302; SI-NEXT:    v_mov_b32_e32 v0, s4
1303; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1304; SI-NEXT:    s_lshr_b32 s4, s8, 16
1305; SI-NEXT:    s_cmp_lg_u32 s6, 2
1306; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1307; SI-NEXT:    v_mov_b32_e32 v4, s4
1308; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1309; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1310; SI-NEXT:    s_lshr_b32 s4, s8, 8
1311; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1312; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
1313; SI-NEXT:    s_cmp_lg_u32 s6, 1
1314; SI-NEXT:    v_or_b32_e32 v0, v4, v0
1315; SI-NEXT:    v_mov_b32_e32 v4, s4
1316; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1317; SI-NEXT:    s_cmp_lg_u32 s6, 0
1318; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1319; SI-NEXT:    v_mov_b32_e32 v5, s8
1320; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1321; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
1322; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
1323; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
1324; SI-NEXT:    v_or_b32_e32 v4, v5, v4
1325; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1326; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
1327; SI-NEXT:    v_or_b32_e32 v0, v4, v0
1328; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1329; SI-NEXT:    s_endpgm
1330;
1331; VI-LABEL: dynamic_insertelement_v16i8:
1332; VI:       ; %bb.0:
1333; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
1334; VI-NEXT:    s_load_dword s6, s[4:5], 0x20
1335; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1336; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1337; VI-NEXT:    s_mov_b32 s2, -1
1338; VI-NEXT:    s_waitcnt lgkmcnt(0)
1339; VI-NEXT:    s_lshr_b32 s4, s11, 24
1340; VI-NEXT:    s_cmp_lg_u32 s6, 15
1341; VI-NEXT:    v_mov_b32_e32 v0, s4
1342; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1343; VI-NEXT:    s_lshr_b32 s4, s11, 16
1344; VI-NEXT:    s_cmp_lg_u32 s6, 14
1345; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1346; VI-NEXT:    v_mov_b32_e32 v1, s4
1347; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1348; VI-NEXT:    s_lshr_b32 s4, s11, 8
1349; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
1350; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1351; VI-NEXT:    s_cmp_lg_u32 s6, 13
1352; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1353; VI-NEXT:    v_mov_b32_e32 v1, s4
1354; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1355; VI-NEXT:    s_cmp_lg_u32 s6, 12
1356; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1357; VI-NEXT:    v_mov_b32_e32 v2, s11
1358; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1359; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
1360; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
1361; VI-NEXT:    s_lshr_b32 s4, s10, 24
1362; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1363; VI-NEXT:    s_cmp_lg_u32 s6, 11
1364; VI-NEXT:    v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1365; VI-NEXT:    v_mov_b32_e32 v0, s4
1366; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1367; VI-NEXT:    s_lshr_b32 s4, s10, 16
1368; VI-NEXT:    s_cmp_lg_u32 s6, 10
1369; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1370; VI-NEXT:    v_mov_b32_e32 v1, s4
1371; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1372; VI-NEXT:    s_lshr_b32 s4, s10, 8
1373; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
1374; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1375; VI-NEXT:    s_cmp_lg_u32 s6, 9
1376; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1377; VI-NEXT:    v_mov_b32_e32 v1, s4
1378; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1379; VI-NEXT:    s_cmp_lg_u32 s6, 8
1380; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1381; VI-NEXT:    v_mov_b32_e32 v2, s10
1382; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1383; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
1384; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
1385; VI-NEXT:    s_lshr_b32 s4, s9, 24
1386; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1387; VI-NEXT:    s_cmp_lg_u32 s6, 7
1388; VI-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1389; VI-NEXT:    v_mov_b32_e32 v0, s4
1390; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1391; VI-NEXT:    s_lshr_b32 s4, s9, 16
1392; VI-NEXT:    s_cmp_lg_u32 s6, 6
1393; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1394; VI-NEXT:    v_mov_b32_e32 v1, s4
1395; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1396; VI-NEXT:    s_lshr_b32 s4, s9, 8
1397; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
1398; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1399; VI-NEXT:    s_cmp_lg_u32 s6, 5
1400; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1401; VI-NEXT:    v_mov_b32_e32 v1, s4
1402; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1403; VI-NEXT:    s_cmp_lg_u32 s6, 4
1404; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1405; VI-NEXT:    v_mov_b32_e32 v4, s9
1406; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1407; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
1408; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1409; VI-NEXT:    s_lshr_b32 s4, s8, 24
1410; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1411; VI-NEXT:    s_cmp_lg_u32 s6, 3
1412; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1413; VI-NEXT:    v_mov_b32_e32 v0, s4
1414; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1415; VI-NEXT:    s_lshr_b32 s4, s8, 16
1416; VI-NEXT:    s_cmp_lg_u32 s6, 2
1417; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1418; VI-NEXT:    v_mov_b32_e32 v4, s4
1419; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1420; VI-NEXT:    s_lshr_b32 s4, s8, 8
1421; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
1422; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1423; VI-NEXT:    s_cmp_lg_u32 s6, 1
1424; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1425; VI-NEXT:    v_mov_b32_e32 v4, s4
1426; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1427; VI-NEXT:    s_cmp_lg_u32 s6, 0
1428; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1429; VI-NEXT:    v_mov_b32_e32 v5, s8
1430; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1431; VI-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
1432; VI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
1433; VI-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1434; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1435; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1436; VI-NEXT:    s_endpgm
1437  %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
1438  store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
1439  ret void
1440}
1441
1442; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
1443; the compiler doesn't crash.
1444define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
1445; SI-LABEL: insert_split_bb:
1446; SI:       ; %bb.0: ; %entry
1447; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
1448; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1449; SI-NEXT:    s_waitcnt lgkmcnt(0)
1450; SI-NEXT:    s_cmp_lg_u32 s6, 0
1451; SI-NEXT:    s_cbranch_scc0 .LBB30_4
1452; SI-NEXT:  ; %bb.1: ; %else
1453; SI-NEXT:    s_load_dword s7, s[2:3], 0x1
1454; SI-NEXT:    s_mov_b64 s[4:5], 0
1455; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
1456; SI-NEXT:    s_waitcnt lgkmcnt(0)
1457; SI-NEXT:    s_mov_b64 vcc, vcc
1458; SI-NEXT:    s_cbranch_vccnz .LBB30_3
1459; SI-NEXT:  .LBB30_2: ; %if
1460; SI-NEXT:    s_load_dword s7, s[2:3], 0x0
1461; SI-NEXT:  .LBB30_3: ; %endif
1462; SI-NEXT:    s_waitcnt lgkmcnt(0)
1463; SI-NEXT:    v_mov_b32_e32 v0, s6
1464; SI-NEXT:    s_mov_b32 s3, 0x100f000
1465; SI-NEXT:    s_mov_b32 s2, -1
1466; SI-NEXT:    v_mov_b32_e32 v1, s7
1467; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1468; SI-NEXT:    s_endpgm
1469; SI-NEXT:  .LBB30_4:
1470; SI-NEXT:    s_branch .LBB30_2
1471;
1472; VI-LABEL: insert_split_bb:
1473; VI:       ; %bb.0: ; %entry
1474; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
1475; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1476; VI-NEXT:    s_waitcnt lgkmcnt(0)
1477; VI-NEXT:    s_cmp_lg_u32 s6, 0
1478; VI-NEXT:    s_cbranch_scc0 .LBB30_4
1479; VI-NEXT:  ; %bb.1: ; %else
1480; VI-NEXT:    s_load_dword s7, s[2:3], 0x4
1481; VI-NEXT:    s_cbranch_execnz .LBB30_3
1482; VI-NEXT:  .LBB30_2: ; %if
1483; VI-NEXT:    s_waitcnt lgkmcnt(0)
1484; VI-NEXT:    s_load_dword s7, s[2:3], 0x0
1485; VI-NEXT:  .LBB30_3: ; %endif
1486; VI-NEXT:    s_waitcnt lgkmcnt(0)
1487; VI-NEXT:    v_mov_b32_e32 v0, s6
1488; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1489; VI-NEXT:    s_mov_b32 s2, -1
1490; VI-NEXT:    v_mov_b32_e32 v1, s7
1491; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1492; VI-NEXT:    s_endpgm
1493; VI-NEXT:  .LBB30_4:
1494; VI-NEXT:    s_branch .LBB30_2
1495entry:
1496  %0 = insertelement <2 x i32> undef, i32 %a, i32 0
1497  %1 = icmp eq i32 %a, 0
1498  br i1 %1, label %if, label %else
1499
1500if:
1501  %2 = load i32, i32 addrspace(1)* %in
1502  %3 = insertelement <2 x i32> %0, i32 %2, i32 1
1503  br label %endif
1504
1505else:
1506  %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
1507  %5 = load i32, i32 addrspace(1)* %4
1508  %6 = insertelement <2 x i32> %0, i32 %5, i32 1
1509  br label %endif
1510
1511endif:
1512  %7 = phi <2 x i32> [%3, %if], [%6, %else]
1513  store <2 x i32> %7, <2 x i32> addrspace(1)* %out
1514  ret void
1515}
1516
1517define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
1518; SI-LABEL: dynamic_insertelement_v2f64:
1519; SI:       ; %bb.0:
1520; SI-NEXT:    s_load_dword s8, s[4:5], 0x18
1521; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xc
1522; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1523; SI-NEXT:    v_mov_b32_e32 v1, 0x40200000
1524; SI-NEXT:    s_mov_b32 s7, 0x100f000
1525; SI-NEXT:    s_waitcnt lgkmcnt(0)
1526; SI-NEXT:    s_cmp_eq_u32 s8, 1
1527; SI-NEXT:    v_mov_b32_e32 v0, s3
1528; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1529; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
1530; SI-NEXT:    v_mov_b32_e32 v0, s2
1531; SI-NEXT:    s_cmp_eq_u32 s8, 0
1532; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
1533; SI-NEXT:    v_mov_b32_e32 v0, s1
1534; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1535; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
1536; SI-NEXT:    v_mov_b32_e32 v0, s0
1537; SI-NEXT:    s_mov_b32 s6, -1
1538; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
1539; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1540; SI-NEXT:    s_endpgm
1541;
1542; VI-LABEL: dynamic_insertelement_v2f64:
1543; VI:       ; %bb.0:
1544; VI-NEXT:    s_load_dword s8, s[4:5], 0x60
1545; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x30
1546; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1547; VI-NEXT:    v_mov_b32_e32 v1, 0x40200000
1548; VI-NEXT:    s_mov_b32 s7, 0x1100f000
1549; VI-NEXT:    s_waitcnt lgkmcnt(0)
1550; VI-NEXT:    s_cmp_eq_u32 s8, 1
1551; VI-NEXT:    v_mov_b32_e32 v0, s3
1552; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1553; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
1554; VI-NEXT:    v_mov_b32_e32 v0, s2
1555; VI-NEXT:    s_cmp_eq_u32 s8, 0
1556; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
1557; VI-NEXT:    v_mov_b32_e32 v0, s1
1558; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1559; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
1560; VI-NEXT:    v_mov_b32_e32 v0, s0
1561; VI-NEXT:    s_mov_b32 s6, -1
1562; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
1563; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1564; VI-NEXT:    s_endpgm
1565  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
1566  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
1567  ret void
1568}
1569
1570define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
1571; SI-LABEL: dynamic_insertelement_v2i64:
1572; SI:       ; %bb.0:
1573; SI-NEXT:    s_load_dword s10, s[4:5], 0x8
1574; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
1575; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1576; SI-NEXT:    s_mov_b32 s7, 0x100f000
1577; SI-NEXT:    s_mov_b32 s6, -1
1578; SI-NEXT:    s_waitcnt lgkmcnt(0)
1579; SI-NEXT:    s_cmp_eq_u32 s10, 1
1580; SI-NEXT:    v_mov_b32_e32 v0, s3
1581; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
1582; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[8:9]
1583; SI-NEXT:    v_mov_b32_e32 v0, s2
1584; SI-NEXT:    s_cmp_eq_u32 s10, 0
1585; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[8:9]
1586; SI-NEXT:    v_mov_b32_e32 v0, s1
1587; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
1588; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[2:3]
1589; SI-NEXT:    v_mov_b32_e32 v0, s0
1590; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[2:3]
1591; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1592; SI-NEXT:    s_endpgm
1593;
1594; VI-LABEL: dynamic_insertelement_v2i64:
1595; VI:       ; %bb.0:
1596; VI-NEXT:    s_load_dword s10, s[4:5], 0x20
1597; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
1598; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1599; VI-NEXT:    s_mov_b32 s7, 0x1100f000
1600; VI-NEXT:    s_mov_b32 s6, -1
1601; VI-NEXT:    s_waitcnt lgkmcnt(0)
1602; VI-NEXT:    s_cmp_eq_u32 s10, 1
1603; VI-NEXT:    v_mov_b32_e32 v0, s3
1604; VI-NEXT:    s_cselect_b64 s[8:9], -1, 0
1605; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[8:9]
1606; VI-NEXT:    v_mov_b32_e32 v0, s2
1607; VI-NEXT:    s_cmp_eq_u32 s10, 0
1608; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[8:9]
1609; VI-NEXT:    v_mov_b32_e32 v0, s1
1610; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
1611; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[2:3]
1612; VI-NEXT:    v_mov_b32_e32 v0, s0
1613; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[2:3]
1614; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1615; VI-NEXT:    s_endpgm
1616  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
1617  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
1618  ret void
1619}
1620
1621define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
1622; SI-LABEL: dynamic_insertelement_v3i64:
1623; SI:       ; %bb.0:
1624; SI-NEXT:    s_load_dword s12, s[4:5], 0x10
1625; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1626; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
1627; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xc
1628; SI-NEXT:    s_mov_b32 s3, 0x100f000
1629; SI-NEXT:    s_waitcnt lgkmcnt(0)
1630; SI-NEXT:    s_cmp_eq_u32 s12, 1
1631; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
1632; SI-NEXT:    v_mov_b32_e32 v0, s11
1633; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[6:7]
1634; SI-NEXT:    v_mov_b32_e32 v0, s10
1635; SI-NEXT:    s_cmp_eq_u32 s12, 0
1636; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[6:7]
1637; SI-NEXT:    v_mov_b32_e32 v0, s9
1638; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
1639; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[6:7]
1640; SI-NEXT:    v_mov_b32_e32 v0, s8
1641; SI-NEXT:    s_cmp_eq_u32 s12, 2
1642; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[6:7]
1643; SI-NEXT:    v_mov_b32_e32 v4, s5
1644; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
1645; SI-NEXT:    v_cndmask_b32_e64 v5, v4, 0, s[6:7]
1646; SI-NEXT:    v_mov_b32_e32 v4, s4
1647; SI-NEXT:    s_mov_b32 s2, -1
1648; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 5, s[6:7]
1649; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
1650; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1651; SI-NEXT:    s_endpgm
1652;
1653; VI-LABEL: dynamic_insertelement_v3i64:
1654; VI:       ; %bb.0:
1655; VI-NEXT:    s_load_dword s12, s[4:5], 0x40
1656; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1657; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
1658; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x30
1659; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1660; VI-NEXT:    s_waitcnt lgkmcnt(0)
1661; VI-NEXT:    s_cmp_eq_u32 s12, 1
1662; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
1663; VI-NEXT:    v_mov_b32_e32 v0, s11
1664; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[6:7]
1665; VI-NEXT:    v_mov_b32_e32 v0, s10
1666; VI-NEXT:    s_cmp_eq_u32 s12, 0
1667; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[6:7]
1668; VI-NEXT:    v_mov_b32_e32 v0, s9
1669; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
1670; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[6:7]
1671; VI-NEXT:    v_mov_b32_e32 v0, s8
1672; VI-NEXT:    s_cmp_eq_u32 s12, 2
1673; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[6:7]
1674; VI-NEXT:    v_mov_b32_e32 v4, s5
1675; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
1676; VI-NEXT:    v_cndmask_b32_e64 v5, v4, 0, s[6:7]
1677; VI-NEXT:    v_mov_b32_e32 v4, s4
1678; VI-NEXT:    s_mov_b32 s2, -1
1679; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 5, s[6:7]
1680; VI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
1681; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1682; VI-NEXT:    s_endpgm
1683  %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
1684  store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
1685  ret void
1686}
1687
1688define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
1689; SI-LABEL: dynamic_insertelement_v4f64:
1690; SI:       ; %bb.0:
1691; SI-NEXT:    s_load_dword s6, s[4:5], 0x10
1692; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
1693; SI-NEXT:    v_mov_b32_e32 v4, 0x40200000
1694; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1695; SI-NEXT:    s_mov_b32 s3, 0x100f000
1696; SI-NEXT:    s_waitcnt lgkmcnt(0)
1697; SI-NEXT:    s_cmp_eq_u32 s6, 1
1698; SI-NEXT:    v_mov_b32_e32 v0, s11
1699; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1700; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
1701; SI-NEXT:    v_mov_b32_e32 v0, s10
1702; SI-NEXT:    s_cmp_eq_u32 s6, 0
1703; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
1704; SI-NEXT:    v_mov_b32_e32 v0, s9
1705; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1706; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
1707; SI-NEXT:    v_mov_b32_e32 v0, s8
1708; SI-NEXT:    s_cmp_eq_u32 s6, 3
1709; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
1710; SI-NEXT:    v_mov_b32_e32 v5, s15
1711; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1712; SI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
1713; SI-NEXT:    v_mov_b32_e32 v5, s14
1714; SI-NEXT:    s_cmp_eq_u32 s6, 2
1715; SI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
1716; SI-NEXT:    v_mov_b32_e32 v5, s13
1717; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1718; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
1719; SI-NEXT:    v_mov_b32_e32 v4, s12
1720; SI-NEXT:    s_mov_b32 s2, -1
1721; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
1722; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1723; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1724; SI-NEXT:    s_endpgm
1725;
1726; VI-LABEL: dynamic_insertelement_v4f64:
1727; VI:       ; %bb.0:
1728; VI-NEXT:    s_load_dword s6, s[4:5], 0x40
1729; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
1730; VI-NEXT:    v_mov_b32_e32 v4, 0x40200000
1731; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1732; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1733; VI-NEXT:    s_waitcnt lgkmcnt(0)
1734; VI-NEXT:    s_cmp_eq_u32 s6, 1
1735; VI-NEXT:    v_mov_b32_e32 v0, s11
1736; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1737; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
1738; VI-NEXT:    v_mov_b32_e32 v0, s10
1739; VI-NEXT:    s_cmp_eq_u32 s6, 0
1740; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
1741; VI-NEXT:    v_mov_b32_e32 v0, s9
1742; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1743; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
1744; VI-NEXT:    v_mov_b32_e32 v0, s8
1745; VI-NEXT:    s_cmp_eq_u32 s6, 3
1746; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
1747; VI-NEXT:    v_mov_b32_e32 v5, s15
1748; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1749; VI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
1750; VI-NEXT:    v_mov_b32_e32 v5, s14
1751; VI-NEXT:    s_cmp_eq_u32 s6, 2
1752; VI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
1753; VI-NEXT:    v_mov_b32_e32 v5, s13
1754; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1755; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
1756; VI-NEXT:    v_mov_b32_e32 v4, s12
1757; VI-NEXT:    s_mov_b32 s2, -1
1758; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
1759; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1760; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1761; VI-NEXT:    s_endpgm
1762  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
1763  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
1764  ret void
1765}
1766
1767define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
1768; SI-LABEL: dynamic_insertelement_v8f64:
1769; SI:       ; %bb.0:
1770; SI-NEXT:    s_load_dword s6, s[4:5], 0x20
1771; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
1772; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1773; SI-NEXT:    v_mov_b32_e32 v16, 0x40200000
1774; SI-NEXT:    s_mov_b32 s3, 0x100f000
1775; SI-NEXT:    s_waitcnt lgkmcnt(0)
1776; SI-NEXT:    s_lshl_b32 s4, s6, 1
1777; SI-NEXT:    v_mov_b32_e32 v0, s8
1778; SI-NEXT:    v_mov_b32_e32 v1, s9
1779; SI-NEXT:    v_mov_b32_e32 v2, s10
1780; SI-NEXT:    v_mov_b32_e32 v3, s11
1781; SI-NEXT:    v_mov_b32_e32 v4, s12
1782; SI-NEXT:    v_mov_b32_e32 v5, s13
1783; SI-NEXT:    v_mov_b32_e32 v6, s14
1784; SI-NEXT:    v_mov_b32_e32 v7, s15
1785; SI-NEXT:    v_mov_b32_e32 v8, s16
1786; SI-NEXT:    v_mov_b32_e32 v9, s17
1787; SI-NEXT:    v_mov_b32_e32 v10, s18
1788; SI-NEXT:    v_mov_b32_e32 v11, s19
1789; SI-NEXT:    v_mov_b32_e32 v12, s20
1790; SI-NEXT:    v_mov_b32_e32 v13, s21
1791; SI-NEXT:    v_mov_b32_e32 v14, s22
1792; SI-NEXT:    v_mov_b32_e32 v15, s23
1793; SI-NEXT:    s_mov_b32 m0, s4
1794; SI-NEXT:    v_movreld_b32_e32 v0, 0
1795; SI-NEXT:    s_mov_b32 s2, -1
1796; SI-NEXT:    v_movreld_b32_e32 v1, v16
1797; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
1798; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
1799; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1800; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1801; SI-NEXT:    s_endpgm
1802;
1803; VI-LABEL: dynamic_insertelement_v8f64:
1804; VI:       ; %bb.0:
1805; VI-NEXT:    s_load_dword s6, s[4:5], 0x80
1806; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
1807; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1808; VI-NEXT:    v_mov_b32_e32 v16, 0x40200000
1809; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1810; VI-NEXT:    s_waitcnt lgkmcnt(0)
1811; VI-NEXT:    s_lshl_b32 s4, s6, 1
1812; VI-NEXT:    v_mov_b32_e32 v0, s8
1813; VI-NEXT:    v_mov_b32_e32 v1, s9
1814; VI-NEXT:    v_mov_b32_e32 v2, s10
1815; VI-NEXT:    v_mov_b32_e32 v3, s11
1816; VI-NEXT:    v_mov_b32_e32 v4, s12
1817; VI-NEXT:    v_mov_b32_e32 v5, s13
1818; VI-NEXT:    v_mov_b32_e32 v6, s14
1819; VI-NEXT:    v_mov_b32_e32 v7, s15
1820; VI-NEXT:    v_mov_b32_e32 v8, s16
1821; VI-NEXT:    v_mov_b32_e32 v9, s17
1822; VI-NEXT:    v_mov_b32_e32 v10, s18
1823; VI-NEXT:    v_mov_b32_e32 v11, s19
1824; VI-NEXT:    v_mov_b32_e32 v12, s20
1825; VI-NEXT:    v_mov_b32_e32 v13, s21
1826; VI-NEXT:    v_mov_b32_e32 v14, s22
1827; VI-NEXT:    v_mov_b32_e32 v15, s23
1828; VI-NEXT:    s_mov_b32 m0, s4
1829; VI-NEXT:    v_movreld_b32_e32 v0, 0
1830; VI-NEXT:    s_mov_b32 s2, -1
1831; VI-NEXT:    v_movreld_b32_e32 v1, v16
1832; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
1833; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
1834; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1835; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1836; VI-NEXT:    s_endpgm
1837  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
1838  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
1839  ret void
1840}
1841
1842declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
1843
1844attributes #0 = { nounwind }
1845attributes #1 = { nounwind readnone }
1846