1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
4
5; FIXME: Broken on evergreen
6; FIXME: For some reason the 8 and 16 vectors are being stored as
7; individual elements instead of 128-bit stores.
8
define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
; SI-LABEL: insertelement_v2f32_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v2f32_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Constant insert at index 0 of <2 x float>: 5.0 (0x40a00000) is materialized
; directly into v0 of the store pair; the untouched lane 1 is copied from s5.
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0
  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
  ret void
}
37
define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
; SI-LABEL: insertelement_v2f32_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v2f32_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Same as index 0 above but for lane 1: the constant lands in v1 and the
; preserved lane 0 is copied from s4.
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1
  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
  ret void
}
66
define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
; SI-LABEL: insertelement_v2i32_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v2i32_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, 0x3e7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Integer variant of the index-0 insert: 999 (0x3e7) is materialized into v0.
  %vecins = insertelement <2 x i32> %a, i32 999, i32 0
  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
  ret void
}
95
define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
; SI-LABEL: insertelement_v2i32_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, 0x3e7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v2i32_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, 0x3e7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Integer variant of the index-1 insert: 0x3e7 lands in v1, lane 0 from s4.
  %vecins = insertelement <2 x i32> %a, i32 999, i32 1
  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
  ret void
}
124
125; FIXME: Why is the constant moved into the intermediate register and
126; not just directly into the vector component?
define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4f32_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Note (see FIXME above): for the <4 x float> case the constant is first
; written into the SGPR lane (s4) and only then copied to v0, instead of
; being moved into the vector register directly.
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
161
define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s5, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4f32_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s5, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Lane-1 insert: the SGPR holding element 1 (s5) is overwritten with the
; constant before the full vector is copied to VGPRs and stored.
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
196
define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4f32_2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s6, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Lane-2 insert: s6 is overwritten with the constant before the copy/store.
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
231
define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_3:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s7, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4f32_3:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s7, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Lane-3 insert: s7 is overwritten with the constant before the copy/store.
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
266
define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
; SI-LABEL: insertelement_v4i32_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_movk_i32 s4, 0x3e7
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4i32_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s4, 0x3e7
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Integer lane-0 insert into <4 x i32>: 999 is written into s4 via s_movk_i32
; (16-bit signed immediate form), then the whole vector is copied and stored.
  %vecins = insertelement <4 x i32> %a, i32 999, i32 0
  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  ret void
}
301
define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
; SI-LABEL: insertelement_v3f32_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v3f32_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; <3 x float> lane-1 insert: constant goes straight into v1, the other two
; lanes come from s4/s6, and the result uses a dwordx3 store.
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
332
define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
; SI-LABEL: insertelement_v3f32_2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v3f32_2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; <3 x float> lane-2 insert: constant into v2, lanes 0-1 from s4/s5.
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
363
define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
; GCN-LABEL: insertelement_v3f32_3:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_endpgm
; Index 3 is out of bounds for <3 x float>, so the insert yields poison and
; the entire store is eliminated; only the program-end remains.
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
372
define <4 x float> @insertelement_to_sgpr() nounwind {
; GCN-LABEL: insertelement_to_sgpr:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s12, 0
; GCN-NEXT:    s_mov_b32 s4, s12
; GCN-NEXT:    s_mov_b32 s5, s12
; GCN-NEXT:    s_mov_b32 s6, s12
; GCN-NEXT:    s_mov_b32 s7, s12
; GCN-NEXT:    s_mov_b32 s8, s12
; GCN-NEXT:    s_mov_b32 s9, s12
; GCN-NEXT:    s_mov_b32 s10, s12
; GCN-NEXT:    s_mov_b32 s11, s12
; GCN-NEXT:    image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
; Checks that an insert into a vector consumed by an SGPR operand (the
; sampler descriptor of image_gather4_lz) stays on the scalar unit: lane 0
; of the loaded <4 x i32> is overwritten with a plain s_mov_b32 s12, 0.
  %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
  %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
  %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
  ret <4 x float> %tmp2
}
396
define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT:    s_load_dword s4, s[4:5], 0x4
; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Dynamic (runtime) index: lowered to one compare + conditional-move pair
; per element, selecting between the original lane and the constant.
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
  ret void
}
437
define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s10
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v3, s8
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s10
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s8
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Dynamic index into <3 x float>: three cmp/cndmask pairs (indices 2,1,0),
; then a single dwordx3 store of the selected vector.
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
484
define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s11
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s10
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s8
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s11
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s10
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s8
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Dynamic index into <4 x float>: four cmp/cndmask pairs (indices 3..0) and
; a single dwordx4 store.
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
537
define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v8f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
; SI-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s15
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s14
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
; SI-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
; SI-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v8, s12
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
; VI-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s15
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v5, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s14
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
; VI-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
; VI-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc
; VI-NEXT:    v_mov_b32_e32 v8, s12
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Dynamic index into <8 x float>: eight cmp/cndmask pairs; the result is
; written back as two dwordx4 stores (high half at offset:16, then low half).
  %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
  store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
  ret void
}
616
define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
; SI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_mov_b32_e32 v3, s11
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_mov_b32_e32 v6, s14
; SI-NEXT:    v_mov_b32_e32 v7, s15
; SI-NEXT:    v_mov_b32_e32 v8, s16
; SI-NEXT:    v_mov_b32_e32 v9, s17
; SI-NEXT:    v_mov_b32_e32 v10, s18
; SI-NEXT:    v_mov_b32_e32 v11, s19
; SI-NEXT:    v_mov_b32_e32 v12, s20
; SI-NEXT:    v_mov_b32_e32 v13, s21
; SI-NEXT:    v_mov_b32_e32 v14, s22
; SI-NEXT:    v_mov_b32_e32 v15, s23
; SI-NEXT:    s_mov_b32 m0, s4
; SI-NEXT:    v_movreld_b32_e32 v0, v16
; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v16f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
; VI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_mov_b32_e32 v6, s14
; VI-NEXT:    v_mov_b32_e32 v7, s15
; VI-NEXT:    v_mov_b32_e32 v8, s16
; VI-NEXT:    v_mov_b32_e32 v9, s17
; VI-NEXT:    v_mov_b32_e32 v10, s18
; VI-NEXT:    v_mov_b32_e32 v11, s19
; VI-NEXT:    v_mov_b32_e32 v12, s20
; VI-NEXT:    v_mov_b32_e32 v13, s21
; VI-NEXT:    v_mov_b32_e32 v14, s22
; VI-NEXT:    v_mov_b32_e32 v15, s23
; VI-NEXT:    s_mov_b32 m0, s4
; VI-NEXT:    v_movreld_b32_e32 v0, v16
; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; For 16 elements the per-element cndmask strategy is abandoned: the index is
; loaded into m0 and a single v_movreld_b32 performs an indirect VGPR write
; relative to v0, followed by four dwordx4 stores.
  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
  store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
  ret void
}
687
define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT:    s_load_dword s4, s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s4, 1
; VI-NEXT:    s_cselect_b32 s5, s7, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 0
; VI-NEXT:    s_cselect_b32 s4, s6, 5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Dynamic insert of inline-immediate 5: SI selects per lane on the vector
; unit (v_cndmask with imm 5); VI keeps the whole select scalar via
; s_cmp_lg_u32 + s_cselect_b32 and only copies to VGPRs for the store.
  %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
  ret void
}
726
define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s4, 2
; VI-NEXT:    s_cselect_b32 s5, s10, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 1
; VI-NEXT:    s_cselect_b32 s6, s9, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 0
; VI-NEXT:    s_cselect_b32 s4, s8, 5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s6
; VI-NEXT:    v_mov_b32_e32 v2, s5
; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Same SI/VI split as the v2i32 case, extended to three lanes; the result is
; stored with a single dwordx3.
  %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
  store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
  ret void
}
771
; Insert a non-constant scalar %val into a <4 x i32> at runtime index %b.
; The [8 x i32] kernarg padding pushes %val to its own far offset (0x11 on
; SI, 0x44 on VI), exercising a separate s_load_dword for the value.
; CHECK lines are autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
; SI-LABEL: dynamic_insertelement_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
; SI-NEXT:    s_load_dword s4, s[4:5], 0x11
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 3
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 2
; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s6, s[4:5], 0x20
; VI-NEXT:    s_load_dword s4, s[4:5], 0x44
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s6, 3
; VI-NEXT:    s_cselect_b32 s5, s4, s11
; VI-NEXT:    s_cmp_eq_u32 s6, 2
; VI-NEXT:    s_cselect_b32 s7, s4, s10
; VI-NEXT:    s_cmp_eq_u32 s6, 1
; VI-NEXT:    s_cselect_b32 s9, s4, s9
; VI-NEXT:    s_cmp_eq_u32 s6, 0
; VI-NEXT:    s_cselect_b32 s4, s4, s8
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s7
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  ret void
}
825
; Insert constant 5 into a <8 x i32> at runtime index %b; the 256-bit result
; is stored as two dwordx4 stores (offset:16 then offset:0). SI compares per
; lane with v_cndmask, VI with scalar s_cselect. CHECK lines are
; autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v8i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; SI-NEXT:    v_cndmask_b32_e32 v3, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s15
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
; SI-NEXT:    v_cndmask_b32_e32 v7, 5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s14
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
; SI-NEXT:    v_cndmask_b32_e32 v6, 5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s13
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s4, 3
; VI-NEXT:    s_cselect_b32 s5, s11, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 2
; VI-NEXT:    s_cselect_b32 s6, s10, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 1
; VI-NEXT:    s_cselect_b32 s7, s9, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 0
; VI-NEXT:    s_cselect_b32 s8, s8, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 7
; VI-NEXT:    s_cselect_b32 s9, s15, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 6
; VI-NEXT:    s_cselect_b32 s10, s14, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 5
; VI-NEXT:    s_cselect_b32 s11, s13, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 4
; VI-NEXT:    s_cselect_b32 s4, s12, 5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s11
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s9
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
  store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
  ret void
}
903
; Insert constant 5 into a <16 x i32> at runtime index %b. At this width both
; targets switch strategy: the vector is materialized in v0-v15 and the
; dynamic lane is written via m0 + v_movreld_b32 (indirect VGPR write), then
; stored as four dwordx4 stores. CHECK lines are autogenerated by
; update_llc_test_checks.py.
define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_mov_b32_e32 v3, s11
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_mov_b32_e32 v6, s14
; SI-NEXT:    v_mov_b32_e32 v7, s15
; SI-NEXT:    v_mov_b32_e32 v8, s16
; SI-NEXT:    v_mov_b32_e32 v9, s17
; SI-NEXT:    v_mov_b32_e32 v10, s18
; SI-NEXT:    v_mov_b32_e32 v11, s19
; SI-NEXT:    v_mov_b32_e32 v12, s20
; SI-NEXT:    v_mov_b32_e32 v13, s21
; SI-NEXT:    v_mov_b32_e32 v14, s22
; SI-NEXT:    v_mov_b32_e32 v15, s23
; SI-NEXT:    s_mov_b32 m0, s4
; SI-NEXT:    v_movreld_b32_e32 v0, 5
; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v16i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_mov_b32_e32 v6, s14
; VI-NEXT:    v_mov_b32_e32 v7, s15
; VI-NEXT:    v_mov_b32_e32 v8, s16
; VI-NEXT:    v_mov_b32_e32 v9, s17
; VI-NEXT:    v_mov_b32_e32 v10, s18
; VI-NEXT:    v_mov_b32_e32 v11, s19
; VI-NEXT:    v_mov_b32_e32 v12, s20
; VI-NEXT:    v_mov_b32_e32 v13, s21
; VI-NEXT:    v_mov_b32_e32 v14, s22
; VI-NEXT:    v_mov_b32_e32 v15, s23
; VI-NEXT:    s_mov_b32 m0, s4
; VI-NEXT:    v_movreld_b32_e32 v0, 5
; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
  store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
  ret void
}
972
; Insert i16 5 into a <2 x i16> at runtime index %b. Lowered as scalar bit
; manipulation on the packed 32-bit value: build a 0xffff lane mask shifted
; by 16*%b, clear that lane (s_andn2), and OR in the splatted value
; (0x50005 masked to the lane). CHECK lines are autogenerated by
; update_llc_test_checks.py.
define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0x2
; SI-NEXT:    s_load_dword s4, s[4:5], 0x3
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s4, s4, 4
; SI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; SI-NEXT:    s_andn2_b32 s5, s6, s4
; SI-NEXT:    s_and_b32 s4, s4, 0x50005
; SI-NEXT:    s_or_b32 s4, s4, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x8
; VI-NEXT:    s_load_dword s4, s[4:5], 0xc
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s4, s4, 4
; VI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; VI-NEXT:    s_andn2_b32 s5, s6, s4
; VI-NEXT:    s_and_b32 s4, s4, 0x50005
; VI-NEXT:    s_or_b32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
  ret void
}
1011
; Insert i16 5 into a <3 x i16> at runtime index %b. Same mask/clear/OR
; technique as v2i16 but over a 64-bit SGPR pair (s_lshl_b64 of 0xffff),
; stored as a dword plus a trailing short at offset:4. CHECK lines are
; autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT:    s_load_dword s4, s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s8, s4, 4
; SI-NEXT:    s_mov_b64 s[4:5], 0xffff
; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
; SI-NEXT:    s_mov_b32 s8, 0x50005
; SI-NEXT:    s_and_b32 s9, s5, s8
; SI-NEXT:    s_and_b32 s8, s4, s8
; SI-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
; SI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s8, s4, 4
; VI-NEXT:    s_mov_b64 s[4:5], 0xffff
; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
; VI-NEXT:    s_mov_b32 s8, 0x50005
; VI-NEXT:    s_mov_b32 s9, s8
; VI-NEXT:    s_andn2_b64 s[6:7], s[6:7], s[4:5]
; VI-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
; VI-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
  store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
  ret void
}
1060
; Insert i8 5 into a <2 x i8> at runtime index %b (byte-lane mask shifted by
; 8*%b, clear, OR in splatted 0x505). SI stays fully scalar; VI goes through
; 16-bit VALU ops (v_lshlrev_b16). The [8 x i32] paddings force distinct
; kernarg offsets. CHECK lines are autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0xa
; SI-NEXT:    s_load_dword s4, s[4:5], 0x13
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s4, s4, 3
; SI-NEXT:    s_lshl_b32 s4, -1, s4
; SI-NEXT:    s_andn2_b32 s5, s6, s4
; SI-NEXT:    s_and_b32 s4, s4, 0x505
; SI-NEXT:    s_or_b32 s4, s4, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x28
; VI-NEXT:    s_load_dword s4, s[4:5], 0x4c
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s4, s4, 3
; VI-NEXT:    v_lshlrev_b16_e64 v0, s4, -1
; VI-NEXT:    v_not_b32_e32 v1, v0
; VI-NEXT:    v_and_b32_e32 v1, s6, v1
; VI-NEXT:    v_and_b32_e32 v0, 0x505, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
  store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
  ret void
}
1099
1100; FIXME: post legalize i16 and i32 shifts aren't merged because of
1101; isTypeDesirableForOp in SimplifyDemandedBits
; Insert i8 5 into a <3 x i8> at runtime index %b via the packed-dword mask
; trick (splat constant 0x5050505), then store low short + high byte at
; offset:2. SI and VI lower identically apart from kernarg offsets and the
; rsrc word. CHECK lines are autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0xa
; SI-NEXT:    s_load_dword s4, s[4:5], 0x13
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s4, s4, 3
; SI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; SI-NEXT:    s_andn2_b32 s5, s6, s4
; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
; SI-NEXT:    s_or_b32 s4, s4, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    s_lshr_b32 s5, s4, 16
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x28
; VI-NEXT:    s_load_dword s4, s[4:5], 0x4c
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s4, s4, 3
; VI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; VI-NEXT:    s_andn2_b32 s5, s6, s4
; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
; VI-NEXT:    s_or_b32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    s_lshr_b32 s5, s4, 16
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
  store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
  ret void
}
1146
; Insert i8 5 into a <4 x i8> at runtime index %b; single packed-dword
; mask/clear/OR (splat 0x5050505) and one dword store. CHECK lines are
; autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0xa
; SI-NEXT:    s_load_dword s4, s[4:5], 0x13
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s4, s4, 3
; SI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; SI-NEXT:    s_andn2_b32 s5, s6, s4
; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
; SI-NEXT:    s_or_b32 s4, s4, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x28
; VI-NEXT:    s_load_dword s4, s[4:5], 0x4c
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s4, s4, 3
; VI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; VI-NEXT:    s_andn2_b32 s5, s6, s4
; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
; VI-NEXT:    s_or_b32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
  store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
  ret void
}
1185
; Like the other i8 tests but the vector comes from memory: load <8 x i8>
; through a constant (addrspace(4)) pointer, insert i8 5 at runtime index %b
; with a 64-bit byte-lane mask, and store the packed result as dwordx2.
; CHECK lines are autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
; SI-LABEL: s_dynamic_insertelement_v8i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[4:5], s[10:11], 0x0
; SI-NEXT:    s_mov_b32 s0, s8
; SI-NEXT:    s_lshl_b32 s8, s6, 3
; SI-NEXT:    s_mov_b64 s[6:7], 0xffff
; SI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
; SI-NEXT:    s_mov_b32 s8, 0x5050505
; SI-NEXT:    s_mov_b32 s1, s9
; SI-NEXT:    s_and_b32 s9, s7, s8
; SI-NEXT:    s_and_b32 s8, s6, s8
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
; SI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_dynamic_insertelement_v8i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[4:5], s[10:11], 0x0
; VI-NEXT:    s_mov_b32 s0, s8
; VI-NEXT:    s_lshl_b32 s8, s6, 3
; VI-NEXT:    s_mov_b64 s[6:7], 0xffff
; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
; VI-NEXT:    s_mov_b32 s8, 0x5050505
; VI-NEXT:    s_mov_b32 s1, s9
; VI-NEXT:    s_and_b32 s9, s7, s8
; VI-NEXT:    s_and_b32 s8, s6, s8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
; VI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4
  %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
  store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
  ret void
}
1239
; Insert i8 5 into a <16 x i8> at runtime index %b. This lowers fully
; scalarized: each byte is extracted (s_lshr), conditionally replaced with 5
; (v_cmp_ne/v_cndmask per lane), and repacked -- plain shift/and/or on SI,
; v_or_b32_sdwa byte/word selects on VI -- before a single dwordx4 store.
; CHECK lines are autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s5, s11, 24
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 15
; SI-NEXT:    s_lshr_b32 s5, s11, 16
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 14
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    s_movk_i32 s5, 0xff
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v1, s5, v1
; SI-NEXT:    s_lshr_b32 s6, s11, 8
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_mov_b32_e32 v1, s6
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 13
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s11
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 12
; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT:    v_and_b32_e32 v2, s5, v2
; SI-NEXT:    v_or_b32_e32 v1, v2, v1
; SI-NEXT:    s_mov_b32 s6, 0xffff
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v1, s6, v1
; SI-NEXT:    s_lshr_b32 s7, s10, 24
; SI-NEXT:    v_or_b32_e32 v3, v1, v0
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 11
; SI-NEXT:    s_lshr_b32 s7, s10, 16
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 10
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v1, s5, v1
; SI-NEXT:    s_lshr_b32 s7, s10, 8
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 9
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 8
; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT:    v_and_b32_e32 v2, s5, v2
; SI-NEXT:    v_or_b32_e32 v1, v2, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v1, s6, v1
; SI-NEXT:    s_lshr_b32 s7, s9, 24
; SI-NEXT:    v_or_b32_e32 v2, v1, v0
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
; SI-NEXT:    s_lshr_b32 s7, s9, 16
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v1, s5, v1
; SI-NEXT:    s_lshr_b32 s7, s9, 8
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s9
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT:    v_and_b32_e32 v4, s5, v4
; SI-NEXT:    v_or_b32_e32 v1, v4, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v1, s6, v1
; SI-NEXT:    s_lshr_b32 s7, s8, 24
; SI-NEXT:    v_or_b32_e32 v1, v1, v0
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; SI-NEXT:    s_lshr_b32 s7, s8, 16
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v4, s5, v4
; SI-NEXT:    s_lshr_b32 s7, s8, 8
; SI-NEXT:    v_or_b32_e32 v0, v4, v0
; SI-NEXT:    v_mov_b32_e32 v4, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s8
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT:    v_and_b32_e32 v5, s5, v5
; SI-NEXT:    v_or_b32_e32 v4, v5, v4
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v4, s6, v4
; SI-NEXT:    v_or_b32_e32 v0, v4, v0
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v16i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s5, s11, 24
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 15
; VI-NEXT:    s_lshr_b32 s5, s11, 16
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 14
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT:    s_lshr_b32 s5, s11, 8
; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 13
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s11
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 12
; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    s_lshr_b32 s5, s10, 24
; VI-NEXT:    v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 11
; VI-NEXT:    s_lshr_b32 s5, s10, 16
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 10
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT:    s_lshr_b32 s5, s10, 8
; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 9
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 8
; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    s_lshr_b32 s5, s9, 24
; VI-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
; VI-NEXT:    s_lshr_b32 s5, s9, 16
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT:    s_lshr_b32 s5, s9, 8
; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s9
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    s_lshr_b32 s5, s8, 24
; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; VI-NEXT:    s_lshr_b32 s5, s8, 16
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT:    s_lshr_b32 s5, s8, 8
; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v4, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s8
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
; VI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
; VI-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
  store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
  ret void
}
1444
1445; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
1446; the compiler doesn't crash.
; Insertelement whose lane-1 value comes from a phi across an if/else split
; (lane 0 is inserted in the entry block). Per the comment above, this
; exercises INSERT_SUBREG handling in SIFixSGPRCopies; the test mainly
; guards against a compiler crash. CHECK lines are autogenerated by
; update_llc_test_checks.py.
define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
; SI-LABEL: insert_split_bb:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_lg_u32 s6, 0
; SI-NEXT:    s_cbranch_scc0 BB30_2
; SI-NEXT:  ; %bb.1: ; %else
; SI-NEXT:    s_load_dword s7, s[2:3], 0x1
; SI-NEXT:    s_mov_b64 s[4:5], 0
; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 vcc, vcc
; SI-NEXT:    s_cbranch_vccz BB30_3
; SI-NEXT:    s_branch BB30_4
; SI-NEXT:  BB30_2:
; SI-NEXT:  BB30_3: ; %if
; SI-NEXT:    s_load_dword s7, s[2:3], 0x0
; SI-NEXT:  BB30_4: ; %endif
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insert_split_bb:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s6, 0
; VI-NEXT:    s_cbranch_scc0 BB30_2
; VI-NEXT:  ; %bb.1: ; %else
; VI-NEXT:    s_load_dword s7, s[2:3], 0x4
; VI-NEXT:    s_cbranch_execz BB30_3
; VI-NEXT:    s_branch BB30_4
; VI-NEXT:  BB30_2:
; VI-NEXT:  BB30_3: ; %if
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s7, s[2:3], 0x0
; VI-NEXT:  BB30_4: ; %endif
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
entry:
  %0 = insertelement <2 x i32> undef, i32 %a, i32 0
  %1 = icmp eq i32 %a, 0
  br i1 %1, label %if, label %else

if:
  %2 = load i32, i32 addrspace(1)* %in
  %3 = insertelement <2 x i32> %0, i32 %2, i32 1
  br label %endif

else:
  %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %5 = load i32, i32 addrspace(1)* %4
  %6 = insertelement <2 x i32> %0, i32 %5, i32 1
  br label %endif

endif:
  %7 = phi <2 x i32> [%3, %if], [%6, %else]
  store <2 x i32> %7, <2 x i32> addrspace(1)* %out
  ret void
}
1519
; Insert the constant 8.0 into a <2 x double> at a variable (run-time) index.
; The [8 x i32] padding arguments spread the kernel-argument offsets apart
; (vector at 0xc / index at 0x18 dwords on SI; byte offsets 0x30 / 0x60 on
; VI).  Codegen is expected to lower this as per-element v_cndmask selects
; (0x40200000:0x00000000 is the i32 pair encoding double 8.0) followed by a
; single 128-bit store.
define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xc
; SI-NEXT:    s_load_dword s4, s[4:5], 0x18
; SI-NEXT:    v_mov_b32_e32 v1, 0x40200000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x30
; VI-NEXT:    s_load_dword s4, s[4:5], 0x60
; VI-NEXT:    v_mov_b32_e32 v1, 0x40200000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
  ret void
}
1568
; Insert the constant i64 5 into a <2 x i64> at a variable index.  Because
; both halves of the inserted value (5 and 0) are inline constants, codegen
; uses v_cndmask_b32_e64 with immediate operands for both 32-bit halves of
; each element, then a single 128-bit store (the IR store is only align 8,
; but max-private-element-size-16 keeps the dwordx4 store).
define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 1
; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s6, s[4:5], 0x20
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 1
; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 0
; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
  ret void
}
1615
; Insert the constant i64 5 into a <3 x i64> (an awkward, non-power-of-two
; vector) at a variable index.  Elements 0-1 are handled like the v2i64 case
; with per-half v_cndmask selects; the odd third element is selected
; separately and stored with a dwordx2 at offset 16 alongside the dwordx4
; for the first two elements.
define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xc
; SI-NEXT:    s_load_dword s12, s[4:5], 0x10
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_mov_b32_e32 v4, s7
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 1
; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 2
; SI-NEXT:    v_cndmask_b32_e64 v5, v4, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v4, s6
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 5, s[4:5]
; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x30
; VI-NEXT:    s_load_dword s12, s[4:5], 0x40
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 1
; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 0
; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v4, s7
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 2
; VI-NEXT:    v_cndmask_b32_e64 v5, v4, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v4, s6
; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 5, s[4:5]
; VI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
  store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
  ret void
}
1676
; Insert the constant 8.0 into a <4 x double> at a variable index.  With four
; 64-bit elements the compare-and-select strategy is still used: one
; v_cmp_eq_u32 per element index (0..3) and a v_cndmask per 32-bit half
; (0x40200000:0 encodes double 8.0), with the result written out as two
; 128-bit stores.
define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
; SI-NEXT:    v_mov_b32_e32 v4, 0x40200000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s15
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 3
; SI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s14
; SI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
; VI-NEXT:    v_mov_b32_e32 v4, 0x40200000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s15
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 3
; VI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s14
; VI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 2
; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
  ret void
}
1747
; Insert the constant 8.0 into a <8 x double> at a variable index.  At this
; vector size codegen switches strategy: instead of per-element compare/select
; chains it copies all 16 dwords to VGPRs and uses indirect register writes
; (m0 = index*2, then v_movreld_b32 of the low half 0 and high half
; 0x40200000), followed by four 128-bit stores.
define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
; SI-LABEL: dynamic_insertelement_v8f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
; SI-NEXT:    v_mov_b32_e32 v16, 0x40200000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_lshl_b32 s4, s4, 1
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_mov_b32_e32 v3, s11
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_mov_b32_e32 v6, s14
; SI-NEXT:    v_mov_b32_e32 v7, s15
; SI-NEXT:    v_mov_b32_e32 v8, s16
; SI-NEXT:    v_mov_b32_e32 v9, s17
; SI-NEXT:    v_mov_b32_e32 v10, s18
; SI-NEXT:    v_mov_b32_e32 v11, s19
; SI-NEXT:    v_mov_b32_e32 v12, s20
; SI-NEXT:    v_mov_b32_e32 v13, s21
; SI-NEXT:    v_mov_b32_e32 v14, s22
; SI-NEXT:    v_mov_b32_e32 v15, s23
; SI-NEXT:    s_mov_b32 m0, s4
; SI-NEXT:    v_movreld_b32_e32 v0, 0
; SI-NEXT:    v_movreld_b32_e32 v1, v16
; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
; VI-NEXT:    v_mov_b32_e32 v16, 0x40200000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    s_lshl_b32 s4, s4, 1
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_mov_b32_e32 v6, s14
; VI-NEXT:    v_mov_b32_e32 v7, s15
; VI-NEXT:    v_mov_b32_e32 v8, s16
; VI-NEXT:    v_mov_b32_e32 v9, s17
; VI-NEXT:    v_mov_b32_e32 v10, s18
; VI-NEXT:    v_mov_b32_e32 v11, s19
; VI-NEXT:    v_mov_b32_e32 v12, s20
; VI-NEXT:    v_mov_b32_e32 v13, s21
; VI-NEXT:    v_mov_b32_e32 v14, s22
; VI-NEXT:    v_mov_b32_e32 v15, s23
; VI-NEXT:    s_mov_b32 m0, s4
; VI-NEXT:    v_movreld_b32_e32 v0, 0
; VI-NEXT:    v_movreld_b32_e32 v1, v16
; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
  ret void
}
1822
1823declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
1824
1825attributes #0 = { nounwind }
1826attributes #1 = { nounwind readnone }
1827