1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
4
5; FIXME: Broken on evergreen
6; FIXME: For some reason the 8 and 16 vectors are being stored as
7; individual elements instead of 128-bit stores.
8
; Constant-index insert of 5.0 (0x40a00000) into lane 0 of a <2 x float> kernel
; arg: the immediate is materialized directly into v0, the surviving lane (s5)
; is copied to v1, and both lanes are written with a single dwordx2 store.
9define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
10; SI-LABEL: insertelement_v2f32_0:
11; SI:       ; %bb.0:
12; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
14; SI-NEXT:    s_mov_b32 s3, 0x100f000
15; SI-NEXT:    s_mov_b32 s2, -1
16; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
17; SI-NEXT:    s_waitcnt lgkmcnt(0)
18; SI-NEXT:    v_mov_b32_e32 v1, s5
19; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
20; SI-NEXT:    s_endpgm
21;
22; VI-LABEL: insertelement_v2f32_0:
23; VI:       ; %bb.0:
24; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
25; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
26; VI-NEXT:    s_mov_b32 s3, 0x1100f000
27; VI-NEXT:    s_mov_b32 s2, -1
28; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
29; VI-NEXT:    s_waitcnt lgkmcnt(0)
30; VI-NEXT:    v_mov_b32_e32 v1, s5
31; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
32; VI-NEXT:    s_endpgm
33  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0
34  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
35  ret void
36}
37
; Same as @insertelement_v2f32_0 but targeting lane 1: the immediate goes into
; v1 and the kept lane (s4) into v0 before the combined dwordx2 store.
38define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
39; SI-LABEL: insertelement_v2f32_1:
40; SI:       ; %bb.0:
41; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
42; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
43; SI-NEXT:    s_mov_b32 s3, 0x100f000
44; SI-NEXT:    s_mov_b32 s2, -1
45; SI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
46; SI-NEXT:    s_waitcnt lgkmcnt(0)
47; SI-NEXT:    v_mov_b32_e32 v0, s4
48; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
49; SI-NEXT:    s_endpgm
50;
51; VI-LABEL: insertelement_v2f32_1:
52; VI:       ; %bb.0:
53; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
54; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
55; VI-NEXT:    s_mov_b32 s3, 0x1100f000
56; VI-NEXT:    s_mov_b32 s2, -1
57; VI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
58; VI-NEXT:    s_waitcnt lgkmcnt(0)
59; VI-NEXT:    v_mov_b32_e32 v0, s4
60; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
61; VI-NEXT:    s_endpgm
62  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1
63  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
64  ret void
65}
66
; Integer variant: insert i32 999 (0x3e7) into lane 0 of <2 x i32>. Mirrors the
; f32 case — immediate straight into v0, kept lane into v1, one dwordx2 store.
67define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
68; SI-LABEL: insertelement_v2i32_0:
69; SI:       ; %bb.0:
70; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
71; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
72; SI-NEXT:    s_mov_b32 s3, 0x100f000
73; SI-NEXT:    s_mov_b32 s2, -1
74; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
75; SI-NEXT:    s_waitcnt lgkmcnt(0)
76; SI-NEXT:    v_mov_b32_e32 v1, s5
77; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
78; SI-NEXT:    s_endpgm
79;
80; VI-LABEL: insertelement_v2i32_0:
81; VI:       ; %bb.0:
82; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
83; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
84; VI-NEXT:    s_mov_b32 s3, 0x1100f000
85; VI-NEXT:    s_mov_b32 s2, -1
86; VI-NEXT:    v_mov_b32_e32 v0, 0x3e7
87; VI-NEXT:    s_waitcnt lgkmcnt(0)
88; VI-NEXT:    v_mov_b32_e32 v1, s5
89; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
90; VI-NEXT:    s_endpgm
91  %vecins = insertelement <2 x i32> %a, i32 999, i32 0
92  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
93  ret void
94}
95
; Insert i32 999 into lane 1 of <2 x i32>: immediate into v1, lane 0 (s4) into
; v0, single dwordx2 store of the result.
96define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
97; SI-LABEL: insertelement_v2i32_1:
98; SI:       ; %bb.0:
99; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
100; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
101; SI-NEXT:    s_mov_b32 s3, 0x100f000
102; SI-NEXT:    s_mov_b32 s2, -1
103; SI-NEXT:    v_mov_b32_e32 v1, 0x3e7
104; SI-NEXT:    s_waitcnt lgkmcnt(0)
105; SI-NEXT:    v_mov_b32_e32 v0, s4
106; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
107; SI-NEXT:    s_endpgm
108;
109; VI-LABEL: insertelement_v2i32_1:
110; VI:       ; %bb.0:
111; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
112; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
113; VI-NEXT:    s_mov_b32 s3, 0x1100f000
114; VI-NEXT:    s_mov_b32 s2, -1
115; VI-NEXT:    v_mov_b32_e32 v1, 0x3e7
116; VI-NEXT:    s_waitcnt lgkmcnt(0)
117; VI-NEXT:    v_mov_b32_e32 v0, s4
118; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
119; VI-NEXT:    s_endpgm
120  %vecins = insertelement <2 x i32> %a, i32 999, i32 1
121  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
122  ret void
123}
124
125; FIXME: Why is the constant moved into the intermediate register and
126; not just directly into the vector component?
; Insert 5.0 into lane 0 of <4 x float>. Note (see FIXME above): here the
; constant takes a detour through SGPR s4 (s_mov_b32 then v_mov_b32) instead of
; being moved directly into the vector component as in the v2f32 cases.
127define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
128; SI-LABEL: insertelement_v4f32_0:
129; SI:       ; %bb.0:
130; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
131; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
132; SI-NEXT:    s_waitcnt lgkmcnt(0)
133; SI-NEXT:    s_mov_b32 s4, 0x40a00000
134; SI-NEXT:    s_mov_b32 s3, 0x100f000
135; SI-NEXT:    s_mov_b32 s2, -1
136; SI-NEXT:    v_mov_b32_e32 v0, s4
137; SI-NEXT:    v_mov_b32_e32 v1, s5
138; SI-NEXT:    v_mov_b32_e32 v2, s6
139; SI-NEXT:    v_mov_b32_e32 v3, s7
140; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
141; SI-NEXT:    s_endpgm
142;
143; VI-LABEL: insertelement_v4f32_0:
144; VI:       ; %bb.0:
145; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
146; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
147; VI-NEXT:    s_waitcnt lgkmcnt(0)
148; VI-NEXT:    s_mov_b32 s4, 0x40a00000
149; VI-NEXT:    s_mov_b32 s3, 0x1100f000
150; VI-NEXT:    s_mov_b32 s2, -1
151; VI-NEXT:    v_mov_b32_e32 v0, s4
152; VI-NEXT:    v_mov_b32_e32 v1, s5
153; VI-NEXT:    v_mov_b32_e32 v2, s6
154; VI-NEXT:    v_mov_b32_e32 v3, s7
155; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
156; VI-NEXT:    s_endpgm
157  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
158  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
159  ret void
160}
161
; Insert 5.0 into lane 1 of <4 x float>: the constant overwrites s5 in the
; loaded SGPR quad before the four v_mov copies and the dwordx4 store.
162define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
163; SI-LABEL: insertelement_v4f32_1:
164; SI:       ; %bb.0:
165; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
166; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
167; SI-NEXT:    s_waitcnt lgkmcnt(0)
168; SI-NEXT:    s_mov_b32 s5, 0x40a00000
169; SI-NEXT:    s_mov_b32 s3, 0x100f000
170; SI-NEXT:    s_mov_b32 s2, -1
171; SI-NEXT:    v_mov_b32_e32 v0, s4
172; SI-NEXT:    v_mov_b32_e32 v1, s5
173; SI-NEXT:    v_mov_b32_e32 v2, s6
174; SI-NEXT:    v_mov_b32_e32 v3, s7
175; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
176; SI-NEXT:    s_endpgm
177;
178; VI-LABEL: insertelement_v4f32_1:
179; VI:       ; %bb.0:
180; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
181; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
182; VI-NEXT:    s_waitcnt lgkmcnt(0)
183; VI-NEXT:    s_mov_b32 s5, 0x40a00000
184; VI-NEXT:    s_mov_b32 s3, 0x1100f000
185; VI-NEXT:    s_mov_b32 s2, -1
186; VI-NEXT:    v_mov_b32_e32 v0, s4
187; VI-NEXT:    v_mov_b32_e32 v1, s5
188; VI-NEXT:    v_mov_b32_e32 v2, s6
189; VI-NEXT:    v_mov_b32_e32 v3, s7
190; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
191; VI-NEXT:    s_endpgm
192  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
193  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
194  ret void
195}
196
; Insert 5.0 into lane 2 of <4 x float>: identical pattern, constant clobbers
; s6 of the loaded quad before the dwordx4 store.
197define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
198; SI-LABEL: insertelement_v4f32_2:
199; SI:       ; %bb.0:
200; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
201; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
202; SI-NEXT:    s_waitcnt lgkmcnt(0)
203; SI-NEXT:    s_mov_b32 s6, 0x40a00000
204; SI-NEXT:    s_mov_b32 s3, 0x100f000
205; SI-NEXT:    s_mov_b32 s2, -1
206; SI-NEXT:    v_mov_b32_e32 v0, s4
207; SI-NEXT:    v_mov_b32_e32 v1, s5
208; SI-NEXT:    v_mov_b32_e32 v2, s6
209; SI-NEXT:    v_mov_b32_e32 v3, s7
210; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
211; SI-NEXT:    s_endpgm
212;
213; VI-LABEL: insertelement_v4f32_2:
214; VI:       ; %bb.0:
215; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
216; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
217; VI-NEXT:    s_waitcnt lgkmcnt(0)
218; VI-NEXT:    s_mov_b32 s6, 0x40a00000
219; VI-NEXT:    s_mov_b32 s3, 0x1100f000
220; VI-NEXT:    s_mov_b32 s2, -1
221; VI-NEXT:    v_mov_b32_e32 v0, s4
222; VI-NEXT:    v_mov_b32_e32 v1, s5
223; VI-NEXT:    v_mov_b32_e32 v2, s6
224; VI-NEXT:    v_mov_b32_e32 v3, s7
225; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
226; VI-NEXT:    s_endpgm
227  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
228  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
229  ret void
230}
231
; Insert 5.0 into lane 3 of <4 x float>: constant clobbers s7 of the loaded
; quad before the dwordx4 store.
232define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
233; SI-LABEL: insertelement_v4f32_3:
234; SI:       ; %bb.0:
235; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
236; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
237; SI-NEXT:    s_waitcnt lgkmcnt(0)
238; SI-NEXT:    s_mov_b32 s7, 0x40a00000
239; SI-NEXT:    s_mov_b32 s3, 0x100f000
240; SI-NEXT:    s_mov_b32 s2, -1
241; SI-NEXT:    v_mov_b32_e32 v0, s4
242; SI-NEXT:    v_mov_b32_e32 v1, s5
243; SI-NEXT:    v_mov_b32_e32 v2, s6
244; SI-NEXT:    v_mov_b32_e32 v3, s7
245; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
246; SI-NEXT:    s_endpgm
247;
248; VI-LABEL: insertelement_v4f32_3:
249; VI:       ; %bb.0:
250; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
251; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
252; VI-NEXT:    s_waitcnt lgkmcnt(0)
253; VI-NEXT:    s_mov_b32 s7, 0x40a00000
254; VI-NEXT:    s_mov_b32 s3, 0x1100f000
255; VI-NEXT:    s_mov_b32 s2, -1
256; VI-NEXT:    v_mov_b32_e32 v0, s4
257; VI-NEXT:    v_mov_b32_e32 v1, s5
258; VI-NEXT:    v_mov_b32_e32 v2, s6
259; VI-NEXT:    v_mov_b32_e32 v3, s7
260; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
261; VI-NEXT:    s_endpgm
262  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
263  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
264  ret void
265}
266
; Integer v4 variant: insert i32 999 into lane 0; 999 fits a 16-bit immediate
; so s_movk_i32 is used to overwrite s4 before the dwordx4 store.
267define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
268; SI-LABEL: insertelement_v4i32_0:
269; SI:       ; %bb.0:
270; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
271; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
272; SI-NEXT:    s_waitcnt lgkmcnt(0)
273; SI-NEXT:    s_movk_i32 s4, 0x3e7
274; SI-NEXT:    s_mov_b32 s3, 0x100f000
275; SI-NEXT:    s_mov_b32 s2, -1
276; SI-NEXT:    v_mov_b32_e32 v0, s4
277; SI-NEXT:    v_mov_b32_e32 v1, s5
278; SI-NEXT:    v_mov_b32_e32 v2, s6
279; SI-NEXT:    v_mov_b32_e32 v3, s7
280; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
281; SI-NEXT:    s_endpgm
282;
283; VI-LABEL: insertelement_v4i32_0:
284; VI:       ; %bb.0:
285; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
286; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
287; VI-NEXT:    s_waitcnt lgkmcnt(0)
288; VI-NEXT:    s_movk_i32 s4, 0x3e7
289; VI-NEXT:    s_mov_b32 s3, 0x1100f000
290; VI-NEXT:    s_mov_b32 s2, -1
291; VI-NEXT:    v_mov_b32_e32 v0, s4
292; VI-NEXT:    v_mov_b32_e32 v1, s5
293; VI-NEXT:    v_mov_b32_e32 v2, s6
294; VI-NEXT:    v_mov_b32_e32 v3, s7
295; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
296; VI-NEXT:    s_endpgm
297  %vecins = insertelement <4 x i32> %a, i32 999, i32 0
298  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
299  ret void
300}
301
; <3 x float> insert at lane 1: immediate moved directly into v1, lanes 0 and 2
; copied from s4/s6, and the result written with a 96-bit dwordx3 store.
302define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
303; SI-LABEL: insertelement_v3f32_1:
304; SI:       ; %bb.0:
305; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
306; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
307; SI-NEXT:    s_mov_b32 s3, 0x100f000
308; SI-NEXT:    s_mov_b32 s2, -1
309; SI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
310; SI-NEXT:    s_waitcnt lgkmcnt(0)
311; SI-NEXT:    v_mov_b32_e32 v0, s4
312; SI-NEXT:    v_mov_b32_e32 v2, s6
313; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
314; SI-NEXT:    s_endpgm
315;
316; VI-LABEL: insertelement_v3f32_1:
317; VI:       ; %bb.0:
318; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
319; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
320; VI-NEXT:    s_mov_b32 s3, 0x1100f000
321; VI-NEXT:    s_mov_b32 s2, -1
322; VI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
323; VI-NEXT:    s_waitcnt lgkmcnt(0)
324; VI-NEXT:    v_mov_b32_e32 v0, s4
325; VI-NEXT:    v_mov_b32_e32 v2, s6
326; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
327; VI-NEXT:    s_endpgm
328  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
329  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
330  ret void
331}
332
; <3 x float> insert at lane 2: immediate into v2, surviving lanes from s4/s5,
; dwordx3 store.
333define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
334; SI-LABEL: insertelement_v3f32_2:
335; SI:       ; %bb.0:
336; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
337; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
338; SI-NEXT:    s_mov_b32 s3, 0x100f000
339; SI-NEXT:    s_mov_b32 s2, -1
340; SI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
341; SI-NEXT:    s_waitcnt lgkmcnt(0)
342; SI-NEXT:    v_mov_b32_e32 v0, s4
343; SI-NEXT:    v_mov_b32_e32 v1, s5
344; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
345; SI-NEXT:    s_endpgm
346;
347; VI-LABEL: insertelement_v3f32_2:
348; VI:       ; %bb.0:
349; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
350; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
351; VI-NEXT:    s_mov_b32 s3, 0x1100f000
352; VI-NEXT:    s_mov_b32 s2, -1
353; VI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
354; VI-NEXT:    s_waitcnt lgkmcnt(0)
355; VI-NEXT:    v_mov_b32_e32 v0, s4
356; VI-NEXT:    v_mov_b32_e32 v1, s5
357; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
358; VI-NEXT:    s_endpgm
359  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
360  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
361  ret void
362}
363
; Out-of-bounds constant index (3 into a 3-element vector): the insertelement
; result is poison, so the whole kernel legitimately folds to a bare s_endpgm
; on both subtargets (hence the shared GCN prefix).
364define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
365; GCN-LABEL: insertelement_v3f32_3:
366; GCN:       ; %bb.0:
367; GCN-NEXT:    s_endpgm
368  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
369  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
370  ret void
371}
372
; Non-kernel (default CC) function: the inserted value (i32 0) must land in an
; SGPR because it feeds the resource operand of an image instruction. Codegen
; loads the descriptor, overwrites its first dword with 0 (s12), and then
; broadcasts s12 through s[4:11] for the gather4's sampler/resource operands.
; Checks use the shared GCN prefix since SI and VI agree here.
373define <4 x float> @insertelement_to_sgpr() nounwind {
374; GCN-LABEL: insertelement_to_sgpr:
375; GCN:       ; %bb.0:
376; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377; GCN-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
378; GCN-NEXT:    s_waitcnt lgkmcnt(0)
379; GCN-NEXT:    s_mov_b32 s12, 0
380; GCN-NEXT:    s_mov_b32 s4, s12
381; GCN-NEXT:    s_mov_b32 s5, s12
382; GCN-NEXT:    s_mov_b32 s6, s12
383; GCN-NEXT:    s_mov_b32 s7, s12
384; GCN-NEXT:    s_mov_b32 s8, s12
385; GCN-NEXT:    s_mov_b32 s9, s12
386; GCN-NEXT:    s_mov_b32 s10, s12
387; GCN-NEXT:    s_mov_b32 s11, s12
388; GCN-NEXT:    image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1
389; GCN-NEXT:    s_waitcnt vmcnt(0)
390; GCN-NEXT:    s_setpc_b64 s[30:31]
391  %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
392  %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
393  %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
394  ret <4 x float> %tmp2
395}
396
; Dynamic (runtime) index into <2 x float>: with max-private-element-size-16 no
; stack spill is used; instead each lane is selected with s_cmp_lg_u32 against
; its index, producing vcc for a v_cndmask between the 5.0 immediate (v0) and
; the original lane value. SI and VI emit the same shape, differing only in
; kernarg offsets and the sRSRC dword3 constant.
397define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
398; SI-LABEL: dynamic_insertelement_v2f32:
399; SI:       ; %bb.0:
400; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
401; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
402; SI-NEXT:    s_load_dword s4, s[4:5], 0x4
403; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
404; SI-NEXT:    s_mov_b32 s3, 0x100f000
405; SI-NEXT:    s_mov_b32 s2, -1
406; SI-NEXT:    s_waitcnt lgkmcnt(0)
407; SI-NEXT:    v_mov_b32_e32 v1, s7
408; SI-NEXT:    s_cmp_lg_u32 s4, 1
409; SI-NEXT:    s_cselect_b64 vcc, -1, 0
410; SI-NEXT:    s_cmp_lg_u32 s4, 0
411; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
412; SI-NEXT:    v_mov_b32_e32 v2, s6
413; SI-NEXT:    s_cselect_b64 vcc, -1, 0
414; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
415; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
416; SI-NEXT:    s_endpgm
417;
418; VI-LABEL: dynamic_insertelement_v2f32:
419; VI:       ; %bb.0:
420; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
421; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
422; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
423; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
424; VI-NEXT:    s_mov_b32 s3, 0x1100f000
425; VI-NEXT:    s_mov_b32 s2, -1
426; VI-NEXT:    s_waitcnt lgkmcnt(0)
427; VI-NEXT:    v_mov_b32_e32 v1, s7
428; VI-NEXT:    s_cmp_lg_u32 s4, 1
429; VI-NEXT:    s_cselect_b64 vcc, -1, 0
430; VI-NEXT:    s_cmp_lg_u32 s4, 0
431; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
432; VI-NEXT:    v_mov_b32_e32 v2, s6
433; VI-NEXT:    s_cselect_b64 vcc, -1, 0
434; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
435; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
436; VI-NEXT:    s_endpgm
437  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
438  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
439  ret void
440}
441
; Dynamic index into <3 x float>: same per-lane compare-and-cndmask expansion
; as the v2f32 case, extended to three lanes (indices 2, 1, 0) and finished
; with a dwordx3 store. The argument is fetched as a dwordx4 (s[8:11]) because
; a <3 x float> kernarg occupies a 16-byte slot.
442define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
443; SI-LABEL: dynamic_insertelement_v3f32:
444; SI:       ; %bb.0:
445; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
446; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
447; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
448; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
449; SI-NEXT:    s_mov_b32 s3, 0x100f000
450; SI-NEXT:    s_mov_b32 s2, -1
451; SI-NEXT:    s_waitcnt lgkmcnt(0)
452; SI-NEXT:    v_mov_b32_e32 v1, s10
453; SI-NEXT:    s_cmp_lg_u32 s4, 2
454; SI-NEXT:    s_cselect_b64 vcc, -1, 0
455; SI-NEXT:    s_cmp_lg_u32 s4, 1
456; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
457; SI-NEXT:    s_cselect_b64 vcc, -1, 0
458; SI-NEXT:    v_mov_b32_e32 v1, s9
459; SI-NEXT:    s_cmp_lg_u32 s4, 0
460; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
461; SI-NEXT:    v_mov_b32_e32 v3, s8
462; SI-NEXT:    s_cselect_b64 vcc, -1, 0
463; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
464; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
465; SI-NEXT:    s_endpgm
466;
467; VI-LABEL: dynamic_insertelement_v3f32:
468; VI:       ; %bb.0:
469; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
470; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
471; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
472; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
473; VI-NEXT:    s_mov_b32 s3, 0x1100f000
474; VI-NEXT:    s_mov_b32 s2, -1
475; VI-NEXT:    s_waitcnt lgkmcnt(0)
476; VI-NEXT:    v_mov_b32_e32 v1, s10
477; VI-NEXT:    s_cmp_lg_u32 s4, 2
478; VI-NEXT:    s_cselect_b64 vcc, -1, 0
479; VI-NEXT:    s_cmp_lg_u32 s4, 1
480; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
481; VI-NEXT:    s_cselect_b64 vcc, -1, 0
482; VI-NEXT:    v_mov_b32_e32 v1, s9
483; VI-NEXT:    s_cmp_lg_u32 s4, 0
484; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
485; VI-NEXT:    v_mov_b32_e32 v3, s8
486; VI-NEXT:    s_cselect_b64 vcc, -1, 0
487; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
488; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
489; VI-NEXT:    s_endpgm
490  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
491  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
492  ret void
493}
494
; Dynamic index into <4 x float>: four-lane compare/cndmask chain (indices 3
; down to 0) selecting between the 5.0 immediate in v0 and each original lane,
; then a single dwordx4 store.
495define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
496; SI-LABEL: dynamic_insertelement_v4f32:
497; SI:       ; %bb.0:
498; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
499; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
500; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
501; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
502; SI-NEXT:    s_mov_b32 s3, 0x100f000
503; SI-NEXT:    s_mov_b32 s2, -1
504; SI-NEXT:    s_waitcnt lgkmcnt(0)
505; SI-NEXT:    v_mov_b32_e32 v1, s11
506; SI-NEXT:    s_cmp_lg_u32 s4, 3
507; SI-NEXT:    s_cselect_b64 vcc, -1, 0
508; SI-NEXT:    s_cmp_lg_u32 s4, 2
509; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
510; SI-NEXT:    s_cselect_b64 vcc, -1, 0
511; SI-NEXT:    v_mov_b32_e32 v1, s10
512; SI-NEXT:    s_cmp_lg_u32 s4, 1
513; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
514; SI-NEXT:    s_cselect_b64 vcc, -1, 0
515; SI-NEXT:    v_mov_b32_e32 v1, s9
516; SI-NEXT:    s_cmp_lg_u32 s4, 0
517; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
518; SI-NEXT:    v_mov_b32_e32 v4, s8
519; SI-NEXT:    s_cselect_b64 vcc, -1, 0
520; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
521; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
522; SI-NEXT:    s_endpgm
523;
524; VI-LABEL: dynamic_insertelement_v4f32:
525; VI:       ; %bb.0:
526; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
527; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
528; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
529; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
530; VI-NEXT:    s_mov_b32 s3, 0x1100f000
531; VI-NEXT:    s_mov_b32 s2, -1
532; VI-NEXT:    s_waitcnt lgkmcnt(0)
533; VI-NEXT:    v_mov_b32_e32 v1, s11
534; VI-NEXT:    s_cmp_lg_u32 s4, 3
535; VI-NEXT:    s_cselect_b64 vcc, -1, 0
536; VI-NEXT:    s_cmp_lg_u32 s4, 2
537; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
538; VI-NEXT:    s_cselect_b64 vcc, -1, 0
539; VI-NEXT:    v_mov_b32_e32 v1, s10
540; VI-NEXT:    s_cmp_lg_u32 s4, 1
541; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
542; VI-NEXT:    s_cselect_b64 vcc, -1, 0
543; VI-NEXT:    v_mov_b32_e32 v1, s9
544; VI-NEXT:    s_cmp_lg_u32 s4, 0
545; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
546; VI-NEXT:    v_mov_b32_e32 v4, s8
547; VI-NEXT:    s_cselect_b64 vcc, -1, 0
548; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
549; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
550; VI-NEXT:    s_endpgm
551  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
552  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
553  ret void
554}
555
; Dynamic index into <8 x float>: still fully expanded as eight per-lane
; compare/cndmask selects (5.0 held in v4), with the result split across two
; dwordx4 stores (offset:16 for lanes 4-7, offset 0 for lanes 0-3). See also
; the top-of-file FIXME about the wider vectors.
556define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
557; SI-LABEL: dynamic_insertelement_v8f32:
558; SI:       ; %bb.0:
559; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
560; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
561; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
562; SI-NEXT:    v_mov_b32_e32 v4, 0x40a00000
563; SI-NEXT:    s_mov_b32 s3, 0x100f000
564; SI-NEXT:    s_mov_b32 s2, -1
565; SI-NEXT:    s_waitcnt lgkmcnt(0)
566; SI-NEXT:    v_mov_b32_e32 v0, s11
567; SI-NEXT:    s_cmp_lg_u32 s4, 3
568; SI-NEXT:    s_cselect_b64 vcc, -1, 0
569; SI-NEXT:    s_cmp_lg_u32 s4, 2
570; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc
571; SI-NEXT:    s_cselect_b64 vcc, -1, 0
572; SI-NEXT:    v_mov_b32_e32 v0, s10
573; SI-NEXT:    s_cmp_lg_u32 s4, 1
574; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
575; SI-NEXT:    s_cselect_b64 vcc, -1, 0
576; SI-NEXT:    v_mov_b32_e32 v0, s9
577; SI-NEXT:    s_cmp_lg_u32 s4, 0
578; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
579; SI-NEXT:    s_cselect_b64 vcc, -1, 0
580; SI-NEXT:    v_mov_b32_e32 v0, s8
581; SI-NEXT:    s_cmp_lg_u32 s4, 7
582; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
583; SI-NEXT:    s_cselect_b64 vcc, -1, 0
584; SI-NEXT:    v_mov_b32_e32 v5, s15
585; SI-NEXT:    s_cmp_lg_u32 s4, 6
586; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v5, vcc
587; SI-NEXT:    s_cselect_b64 vcc, -1, 0
588; SI-NEXT:    v_mov_b32_e32 v5, s14
589; SI-NEXT:    s_cmp_lg_u32 s4, 5
590; SI-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
591; SI-NEXT:    s_cselect_b64 vcc, -1, 0
592; SI-NEXT:    v_mov_b32_e32 v5, s13
593; SI-NEXT:    s_cmp_lg_u32 s4, 4
594; SI-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc
595; SI-NEXT:    v_mov_b32_e32 v8, s12
596; SI-NEXT:    s_cselect_b64 vcc, -1, 0
597; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
598; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
599; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
600; SI-NEXT:    s_endpgm
601;
602; VI-LABEL: dynamic_insertelement_v8f32:
603; VI:       ; %bb.0:
604; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
605; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
606; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
607; VI-NEXT:    v_mov_b32_e32 v4, 0x40a00000
608; VI-NEXT:    s_mov_b32 s3, 0x1100f000
609; VI-NEXT:    s_mov_b32 s2, -1
610; VI-NEXT:    s_waitcnt lgkmcnt(0)
611; VI-NEXT:    v_mov_b32_e32 v0, s11
612; VI-NEXT:    s_cmp_lg_u32 s4, 3
613; VI-NEXT:    s_cselect_b64 vcc, -1, 0
614; VI-NEXT:    s_cmp_lg_u32 s4, 2
615; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc
616; VI-NEXT:    s_cselect_b64 vcc, -1, 0
617; VI-NEXT:    v_mov_b32_e32 v0, s10
618; VI-NEXT:    s_cmp_lg_u32 s4, 1
619; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
620; VI-NEXT:    s_cselect_b64 vcc, -1, 0
621; VI-NEXT:    v_mov_b32_e32 v0, s9
622; VI-NEXT:    s_cmp_lg_u32 s4, 0
623; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
624; VI-NEXT:    s_cselect_b64 vcc, -1, 0
625; VI-NEXT:    v_mov_b32_e32 v0, s8
626; VI-NEXT:    s_cmp_lg_u32 s4, 7
627; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
628; VI-NEXT:    s_cselect_b64 vcc, -1, 0
629; VI-NEXT:    v_mov_b32_e32 v5, s15
630; VI-NEXT:    s_cmp_lg_u32 s4, 6
631; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v5, vcc
632; VI-NEXT:    s_cselect_b64 vcc, -1, 0
633; VI-NEXT:    v_mov_b32_e32 v5, s14
634; VI-NEXT:    s_cmp_lg_u32 s4, 5
635; VI-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
636; VI-NEXT:    s_cselect_b64 vcc, -1, 0
637; VI-NEXT:    v_mov_b32_e32 v5, s13
638; VI-NEXT:    s_cmp_lg_u32 s4, 4
639; VI-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc
640; VI-NEXT:    v_mov_b32_e32 v8, s12
641; VI-NEXT:    s_cselect_b64 vcc, -1, 0
642; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
643; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
644; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
645; VI-NEXT:    s_endpgm
646  %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
647  store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
648  ret void
649}
650
; Dynamic index into <16 x float>: at this width codegen switches strategy —
; instead of a 16-way cndmask chain it copies all lanes into v0-v15, loads the
; index into m0, and uses v_movreld_b32 (indirect VGPR write) to overwrite the
; selected lane with 5.0 from v16, then emits four dwordx4 stores.
651define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
652; SI-LABEL: dynamic_insertelement_v16f32:
653; SI:       ; %bb.0:
654; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
655; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
656; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
657; SI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
658; SI-NEXT:    s_mov_b32 s3, 0x100f000
659; SI-NEXT:    s_mov_b32 s2, -1
660; SI-NEXT:    s_waitcnt lgkmcnt(0)
661; SI-NEXT:    v_mov_b32_e32 v0, s8
662; SI-NEXT:    v_mov_b32_e32 v1, s9
663; SI-NEXT:    v_mov_b32_e32 v2, s10
664; SI-NEXT:    v_mov_b32_e32 v3, s11
665; SI-NEXT:    v_mov_b32_e32 v4, s12
666; SI-NEXT:    v_mov_b32_e32 v5, s13
667; SI-NEXT:    v_mov_b32_e32 v6, s14
668; SI-NEXT:    v_mov_b32_e32 v7, s15
669; SI-NEXT:    v_mov_b32_e32 v8, s16
670; SI-NEXT:    v_mov_b32_e32 v9, s17
671; SI-NEXT:    v_mov_b32_e32 v10, s18
672; SI-NEXT:    v_mov_b32_e32 v11, s19
673; SI-NEXT:    v_mov_b32_e32 v12, s20
674; SI-NEXT:    v_mov_b32_e32 v13, s21
675; SI-NEXT:    v_mov_b32_e32 v14, s22
676; SI-NEXT:    v_mov_b32_e32 v15, s23
677; SI-NEXT:    s_mov_b32 m0, s4
678; SI-NEXT:    v_movreld_b32_e32 v0, v16
679; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
680; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
681; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
682; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
683; SI-NEXT:    s_endpgm
684;
685; VI-LABEL: dynamic_insertelement_v16f32:
686; VI:       ; %bb.0:
687; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
688; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
689; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
690; VI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
691; VI-NEXT:    s_mov_b32 s3, 0x1100f000
692; VI-NEXT:    s_mov_b32 s2, -1
693; VI-NEXT:    s_waitcnt lgkmcnt(0)
694; VI-NEXT:    v_mov_b32_e32 v0, s8
695; VI-NEXT:    v_mov_b32_e32 v1, s9
696; VI-NEXT:    v_mov_b32_e32 v2, s10
697; VI-NEXT:    v_mov_b32_e32 v3, s11
698; VI-NEXT:    v_mov_b32_e32 v4, s12
699; VI-NEXT:    v_mov_b32_e32 v5, s13
700; VI-NEXT:    v_mov_b32_e32 v6, s14
701; VI-NEXT:    v_mov_b32_e32 v7, s15
702; VI-NEXT:    v_mov_b32_e32 v8, s16
703; VI-NEXT:    v_mov_b32_e32 v9, s17
704; VI-NEXT:    v_mov_b32_e32 v10, s18
705; VI-NEXT:    v_mov_b32_e32 v11, s19
706; VI-NEXT:    v_mov_b32_e32 v12, s20
707; VI-NEXT:    v_mov_b32_e32 v13, s21
708; VI-NEXT:    v_mov_b32_e32 v14, s22
709; VI-NEXT:    v_mov_b32_e32 v15, s23
710; VI-NEXT:    s_mov_b32 m0, s4
711; VI-NEXT:    v_movreld_b32_e32 v0, v16
712; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
713; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
714; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
715; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
716; VI-NEXT:    s_endpgm
717  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
718  store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
719  ret void
720}
721
; Dynamic index, integer element: the inserted value 5 is an inline constant,
; so SI folds it directly into v_cndmask as the false operand. VI instead does
; the whole select scalar-side with s_cselect_b32 and only moves the final
; values into VGPRs for the store — a notable SI/VI codegen divergence.
722define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
723; SI-LABEL: dynamic_insertelement_v2i32:
724; SI:       ; %bb.0:
725; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
726; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
727; SI-NEXT:    s_load_dword s4, s[4:5], 0x4
728; SI-NEXT:    s_mov_b32 s3, 0x100f000
729; SI-NEXT:    s_mov_b32 s2, -1
730; SI-NEXT:    s_waitcnt lgkmcnt(0)
731; SI-NEXT:    v_mov_b32_e32 v0, s7
732; SI-NEXT:    s_cmp_lg_u32 s4, 1
733; SI-NEXT:    s_cselect_b64 vcc, -1, 0
734; SI-NEXT:    s_cmp_lg_u32 s4, 0
735; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
736; SI-NEXT:    v_mov_b32_e32 v0, s6
737; SI-NEXT:    s_cselect_b64 vcc, -1, 0
738; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
739; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
740; SI-NEXT:    s_endpgm
741;
742; VI-LABEL: dynamic_insertelement_v2i32:
743; VI:       ; %bb.0:
744; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
745; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
746; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
747; VI-NEXT:    s_mov_b32 s3, 0x1100f000
748; VI-NEXT:    s_mov_b32 s2, -1
749; VI-NEXT:    s_waitcnt lgkmcnt(0)
750; VI-NEXT:    s_cmp_lg_u32 s4, 1
751; VI-NEXT:    s_cselect_b32 s5, s7, 5
752; VI-NEXT:    s_cmp_lg_u32 s4, 0
753; VI-NEXT:    s_cselect_b32 s4, s6, 5
754; VI-NEXT:    v_mov_b32_e32 v0, s4
755; VI-NEXT:    v_mov_b32_e32 v1, s5
756; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
757; VI-NEXT:    s_endpgm
758  %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
759  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
760  ret void
761}
762
; Three-lane version of the v2i32 case: SI uses per-lane v_cndmask with inline
; constant 5, VI keeps the selects scalar (s_cselect_b32) and stores via a
; dwordx3. Kernarg vector is again fetched as a dwordx4 slot.
763define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
764; SI-LABEL: dynamic_insertelement_v3i32:
765; SI:       ; %bb.0:
766; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
767; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
768; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
769; SI-NEXT:    s_mov_b32 s3, 0x100f000
770; SI-NEXT:    s_mov_b32 s2, -1
771; SI-NEXT:    s_waitcnt lgkmcnt(0)
772; SI-NEXT:    v_mov_b32_e32 v0, s10
773; SI-NEXT:    s_cmp_lg_u32 s4, 2
774; SI-NEXT:    s_cselect_b64 vcc, -1, 0
775; SI-NEXT:    s_cmp_lg_u32 s4, 1
776; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
777; SI-NEXT:    s_cselect_b64 vcc, -1, 0
778; SI-NEXT:    v_mov_b32_e32 v0, s9
779; SI-NEXT:    s_cmp_lg_u32 s4, 0
780; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
781; SI-NEXT:    v_mov_b32_e32 v0, s8
782; SI-NEXT:    s_cselect_b64 vcc, -1, 0
783; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
784; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
785; SI-NEXT:    s_endpgm
786;
787; VI-LABEL: dynamic_insertelement_v3i32:
788; VI:       ; %bb.0:
789; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
790; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
791; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
792; VI-NEXT:    s_mov_b32 s3, 0x1100f000
793; VI-NEXT:    s_mov_b32 s2, -1
794; VI-NEXT:    s_waitcnt lgkmcnt(0)
795; VI-NEXT:    s_cmp_lg_u32 s4, 2
796; VI-NEXT:    s_cselect_b32 s5, s10, 5
797; VI-NEXT:    s_cmp_lg_u32 s4, 1
798; VI-NEXT:    s_cselect_b32 s6, s9, 5
799; VI-NEXT:    s_cmp_lg_u32 s4, 0
800; VI-NEXT:    s_cselect_b32 s4, s8, 5
801; VI-NEXT:    v_mov_b32_e32 v0, s4
802; VI-NEXT:    v_mov_b32_e32 v1, s6
803; VI-NEXT:    v_mov_b32_e32 v2, s5
804; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
805; VI-NEXT:    s_endpgm
806  %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
807  store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
808  ret void
809}
810
define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
; SI-LABEL: dynamic_insertelement_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
; SI-NEXT:    s_load_dword s4, s[4:5], 0x11
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    s_cmp_eq_u32 s6, 3
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    s_cmp_eq_u32 s6, 2
; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    s_cmp_eq_u32 s6, 1
; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    s_cmp_eq_u32 s6, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s6, s[4:5], 0x20
; VI-NEXT:    s_load_dword s4, s[4:5], 0x44
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s6, 3
; VI-NEXT:    s_cselect_b32 s5, s4, s11
; VI-NEXT:    s_cmp_eq_u32 s6, 2
; VI-NEXT:    s_cselect_b32 s7, s4, s10
; VI-NEXT:    s_cmp_eq_u32 s6, 1
; VI-NEXT:    s_cselect_b32 s9, s4, s9
; VI-NEXT:    s_cmp_eq_u32 s6, 0
; VI-NEXT:    s_cselect_b32 s4, s4, s8
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s7
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Unlike the neighboring tests, this one inserts a dynamic value (%val, loaded
; separately after an [8 x i32] padding argument) rather than constant 5, so
; both targets use s_cmp_eq_u32 plus a select per lane (v_cndmask on SI,
; s_cselect on VI) and emit one dwordx4 store.
  %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  ret void
}
868
define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v8i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    s_cmp_lg_u32 s4, 3
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_lg_u32 s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v3, 5, v0, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    s_cmp_lg_u32 s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    s_cmp_lg_u32 s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_cmp_lg_u32 s4, 7
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v4, s15
; SI-NEXT:    s_cmp_lg_u32 s4, 6
; SI-NEXT:    v_cndmask_b32_e32 v7, 5, v4, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v4, s14
; SI-NEXT:    s_cmp_lg_u32 s4, 5
; SI-NEXT:    v_cndmask_b32_e32 v6, 5, v4, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v4, s13
; SI-NEXT:    s_cmp_lg_u32 s4, 4
; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s4, 3
; VI-NEXT:    s_cselect_b32 s5, s11, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 2
; VI-NEXT:    s_cselect_b32 s6, s10, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 1
; VI-NEXT:    s_cselect_b32 s7, s9, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 0
; VI-NEXT:    s_cselect_b32 s8, s8, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 7
; VI-NEXT:    s_cselect_b32 s9, s15, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 6
; VI-NEXT:    s_cselect_b32 s10, s14, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 5
; VI-NEXT:    s_cselect_b32 s11, s13, 5
; VI-NEXT:    s_cmp_lg_u32 s4, 4
; VI-NEXT:    s_cselect_b32 s4, s12, 5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s11
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s9
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Dynamic insert of 5 into a <8 x i32>: the select-per-lane strategy scales to
; eight compares (v_cndmask on SI, s_cselect on VI), and the 32-byte result is
; written with two dwordx4 stores (offset:16 then base).
  %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
  store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
  ret void
}
954
define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_mov_b32_e32 v3, s11
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_mov_b32_e32 v6, s14
; SI-NEXT:    v_mov_b32_e32 v7, s15
; SI-NEXT:    v_mov_b32_e32 v8, s16
; SI-NEXT:    v_mov_b32_e32 v9, s17
; SI-NEXT:    v_mov_b32_e32 v10, s18
; SI-NEXT:    v_mov_b32_e32 v11, s19
; SI-NEXT:    v_mov_b32_e32 v12, s20
; SI-NEXT:    v_mov_b32_e32 v13, s21
; SI-NEXT:    v_mov_b32_e32 v14, s22
; SI-NEXT:    v_mov_b32_e32 v15, s23
; SI-NEXT:    s_mov_b32 m0, s4
; SI-NEXT:    v_movreld_b32_e32 v0, 5
; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v16i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_mov_b32_e32 v6, s14
; VI-NEXT:    v_mov_b32_e32 v7, s15
; VI-NEXT:    v_mov_b32_e32 v8, s16
; VI-NEXT:    v_mov_b32_e32 v9, s17
; VI-NEXT:    v_mov_b32_e32 v10, s18
; VI-NEXT:    v_mov_b32_e32 v11, s19
; VI-NEXT:    v_mov_b32_e32 v12, s20
; VI-NEXT:    v_mov_b32_e32 v13, s21
; VI-NEXT:    v_mov_b32_e32 v14, s22
; VI-NEXT:    v_mov_b32_e32 v15, s23
; VI-NEXT:    s_mov_b32 m0, s4
; VI-NEXT:    v_movreld_b32_e32 v0, 5
; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; At 16 elements the select-per-lane approach gives way to an indirect VGPR
; write: both targets copy the vector into v0-v15, set m0 to the index, and
; use a single v_movreld_b32_e32 to write the constant 5, then store with four
; dwordx4 operations.
  %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
  store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
  ret void
}
1023
define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0x2
; SI-NEXT:    s_load_dword s4, s[4:5], 0x3
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s4, s4, 4
; SI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; SI-NEXT:    s_andn2_b32 s5, s6, s4
; SI-NEXT:    s_and_b32 s4, s4, 0x50005
; SI-NEXT:    s_or_b32 s4, s4, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x8
; VI-NEXT:    s_load_dword s4, s[4:5], 0xc
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s4, s4, 4
; VI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; VI-NEXT:    s_andn2_b32 s5, s6, s4
; VI-NEXT:    s_and_b32 s4, s4, 0x50005
; VI-NEXT:    s_or_b32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Sub-dword case: the <2 x i16> lives in one 32-bit SGPR, so both targets build
; a 16-bit lane mask (0xffff << (index*16)), clear that lane with s_andn2, AND
; the mask against 0x50005 (constant 5 replicated into both halves), and OR the
; pieces back together before a single dword store.
  %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
  ret void
}
1062
define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT:    s_load_dword s4, s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s8, s4, 4
; SI-NEXT:    s_mov_b64 s[4:5], 0xffff
; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
; SI-NEXT:    s_mov_b32 s8, 0x50005
; SI-NEXT:    s_and_b32 s9, s5, s8
; SI-NEXT:    s_and_b32 s8, s4, s8
; SI-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
; SI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s8, s4, 4
; VI-NEXT:    s_mov_b64 s[4:5], 0xffff
; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
; VI-NEXT:    s_mov_b32 s8, 0x50005
; VI-NEXT:    s_mov_b32 s9, s8
; VI-NEXT:    s_andn2_b64 s[6:7], s[6:7], s[4:5]
; VI-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
; VI-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
; <3 x i16> spans an SGPR pair, so the lane-mask blend is done with 64-bit
; scalar ops (s_lshl_b64 / s_andn2_b64 / s_or_b64) against 0x50005 in each
; half; the 48-bit result is stored as a short at offset 4 plus a base dword.
  %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
  store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
  ret void
}
1111
define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0xa
; SI-NEXT:    s_load_dword s4, s[4:5], 0x13
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s4, s4, 3
; SI-NEXT:    s_lshl_b32 s4, -1, s4
; SI-NEXT:    s_andn2_b32 s5, s6, s4
; SI-NEXT:    s_and_b32 s4, s4, 0x505
; SI-NEXT:    s_or_b32 s4, s4, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x28
; VI-NEXT:    s_load_dword s4, s[4:5], 0x4c
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s4, s4, 3
; VI-NEXT:    v_lshlrev_b16_e64 v0, s4, -1
; VI-NEXT:    v_not_b32_e32 v1, v0
; VI-NEXT:    v_and_b32_e32 v1, s6, v1
; VI-NEXT:    v_and_b32_e32 v0, 0x505, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Byte-granularity blend: mask = -1 << (index*8), cleared lane ORed with 0x505
; (5 replicated per byte). SI does it with scalar ops; VI goes through VALU
; 16-bit shifts (v_lshlrev_b16). The [8 x i32] arguments only pad the kernarg
; layout, which is why the loads use large offsets. Result stored as a short.
  %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
  store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
  ret void
}
1150
1151; FIXME: post legalize i16 and i32 shifts aren't merged because of
1152; isTypeDesirableForOp in SimplifyDemandedBits
define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0xa
; SI-NEXT:    s_load_dword s4, s[4:5], 0x13
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s4, s4, 3
; SI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; SI-NEXT:    s_andn2_b32 s5, s6, s4
; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
; SI-NEXT:    s_or_b32 s4, s4, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    s_lshr_b32 s5, s4, 16
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x28
; VI-NEXT:    s_load_dword s4, s[4:5], 0x4c
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s4, s4, 3
; VI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; VI-NEXT:    s_andn2_b32 s5, s6, s4
; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
; VI-NEXT:    s_or_b32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    s_lshr_b32 s5, s4, 16
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
; VI-NEXT:    s_endpgm
; Scalar mask blend within one 32-bit register (0xffff << (index*8) against
; 0x5050505); identical code shape on SI and VI here. The 3-byte result needs
; a short store plus a byte store at offset 2.
  %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
  store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
  ret void
}
1197
define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0xa
; SI-NEXT:    s_load_dword s4, s[4:5], 0x13
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s4, s4, 3
; SI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; SI-NEXT:    s_andn2_b32 s5, s6, s4
; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
; SI-NEXT:    s_or_b32 s4, s4, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x28
; VI-NEXT:    s_load_dword s4, s[4:5], 0x4c
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s4, s4, 3
; VI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; VI-NEXT:    s_andn2_b32 s5, s6, s4
; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
; VI-NEXT:    s_or_b32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Same mask-and-blend as the v3i8 case, but the four bytes exactly fill one
; dword, so a single buffer_store_dword writes the result on both targets.
  %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
  store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
  ret void
}
1236
define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
; SI-LABEL: s_dynamic_insertelement_v8i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[4:5], s[10:11], 0x0
; SI-NEXT:    s_mov_b32 s0, s8
; SI-NEXT:    s_lshl_b32 s8, s6, 3
; SI-NEXT:    s_mov_b64 s[6:7], 0xffff
; SI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
; SI-NEXT:    s_mov_b32 s8, 0x5050505
; SI-NEXT:    s_mov_b32 s1, s9
; SI-NEXT:    s_and_b32 s9, s7, s8
; SI-NEXT:    s_and_b32 s8, s6, s8
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
; SI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_dynamic_insertelement_v8i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[4:5], s[10:11], 0x0
; VI-NEXT:    s_mov_b32 s0, s8
; VI-NEXT:    s_lshl_b32 s8, s6, 3
; VI-NEXT:    s_mov_b64 s[6:7], 0xffff
; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
; VI-NEXT:    s_mov_b32 s8, 0x5050505
; VI-NEXT:    s_mov_b32 s1, s9
; VI-NEXT:    s_and_b32 s9, s7, s8
; VI-NEXT:    s_and_b32 s8, s6, s8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
; VI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Unlike the other v8i8 tests, the vector comes from a constant-address-space
; (addrspace(4)) pointer, so it is fetched with an s_load_dwordx2 through the
; loaded pointer and blended entirely in scalar registers with the 64-bit
; mask/andn2/or sequence before a single dwordx2 store.
  %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4
  %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
  store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
  ret void
}
1290
define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s5, s11, 24
; SI-NEXT:    s_cmp_lg_u32 s4, 15
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_lshr_b32 s5, s11, 16
; SI-NEXT:    s_cmp_lg_u32 s4, 14
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_lshr_b32 s6, s11, 8
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    s_movk_i32 s5, 0xff
; SI-NEXT:    s_cmp_lg_u32 s4, 13
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v1, s5, v1
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_mov_b32_e32 v1, s6
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_lg_u32 s4, 12
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v2, s11
; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT:    v_and_b32_e32 v2, s5, v2
; SI-NEXT:    v_or_b32_e32 v1, v2, v1
; SI-NEXT:    s_mov_b32 s6, 0xffff
; SI-NEXT:    s_lshr_b32 s7, s10, 24
; SI-NEXT:    s_cmp_lg_u32 s4, 11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v1, s6, v1
; SI-NEXT:    v_or_b32_e32 v3, v1, v0
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_lshr_b32 s7, s10, 16
; SI-NEXT:    s_cmp_lg_u32 s4, 10
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    s_lshr_b32 s7, s10, 8
; SI-NEXT:    s_cmp_lg_u32 s4, 9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v1, s5, v1
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    s_cmp_lg_u32 s4, 8
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT:    v_and_b32_e32 v2, s5, v2
; SI-NEXT:    v_or_b32_e32 v1, v2, v1
; SI-NEXT:    s_lshr_b32 s7, s9, 24
; SI-NEXT:    s_cmp_lg_u32 s4, 7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v1, s6, v1
; SI-NEXT:    v_or_b32_e32 v2, v1, v0
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_lshr_b32 s7, s9, 16
; SI-NEXT:    s_cmp_lg_u32 s4, 6
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    s_lshr_b32 s7, s9, 8
; SI-NEXT:    s_cmp_lg_u32 s4, 5
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v1, s5, v1
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    s_cmp_lg_u32 s4, 4
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v4, s9
; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT:    v_and_b32_e32 v4, s5, v4
; SI-NEXT:    v_or_b32_e32 v1, v4, v1
; SI-NEXT:    s_lshr_b32 s7, s8, 24
; SI-NEXT:    s_cmp_lg_u32 s4, 3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v1, s6, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v0
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_lshr_b32 s7, s8, 16
; SI-NEXT:    s_cmp_lg_u32 s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s7
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT:    s_lshr_b32 s7, s8, 8
; SI-NEXT:    s_cmp_lg_u32 s4, 1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v4, s5, v4
; SI-NEXT:    v_or_b32_e32 v0, v4, v0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v4, s7
; SI-NEXT:    s_cmp_lg_u32 s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s8
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT:    v_and_b32_e32 v5, s5, v5
; SI-NEXT:    v_or_b32_e32 v4, v5, v4
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v4, s6, v4
; SI-NEXT:    v_or_b32_e32 v0, v4, v0
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v16i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s5, s11, 24
; VI-NEXT:    s_cmp_lg_u32 s4, 15
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_lshr_b32 s5, s11, 16
; VI-NEXT:    s_cmp_lg_u32 s4, 14
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_lshr_b32 s5, s11, 8
; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    s_cmp_lg_u32 s4, 13
; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_cmp_lg_u32 s4, 12
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_mov_b32_e32 v2, s11
; VI-NEXT:    s_lshr_b32 s5, s10, 24
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    s_cmp_lg_u32 s4, 11
; VI-NEXT:    v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_lshr_b32 s5, s10, 16
; VI-NEXT:    s_cmp_lg_u32 s4, 10
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_lshr_b32 s5, s10, 8
; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    s_cmp_lg_u32 s4, 9
; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_cmp_lg_u32 s4, 8
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    s_lshr_b32 s5, s9, 24
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    s_cmp_lg_u32 s4, 7
; VI-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_lshr_b32 s5, s9, 16
; VI-NEXT:    s_cmp_lg_u32 s4, 6
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_lshr_b32 s5, s9, 8
; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    s_cmp_lg_u32 s4, 5
; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_cmp_lg_u32 s4, 4
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_mov_b32_e32 v4, s9
; VI-NEXT:    s_lshr_b32 s5, s8, 24
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    s_cmp_lg_u32 s4, 3
; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_lshr_b32 s5, s8, 16
; VI-NEXT:    s_cmp_lg_u32 s4, 2
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s5
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_lshr_b32 s5, s8, 8
; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT:    s_cmp_lg_u32 s4, 1
; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_mov_b32_e32 v4, s5
; VI-NEXT:    s_cmp_lg_u32 s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s8
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
; VI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
; VI-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; Worst case for the current lowering (see the FIXME at the top of the file):
; the <16 x i8> is fully scalarized. Each byte is extracted with s_lshr,
; selected against constant 5 via s_cmp_lg_u32 + v_cndmask per index, then the
; bytes are repacked into dwords — SI with shift/and/or chains, VI with SDWA
; v_or_b32_sdwa byte/word selects — before a single dwordx4 store.
  %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
  store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
  ret void
}
1527
1528; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
1529; the compiler doesn't crash.
define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
; SI-LABEL: insert_split_bb:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_lg_u32 s6, 0
; SI-NEXT:    s_cbranch_scc0 BB30_2
; SI-NEXT:  ; %bb.1: ; %else
; SI-NEXT:    s_load_dword s7, s[2:3], 0x1
; SI-NEXT:    s_mov_b64 s[4:5], 0
; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 vcc, vcc
; SI-NEXT:    s_cbranch_vccz BB30_3
; SI-NEXT:    s_branch BB30_4
; SI-NEXT:  BB30_2:
; SI-NEXT:  BB30_3: ; %if
; SI-NEXT:    s_load_dword s7, s[2:3], 0x0
; SI-NEXT:  BB30_4: ; %endif
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insert_split_bb:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s6, 0
; VI-NEXT:    s_cbranch_scc0 BB30_2
; VI-NEXT:  ; %bb.1: ; %else
; VI-NEXT:    s_load_dword s7, s[2:3], 0x4
; VI-NEXT:    s_cbranch_execz BB30_3
; VI-NEXT:    s_branch BB30_4
; VI-NEXT:  BB30_2:
; VI-NEXT:  BB30_3: ; %if
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s7, s[2:3], 0x0
; VI-NEXT:  BB30_4: ; %endif
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
entry:
  ; Lane 0 of the result vector is %a on both paths; only lane 1 differs
  ; between the two successor blocks.
  %0 = insertelement <2 x i32> undef, i32 %a, i32 0
  %1 = icmp eq i32 %a, 0
  br i1 %1, label %if, label %else

if:
  ; %a == 0: lane 1 comes from in[0].
  %2 = load i32, i32 addrspace(1)* %in
  %3 = insertelement <2 x i32> %0, i32 %2, i32 1
  br label %endif

else:
  ; %a != 0: lane 1 comes from in[1].
  %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %5 = load i32, i32 addrspace(1)* %4
  %6 = insertelement <2 x i32> %0, i32 %5, i32 1
  br label %endif

endif:
  ; Merge the two inserted vectors across the control-flow join and store.
  %7 = phi <2 x i32> [%3, %if], [%6, %else]
  store <2 x i32> %7, <2 x i32> addrspace(1)* %out
  ret void
}
1602
; Dynamic (variable-index) insert of the constant 8.0 into a <2 x double>.
; The [8 x i32] padding arguments push the vector and index to the kernarg
; offsets checked below; the SI and VI offsets differ (0xc/0x18 vs 0x30/0x60)
; because the two targets print SMRD immediate offsets in different units.
define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xc
; SI-NEXT:    s_load_dword s4, s[4:5], 0x18
; SI-NEXT:    v_mov_b32_e32 v1, 0x40200000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    s_cmp_eq_u32 s4, 1
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    s_cmp_eq_u32 s4, 0
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x30
; VI-NEXT:    s_load_dword s4, s[4:5], 0x60
; VI-NEXT:    v_mov_b32_e32 v1, 0x40200000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    s_cmp_eq_u32 s4, 1
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    s_cmp_eq_u32 s4, 0
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  ; 8.0 is selected per 32-bit half: hi dword 0x40200000, lo dword 0.
  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
  ret void
}
1655
; Dynamic insert of i64 5 into a <2 x i64>, stored with only align 8.
; Each 64-bit element is lowered as a pair of 32-bit selects: the matching
; element's lo dword becomes 5 and its hi dword becomes 0.
define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s6, 1
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    s_cmp_eq_u32 s6, 0
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x20
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s6, 1
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    s_cmp_eq_u32 s6, 0
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
  ret void
}
1706
; Dynamic insert of i64 5 into a <3 x i64>. The 24-byte result is written
; as a dwordx4 store for elements 0-1 plus a dwordx2 store at offset:16 for
; element 2, each element again handled as a lo=5 / hi=0 pair of selects.
define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xc
; SI-NEXT:    s_load_dword s12, s[4:5], 0x10
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_mov_b32_e32 v4, s7
; SI-NEXT:    s_cmp_eq_u32 s12, 1
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    s_cmp_eq_u32 s12, 0
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_cmp_eq_u32 s12, 2
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v5, v4, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v4, s6
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 5, s[4:5]
; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x30
; VI-NEXT:    s_load_dword s12, s[4:5], 0x40
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    s_cmp_eq_u32 s12, 1
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    s_cmp_eq_u32 s12, 0
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    s_cmp_eq_u32 s12, 2
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v4, s7
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v5, v4, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v4, s6
; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 5, s[4:5]
; VI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
  store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
  ret void
}
1773
; Dynamic insert of 8.0 into a <4 x double>. Still lowered as a per-element
; cndmask chain (compare index against 0..3); the double splits into hi
; dword 0x40200000 (kept in v4) and lo dword 0. Result is written as two
; dwordx4 stores.
define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
; SI-NEXT:    v_mov_b32_e32 v4, 0x40200000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    s_cmp_eq_u32 s4, 1
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    s_cmp_eq_u32 s4, 0
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_cmp_eq_u32 s4, 3
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v5, s15
; SI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s14
; SI-NEXT:    s_cmp_eq_u32 s4, 2
; SI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
; VI-NEXT:    v_mov_b32_e32 v4, 0x40200000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    s_cmp_eq_u32 s4, 1
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    s_cmp_eq_u32 s4, 0
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    s_cmp_eq_u32 s4, 3
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_mov_b32_e32 v5, s15
; VI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s14
; VI-NEXT:    s_cmp_eq_u32 s4, 2
; VI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
  ret void
}
1852
; Dynamic insert of 8.0 into a <8 x double>. At 8 elements the backend
; switches from cndmask chains to an indexed register write: the index is
; doubled (s_lshl_b32 s4, s4, 1) because each double spans two VGPRs, put
; in m0, and v_movreld writes lo dword 0 and hi dword 0x40200000.
define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
; SI-LABEL: dynamic_insertelement_v8f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
; SI-NEXT:    v_mov_b32_e32 v16, 0x40200000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_lshl_b32 s4, s4, 1
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_mov_b32_e32 v3, s11
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_mov_b32_e32 v6, s14
; SI-NEXT:    v_mov_b32_e32 v7, s15
; SI-NEXT:    v_mov_b32_e32 v8, s16
; SI-NEXT:    v_mov_b32_e32 v9, s17
; SI-NEXT:    v_mov_b32_e32 v10, s18
; SI-NEXT:    v_mov_b32_e32 v11, s19
; SI-NEXT:    v_mov_b32_e32 v12, s20
; SI-NEXT:    v_mov_b32_e32 v13, s21
; SI-NEXT:    v_mov_b32_e32 v14, s22
; SI-NEXT:    v_mov_b32_e32 v15, s23
; SI-NEXT:    s_mov_b32 m0, s4
; SI-NEXT:    v_movreld_b32_e32 v0, 0
; SI-NEXT:    v_movreld_b32_e32 v1, v16
; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
; VI-NEXT:    v_mov_b32_e32 v16, 0x40200000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    s_lshl_b32 s4, s4, 1
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_mov_b32_e32 v6, s14
; VI-NEXT:    v_mov_b32_e32 v7, s15
; VI-NEXT:    v_mov_b32_e32 v8, s16
; VI-NEXT:    v_mov_b32_e32 v9, s17
; VI-NEXT:    v_mov_b32_e32 v10, s18
; VI-NEXT:    v_mov_b32_e32 v11, s19
; VI-NEXT:    v_mov_b32_e32 v12, s20
; VI-NEXT:    v_mov_b32_e32 v13, s21
; VI-NEXT:    v_mov_b32_e32 v14, s22
; VI-NEXT:    v_mov_b32_e32 v15, s23
; VI-NEXT:    s_mov_b32 m0, s4
; VI-NEXT:    v_movreld_b32_e32 v0, 0
; VI-NEXT:    v_movreld_b32_e32 v1, v16
; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
  ret void
}
1927
1928declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
1929
1930attributes #0 = { nounwind }
1931attributes #1 = { nounwind readnone }
1932