1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CIVI,VI %s
4; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CIVI,CI %s
5
; Loads a <2 x i16> through %vec.ptr (addrspace(4)), replaces element 0 with the
; constant 999 (printed as 0x3e7 in the checks) and stores the result to %out.
; The gfx900 run expects one s_pack_lh_b32_b16; the fiji and hawaii runs share
; one set of checks that mask with 0xffff0000 and then OR in the constant.
6define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
7; GFX9-LABEL: s_insertelement_v2i16_0:
8; GFX9:       ; %bb.0:
9; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
10; GFX9-NEXT:    v_mov_b32_e32 v0, 0
11; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
12; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
13; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX9-NEXT:    s_pack_lh_b32_b16 s2, 0x3e7, s2
15; GFX9-NEXT:    v_mov_b32_e32 v1, s2
16; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
17; GFX9-NEXT:    s_endpgm
18;
19; CIVI-LABEL: s_insertelement_v2i16_0:
20; CIVI:       ; %bb.0:
21; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
22; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
23; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
24; CIVI-NEXT:    v_mov_b32_e32 v0, s0
25; CIVI-NEXT:    v_mov_b32_e32 v1, s1
26; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
27; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff0000
28; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e7
29; CIVI-NEXT:    v_mov_b32_e32 v2, s0
30; CIVI-NEXT:    flat_store_dword v[0:1], v2
31; CIVI-NEXT:    s_endpgm
32  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
33  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
34  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
35  ret void
36}
37
38
; Same shape as the constant-insert test, but element 0 is replaced with the
; i16 kernel argument %elt instead of a constant. An unnamed [8 x i32] argument
; sits between %vec.ptr and %elt (presumably padding so that %elt is fetched by
; its own scalar load, as the checks show -- confirm against the test's intent).
; The gfx900 run expects s_pack_lh_b32_b16; the fiji and hawaii runs expect the
; low half of %elt and the high half of the vector to be masked and OR'd.
39define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
40; GFX9-LABEL: s_insertelement_v2i16_0_reg:
41; GFX9:       ; %bb.0:
42; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
43; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
44; GFX9-NEXT:    v_mov_b32_e32 v0, 0
45; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
47; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
48; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s6, s2
49; GFX9-NEXT:    v_mov_b32_e32 v1, s2
50; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
51; GFX9-NEXT:    s_endpgm
52;
53; VI-LABEL: s_insertelement_v2i16_0_reg:
54; VI:       ; %bb.0:
55; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
56; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
57; VI-NEXT:    s_waitcnt lgkmcnt(0)
58; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
59; VI-NEXT:    v_mov_b32_e32 v0, s0
60; VI-NEXT:    v_mov_b32_e32 v1, s1
61; VI-NEXT:    s_and_b32 s0, s4, 0xffff
62; VI-NEXT:    s_waitcnt lgkmcnt(0)
63; VI-NEXT:    s_and_b32 s1, s2, 0xffff0000
64; VI-NEXT:    s_or_b32 s0, s0, s1
65; VI-NEXT:    v_mov_b32_e32 v2, s0
66; VI-NEXT:    flat_store_dword v[0:1], v2
67; VI-NEXT:    s_endpgm
68;
69; CI-LABEL: s_insertelement_v2i16_0_reg:
70; CI:       ; %bb.0:
71; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
72; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
73; CI-NEXT:    s_waitcnt lgkmcnt(0)
74; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
75; CI-NEXT:    v_mov_b32_e32 v0, s0
76; CI-NEXT:    v_mov_b32_e32 v1, s1
77; CI-NEXT:    s_and_b32 s1, s4, 0xffff
78; CI-NEXT:    s_waitcnt lgkmcnt(0)
79; CI-NEXT:    s_and_b32 s0, s2, 0xffff0000
80; CI-NEXT:    s_or_b32 s0, s1, s0
81; CI-NEXT:    v_mov_b32_e32 v2, s0
82; CI-NEXT:    flat_store_dword v[0:1], v2
83; CI-NEXT:    s_endpgm
84  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
85  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
86  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
87  ret void
88}
89
; Inserts the i16 argument %elt into element 0 while the original element 1 of
; the loaded vector has a second use: it is extracted, zero-extended and handed
; to an inline-asm statement with an "s" constraint. With that extra use the
; gfx900 checks expect an explicit s_lshr_b32 by 16 followed by
; s_pack_ll_b32_b16, and the shifted register is the one the asm consumes.
90define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
91; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
92; GFX9:       ; %bb.0:
93; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
94; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
95; GFX9-NEXT:    v_mov_b32_e32 v0, 0
96; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
98; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
99; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
100; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s6, s2
101; GFX9-NEXT:    v_mov_b32_e32 v1, s3
102; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
103; GFX9-NEXT:    ;;#ASMSTART
104; GFX9-NEXT:    ; use s2
105; GFX9-NEXT:    ;;#ASMEND
106; GFX9-NEXT:    s_endpgm
107;
108; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
109; VI:       ; %bb.0:
110; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
111; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
112; VI-NEXT:    s_waitcnt lgkmcnt(0)
113; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
114; VI-NEXT:    v_mov_b32_e32 v0, s0
115; VI-NEXT:    v_mov_b32_e32 v1, s1
116; VI-NEXT:    s_and_b32 s0, s4, 0xffff
117; VI-NEXT:    s_waitcnt lgkmcnt(0)
118; VI-NEXT:    s_lshr_b32 s1, s2, 16
119; VI-NEXT:    s_and_b32 s2, s2, 0xffff0000
120; VI-NEXT:    s_or_b32 s0, s0, s2
121; VI-NEXT:    v_mov_b32_e32 v2, s0
122; VI-NEXT:    flat_store_dword v[0:1], v2
123; VI-NEXT:    ;;#ASMSTART
124; VI-NEXT:    ; use s1
125; VI-NEXT:    ;;#ASMEND
126; VI-NEXT:    s_endpgm
127;
128; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
129; CI:       ; %bb.0:
130; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
131; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
132; CI-NEXT:    s_waitcnt lgkmcnt(0)
133; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
134; CI-NEXT:    v_mov_b32_e32 v1, s1
135; CI-NEXT:    v_mov_b32_e32 v0, s0
136; CI-NEXT:    s_and_b32 s0, s4, 0xffff
137; CI-NEXT:    s_waitcnt lgkmcnt(0)
138; CI-NEXT:    s_lshr_b32 s1, s2, 16
139; CI-NEXT:    s_lshl_b32 s2, s1, 16
140; CI-NEXT:    s_or_b32 s0, s0, s2
141; CI-NEXT:    v_mov_b32_e32 v2, s0
142; CI-NEXT:    flat_store_dword v[0:1], v2
143; CI-NEXT:    ;;#ASMSTART
144; CI-NEXT:    ; use s1
145; CI-NEXT:    ;;#ASMEND
146; CI-NEXT:    s_endpgm
147  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  ; Second use of the vector's high element, independent of the insert below.
148  %elt1 = extractelement <2 x i16> %vec, i32 1
149  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
150  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
151  %use1 = zext i16 %elt1 to i32
  ; The "s" constraint asks for the zero-extended high element in a scalar register.
152  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
153  ret void
154}
155
; Inserts the upper 16 bits of the i32 argument %elt.arg (lshr by 16, then
; trunc to i16) into element 0 of the loaded <2 x i16>. Both halves of the
; result therefore come from the high halves of 32-bit values; the gfx900
; checks expect this to become a single s_pack_hh_b32_b16.
156define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
157; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
158; GFX9:       ; %bb.0:
159; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
160; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
161; GFX9-NEXT:    v_mov_b32_e32 v0, 0
162; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
163; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
164; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
165; GFX9-NEXT:    s_pack_hh_b32_b16 s2, s6, s2
166; GFX9-NEXT:    v_mov_b32_e32 v1, s2
167; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
168; GFX9-NEXT:    s_endpgm
169;
170; VI-LABEL: s_insertelement_v2i16_0_reghi:
171; VI:       ; %bb.0:
172; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
173; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
174; VI-NEXT:    s_waitcnt lgkmcnt(0)
175; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
176; VI-NEXT:    v_mov_b32_e32 v0, s0
177; VI-NEXT:    v_mov_b32_e32 v2, s4
178; VI-NEXT:    v_mov_b32_e32 v1, s1
179; VI-NEXT:    s_waitcnt lgkmcnt(0)
180; VI-NEXT:    s_lshr_b32 s0, s2, 16
181; VI-NEXT:    v_alignbit_b32 v2, s0, v2, 16
182; VI-NEXT:    flat_store_dword v[0:1], v2
183; VI-NEXT:    s_endpgm
184;
185; CI-LABEL: s_insertelement_v2i16_0_reghi:
186; CI:       ; %bb.0:
187; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
188; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
189; CI-NEXT:    s_waitcnt lgkmcnt(0)
190; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
191; CI-NEXT:    v_mov_b32_e32 v0, s0
192; CI-NEXT:    v_mov_b32_e32 v1, s1
193; CI-NEXT:    s_lshr_b32 s1, s4, 16
194; CI-NEXT:    s_waitcnt lgkmcnt(0)
195; CI-NEXT:    s_and_b32 s0, s2, 0xffff0000
196; CI-NEXT:    s_or_b32 s0, s1, s0
197; CI-NEXT:    v_mov_b32_e32 v2, s0
198; CI-NEXT:    flat_store_dword v[0:1], v2
199; CI-NEXT:    s_endpgm
200  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  ; %elt is bits 31..16 of %elt.arg.
201  %elt.hi = lshr i32 %elt.arg, 16
202  %elt = trunc i32 %elt.hi to i16
203  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
204  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
205  ret void
206}
207
; Inserts the upper 16 bits of %elt.arg into element 0, and additionally feeds
; that same 16-bit value (zero-extended) to an inline-asm statement with an "s"
; constraint. Because the shifted value has a second use, the gfx900 checks
; expect an explicit s_lshr_b32 of the argument followed by s_pack_lh_b32_b16,
; with the shifted register also consumed by the asm. Unlike the single-use
; reghi test, this signature has no [8 x i32] argument before %elt.arg.
208define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
209; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
210; GFX9:       ; %bb.0:
211; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
212; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
213; GFX9-NEXT:    v_mov_b32_e32 v0, 0
214; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
215; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
216; GFX9-NEXT:    s_lshr_b32 s3, s6, 16
217; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
218; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s3, s2
219; GFX9-NEXT:    v_mov_b32_e32 v1, s2
220; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
221; GFX9-NEXT:    ;;#ASMSTART
222; GFX9-NEXT:    ; use s3
223; GFX9-NEXT:    ;;#ASMEND
224; GFX9-NEXT:    s_endpgm
225;
226; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
227; VI:       ; %bb.0:
228; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
229; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
230; VI-NEXT:    s_waitcnt lgkmcnt(0)
231; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
232; VI-NEXT:    v_mov_b32_e32 v1, s1
233; VI-NEXT:    v_mov_b32_e32 v2, s4
234; VI-NEXT:    v_mov_b32_e32 v0, s0
235; VI-NEXT:    s_lshr_b32 s0, s4, 16
236; VI-NEXT:    s_waitcnt lgkmcnt(0)
237; VI-NEXT:    s_lshr_b32 s1, s2, 16
238; VI-NEXT:    v_alignbit_b32 v2, s1, v2, 16
239; VI-NEXT:    flat_store_dword v[0:1], v2
240; VI-NEXT:    ;;#ASMSTART
241; VI-NEXT:    ; use s0
242; VI-NEXT:    ;;#ASMEND
243; VI-NEXT:    s_endpgm
244;
245; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
246; CI:       ; %bb.0:
247; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
248; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
249; CI-NEXT:    s_waitcnt lgkmcnt(0)
250; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
251; CI-NEXT:    v_mov_b32_e32 v0, s0
252; CI-NEXT:    v_mov_b32_e32 v1, s1
253; CI-NEXT:    s_lshr_b32 s0, s4, 16
254; CI-NEXT:    s_waitcnt lgkmcnt(0)
255; CI-NEXT:    s_and_b32 s1, s2, 0xffff0000
256; CI-NEXT:    s_or_b32 s1, s0, s1
257; CI-NEXT:    v_mov_b32_e32 v2, s1
258; CI-NEXT:    flat_store_dword v[0:1], v2
259; CI-NEXT:    ;;#ASMSTART
260; CI-NEXT:    ; use s0
261; CI-NEXT:    ;;#ASMEND
262; CI-NEXT:    s_endpgm
263  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  ; %elt is bits 31..16 of %elt.arg.
264  %elt.hi = lshr i32 %elt.arg, 16
265  %elt = trunc i32 %elt.hi to i16
266  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
267  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ; Second use of the inserted value, requested in a scalar register.
268  %use1 = zext i16 %elt to i32
269  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
270  ret void
271}
272
; Inserts the upper 16 bits of %elt.arg into element 0, where both the inserted
; value and the vector's original element 1 have an extra use: each is
; zero-extended and passed to its own inline-asm statement with an "s"
; constraint. The gfx900 checks expect both values to be shifted down with
; s_lshr_b32 and combined with s_pack_ll_b32_b16; the fiji and hawaii checks
; expect two scalar shifts plus v_alignbit_b32.
273define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
274; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
275; GFX9:       ; %bb.0:
276; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
277; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
278; GFX9-NEXT:    v_mov_b32_e32 v0, 0
279; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
280; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
281; GFX9-NEXT:    s_lshr_b32 s3, s6, 16
282; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
283; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
284; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s3, s2
285; GFX9-NEXT:    v_mov_b32_e32 v1, s4
286; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
287; GFX9-NEXT:    ;;#ASMSTART
288; GFX9-NEXT:    ; use s3
289; GFX9-NEXT:    ;;#ASMEND
290; GFX9-NEXT:    ;;#ASMSTART
291; GFX9-NEXT:    ; use s2
292; GFX9-NEXT:    ;;#ASMEND
293; GFX9-NEXT:    s_endpgm
294;
295; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
296; VI:       ; %bb.0:
297; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
298; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
299; VI-NEXT:    s_waitcnt lgkmcnt(0)
300; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
301; VI-NEXT:    v_mov_b32_e32 v1, s1
302; VI-NEXT:    v_mov_b32_e32 v2, s4
303; VI-NEXT:    v_mov_b32_e32 v0, s0
304; VI-NEXT:    s_lshr_b32 s0, s4, 16
305; VI-NEXT:    s_waitcnt lgkmcnt(0)
306; VI-NEXT:    s_lshr_b32 s1, s2, 16
307; VI-NEXT:    v_alignbit_b32 v2, s1, v2, 16
308; VI-NEXT:    flat_store_dword v[0:1], v2
309; VI-NEXT:    ;;#ASMSTART
310; VI-NEXT:    ; use s0
311; VI-NEXT:    ;;#ASMEND
312; VI-NEXT:    ;;#ASMSTART
313; VI-NEXT:    ; use s1
314; VI-NEXT:    ;;#ASMEND
315; VI-NEXT:    s_endpgm
316;
317; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
318; CI:       ; %bb.0:
319; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
320; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
321; CI-NEXT:    s_waitcnt lgkmcnt(0)
322; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
323; CI-NEXT:    v_mov_b32_e32 v1, s1
324; CI-NEXT:    v_mov_b32_e32 v2, s4
325; CI-NEXT:    v_mov_b32_e32 v0, s0
326; CI-NEXT:    s_lshr_b32 s0, s4, 16
327; CI-NEXT:    s_waitcnt lgkmcnt(0)
328; CI-NEXT:    s_lshr_b32 s1, s2, 16
329; CI-NEXT:    v_alignbit_b32 v2, s1, v2, 16
330; CI-NEXT:    flat_store_dword v[0:1], v2
331; CI-NEXT:    ;;#ASMSTART
332; CI-NEXT:    ; use s0
333; CI-NEXT:    ;;#ASMEND
334; CI-NEXT:    ;;#ASMSTART
335; CI-NEXT:    ; use s1
336; CI-NEXT:    ;;#ASMEND
337; CI-NEXT:    s_endpgm
338  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  ; %elt is bits 31..16 of %elt.arg.
339  %elt.hi = lshr i32 %elt.arg, 16
340  %elt = trunc i32 %elt.hi to i16
  ; Keep the vector's original high element alive for a separate use below.
341  %vec.hi = extractelement <2 x i16> %vec, i32 1
342  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
343  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
344  %use1 = zext i16 %elt to i32
345  %vec.hi.use1 = zext i16 %vec.hi to i32
346
  ; First asm consumes the inserted value, second the original high element.
347  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
348  call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
349  ret void
350}
351
; Loads a <2 x i16> through %vec.ptr (addrspace(4)), replaces element 1 with the
; constant 999 and stores the result to %out. The gfx900 checks expect
; s_pack_ll_b32_b16 with the literal 0x3e7 as the high operand; the shared
; fiji/hawaii checks expect the low half to be kept with a 0xffff mask and the
; pre-shifted constant 0x3e70000 to be OR'd in.
352define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
353; GFX9-LABEL: s_insertelement_v2i16_1:
354; GFX9:       ; %bb.0:
355; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
356; GFX9-NEXT:    v_mov_b32_e32 v0, 0
357; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
358; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
359; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x3e7
361; GFX9-NEXT:    v_mov_b32_e32 v1, s2
362; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
363; GFX9-NEXT:    s_endpgm
364;
365; CIVI-LABEL: s_insertelement_v2i16_1:
366; CIVI:       ; %bb.0:
367; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
368; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
369; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
370; CIVI-NEXT:    v_mov_b32_e32 v0, s0
371; CIVI-NEXT:    v_mov_b32_e32 v1, s1
372; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
373; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff
374; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e70000
375; CIVI-NEXT:    v_mov_b32_e32 v2, s0
376; CIVI-NEXT:    flat_store_dword v[0:1], v2
377; CIVI-NEXT:    s_endpgm
378  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
379  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
380  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
381  ret void
382}
383
; Replaces element 1 of the loaded <2 x i16> with the i16 kernel argument %elt
; and stores the result to %out. An unnamed [8 x i32] argument precedes %elt in
; the signature. The gfx900 checks expect s_pack_ll_b32_b16 with %elt as the
; high operand; the fiji and hawaii checks expect %elt shifted left by 16 and
; OR'd with the vector's low 16 bits.
384define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
385; GFX9-LABEL: s_insertelement_v2i16_1_reg:
386; GFX9:       ; %bb.0:
387; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
388; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
389; GFX9-NEXT:    v_mov_b32_e32 v0, 0
390; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
391; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
392; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
393; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s6
394; GFX9-NEXT:    v_mov_b32_e32 v1, s2
395; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
396; GFX9-NEXT:    s_endpgm
397;
398; VI-LABEL: s_insertelement_v2i16_1_reg:
399; VI:       ; %bb.0:
400; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
401; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
402; VI-NEXT:    s_waitcnt lgkmcnt(0)
403; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
404; VI-NEXT:    v_mov_b32_e32 v0, s0
405; VI-NEXT:    v_mov_b32_e32 v1, s1
406; VI-NEXT:    s_lshl_b32 s0, s4, 16
407; VI-NEXT:    s_waitcnt lgkmcnt(0)
408; VI-NEXT:    s_and_b32 s1, s2, 0xffff
409; VI-NEXT:    s_or_b32 s0, s1, s0
410; VI-NEXT:    v_mov_b32_e32 v2, s0
411; VI-NEXT:    flat_store_dword v[0:1], v2
412; VI-NEXT:    s_endpgm
413;
414; CI-LABEL: s_insertelement_v2i16_1_reg:
415; CI:       ; %bb.0:
416; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
417; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
418; CI-NEXT:    s_waitcnt lgkmcnt(0)
419; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
420; CI-NEXT:    v_mov_b32_e32 v0, s0
421; CI-NEXT:    v_mov_b32_e32 v1, s1
422; CI-NEXT:    s_lshl_b32 s1, s4, 16
423; CI-NEXT:    s_waitcnt lgkmcnt(0)
424; CI-NEXT:    s_and_b32 s0, s2, 0xffff
425; CI-NEXT:    s_or_b32 s0, s0, s1
426; CI-NEXT:    v_mov_b32_e32 v2, s0
427; CI-NEXT:    flat_store_dword v[0:1], v2
428; CI-NEXT:    s_endpgm
429  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
430  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
431  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
432  ret void
433}
434
; Half-precision variant: replaces element 0 of a <2 x half> loaded through
; %vec.ptr (addrspace(4)) with the constant 5.0, which the checks print as the
; 16-bit pattern 0x4500. The gfx900 checks expect an s_lshr_b32 by 16 followed
; by s_pack_ll_b32_b16; the shared fiji/hawaii checks expect an AND with
; 0xffff0000 and an OR with 0x4500.
435define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
436; GFX9-LABEL: s_insertelement_v2f16_0:
437; GFX9:       ; %bb.0:
438; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
439; GFX9-NEXT:    v_mov_b32_e32 v0, 0
440; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
441; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
442; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
443; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
444; GFX9-NEXT:    s_pack_ll_b32_b16 s2, 0x4500, s2
445; GFX9-NEXT:    v_mov_b32_e32 v1, s2
446; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
447; GFX9-NEXT:    s_endpgm
448;
449; CIVI-LABEL: s_insertelement_v2f16_0:
450; CIVI:       ; %bb.0:
451; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
452; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
453; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
454; CIVI-NEXT:    v_mov_b32_e32 v0, s0
455; CIVI-NEXT:    v_mov_b32_e32 v1, s1
456; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
457; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff0000
458; CIVI-NEXT:    s_or_b32 s0, s0, 0x4500
459; CIVI-NEXT:    v_mov_b32_e32 v2, s0
460; CIVI-NEXT:    flat_store_dword v[0:1], v2
461; CIVI-NEXT:    s_endpgm
462  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
463  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
464  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
465  ret void
466}
467
; Half-precision variant: replaces element 1 of a <2 x half> loaded through
; %vec.ptr (addrspace(4)) with the constant 5.0 (0x4500 in the checks). The
; gfx900 checks expect s_pack_ll_b32_b16 with 0x4500 as the high operand; the
; shared fiji/hawaii checks expect the low half masked with 0xffff and the
; pre-shifted constant 0x45000000 OR'd in.
468define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
469; GFX9-LABEL: s_insertelement_v2f16_1:
470; GFX9:       ; %bb.0:
471; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
472; GFX9-NEXT:    v_mov_b32_e32 v0, 0
473; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
474; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
475; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
476; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x4500
477; GFX9-NEXT:    v_mov_b32_e32 v1, s2
478; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
479; GFX9-NEXT:    s_endpgm
480;
481; CIVI-LABEL: s_insertelement_v2f16_1:
482; CIVI:       ; %bb.0:
483; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
484; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
485; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
486; CIVI-NEXT:    v_mov_b32_e32 v0, s0
487; CIVI-NEXT:    v_mov_b32_e32 v1, s1
488; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
489; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff
490; CIVI-NEXT:    s_or_b32 s0, s0, 0x45000000
491; CIVI-NEXT:    v_mov_b32_e32 v2, s0
492; CIVI-NEXT:    flat_store_dword v[0:1], v2
493; CIVI-NEXT:    s_endpgm
494  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
495  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
496  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
497  ret void
498}
499
; Per-workitem variant: the workitem id (x) indexes both %in and %out, so the
; <2 x i16> is loaded from and stored to addrspace(1) at a thread-dependent
; address. Element 0 is replaced with the constant 999 (0x3e7). The checks
; expect vector instructions here: v_bfi_b32 with a 0xffff mask on gfx900, and
; v_and_b32 with 0xffff0000 followed by v_or_b32 on fiji and hawaii.
500define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
501; GFX9-LABEL: v_insertelement_v2i16_0:
502; GFX9:       ; %bb.0:
503; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
504; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
505; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
506; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
507; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
508; GFX9-NEXT:    s_movk_i32 s2, 0x3e7
509; GFX9-NEXT:    s_waitcnt vmcnt(0)
510; GFX9-NEXT:    v_bfi_b32 v1, v2, s2, v1
511; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
512; GFX9-NEXT:    s_endpgm
513;
514; VI-LABEL: v_insertelement_v2i16_0:
515; VI:       ; %bb.0:
516; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
517; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
518; VI-NEXT:    s_waitcnt lgkmcnt(0)
519; VI-NEXT:    v_mov_b32_e32 v1, s3
520; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
521; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
522; VI-NEXT:    flat_load_dword v3, v[0:1]
523; VI-NEXT:    v_mov_b32_e32 v1, s1
524; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
525; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
526; VI-NEXT:    s_waitcnt vmcnt(0)
527; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
528; VI-NEXT:    v_or_b32_e32 v2, 0x3e7, v2
529; VI-NEXT:    flat_store_dword v[0:1], v2
530; VI-NEXT:    s_endpgm
531;
532; CI-LABEL: v_insertelement_v2i16_0:
533; CI:       ; %bb.0:
534; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
535; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
536; CI-NEXT:    s_waitcnt lgkmcnt(0)
537; CI-NEXT:    v_mov_b32_e32 v1, s3
538; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
539; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
540; CI-NEXT:    flat_load_dword v3, v[0:1]
541; CI-NEXT:    v_mov_b32_e32 v1, s1
542; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
543; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
544; CI-NEXT:    s_waitcnt vmcnt(0)
545; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
546; CI-NEXT:    v_or_b32_e32 v2, 0x3e7, v2
547; CI-NEXT:    flat_store_dword v[0:1], v2
548; CI-NEXT:    s_endpgm
  ; Index the input and output arrays by the workitem id.
549  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
550  %tid.ext = sext i32 %tid to i64
551  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
552  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
553  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
554  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
555  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
556  ret void
557}
558
; Per-workitem variant where the inserted element is the upper 16 bits of the
; i32 kernel argument %elt.arg (lshr by 16, then trunc). The vector is loaded
; from and stored to addrspace(1) at an address indexed by the workitem id, and
; element 0 is replaced. The gfx900 checks expect v_lshrrev_b32 plus
; v_and_or_b32 with a 0xffff0000 mask; the fiji checks expect v_alignbit_b32;
; the hawaii checks expect a scalar shift followed by v_and_b32 and v_or_b32.
559define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
560; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
561; GFX9:       ; %bb.0:
562; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
563; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
564; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
565; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff0000
566; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
567; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
568; GFX9-NEXT:    v_lshrrev_b32_e64 v2, 16, s6
569; GFX9-NEXT:    s_waitcnt vmcnt(0)
570; GFX9-NEXT:    v_and_or_b32 v1, v1, v3, v2
571; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
572; GFX9-NEXT:    s_endpgm
573;
574; VI-LABEL: v_insertelement_v2i16_0_reghi:
575; VI:       ; %bb.0:
576; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
577; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
578; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
579; VI-NEXT:    s_waitcnt lgkmcnt(0)
580; VI-NEXT:    v_mov_b32_e32 v1, s3
581; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
582; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
583; VI-NEXT:    flat_load_dword v3, v[0:1]
584; VI-NEXT:    v_mov_b32_e32 v1, s1
585; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
586; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
587; VI-NEXT:    s_waitcnt vmcnt(0)
588; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
589; VI-NEXT:    v_alignbit_b32 v2, v2, s4, 16
590; VI-NEXT:    flat_store_dword v[0:1], v2
591; VI-NEXT:    s_endpgm
592;
593; CI-LABEL: v_insertelement_v2i16_0_reghi:
594; CI:       ; %bb.0:
595; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
596; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
597; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
598; CI-NEXT:    s_waitcnt lgkmcnt(0)
599; CI-NEXT:    v_mov_b32_e32 v1, s3
600; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
601; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
602; CI-NEXT:    flat_load_dword v3, v[0:1]
603; CI-NEXT:    v_mov_b32_e32 v1, s1
604; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
605; CI-NEXT:    s_lshr_b32 s0, s4, 16
606; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
607; CI-NEXT:    s_waitcnt vmcnt(0)
608; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
609; CI-NEXT:    v_or_b32_e32 v2, s0, v2
610; CI-NEXT:    flat_store_dword v[0:1], v2
611; CI-NEXT:    s_endpgm
  ; Index the input and output arrays by the workitem id.
612  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
613  %tid.ext = sext i32 %tid to i64
614  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
615  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
616  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  ; %elt is bits 31..16 of %elt.arg.
617  %elt.hi = lshr i32 %elt.arg, 16
618  %elt = trunc i32 %elt.hi to i16
619  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
620  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
621  ret void
622}
623
; Per-workitem variant that replaces element 0 with the small constant 53. In
; every run the checks expect 53 to appear directly as an instruction operand
; (in v_bfi_b32 on gfx900, in v_or_b32 on fiji and hawaii), with no separate
; instruction that first moves the constant into a register.
624define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
625; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm:
626; GFX9:       ; %bb.0:
627; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
628; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
629; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
630; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
631; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
632; GFX9-NEXT:    s_waitcnt vmcnt(0)
633; GFX9-NEXT:    v_bfi_b32 v1, v2, 53, v1
634; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
635; GFX9-NEXT:    s_endpgm
636;
637; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
638; VI:       ; %bb.0:
639; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
640; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
641; VI-NEXT:    s_waitcnt lgkmcnt(0)
642; VI-NEXT:    v_mov_b32_e32 v1, s3
643; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
644; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
645; VI-NEXT:    flat_load_dword v3, v[0:1]
646; VI-NEXT:    v_mov_b32_e32 v1, s1
647; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
648; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
649; VI-NEXT:    s_waitcnt vmcnt(0)
650; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
651; VI-NEXT:    v_or_b32_e32 v2, 53, v2
652; VI-NEXT:    flat_store_dword v[0:1], v2
653; VI-NEXT:    s_endpgm
654;
655; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
656; CI:       ; %bb.0:
657; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
658; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
659; CI-NEXT:    s_waitcnt lgkmcnt(0)
660; CI-NEXT:    v_mov_b32_e32 v1, s3
661; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
662; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
663; CI-NEXT:    flat_load_dword v3, v[0:1]
664; CI-NEXT:    v_mov_b32_e32 v1, s1
665; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
666; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
667; CI-NEXT:    s_waitcnt vmcnt(0)
668; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
669; CI-NEXT:    v_or_b32_e32 v2, 53, v2
670; CI-NEXT:    flat_store_dword v[0:1], v2
671; CI-NEXT:    s_endpgm
  ; Index the input and output arrays by the workitem id.
672  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
673  %tid.ext = sext i32 %tid to i64
674  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
675  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
676  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
677  %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
678  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
679  ret void
680}
681
682; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
; Per-workitem variant that replaces element 1 with the constant 999 (0x3e7).
; The gfx900 checks currently expect the constant to be placed in a scalar
; register and combined via v_lshl_or_b32 with a shift of 16, whereas the fiji
; and hawaii checks expect the already-shifted constant 0x3e70000 to be OR'd
; with the low 16 bits of the loaded value.
683define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
684; GFX9-LABEL: v_insertelement_v2i16_1:
685; GFX9:       ; %bb.0:
686; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
687; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
688; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
689; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
690; GFX9-NEXT:    s_movk_i32 s2, 0x3e7
691; GFX9-NEXT:    s_waitcnt vmcnt(0)
692; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
693; GFX9-NEXT:    v_lshl_or_b32 v1, s2, 16, v1
694; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
695; GFX9-NEXT:    s_endpgm
696;
697; VI-LABEL: v_insertelement_v2i16_1:
698; VI:       ; %bb.0:
699; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
700; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
701; VI-NEXT:    s_waitcnt lgkmcnt(0)
702; VI-NEXT:    v_mov_b32_e32 v1, s3
703; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
704; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
705; VI-NEXT:    flat_load_dword v3, v[0:1]
706; VI-NEXT:    v_mov_b32_e32 v1, s1
707; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
708; VI-NEXT:    v_mov_b32_e32 v2, 0x3e70000
709; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
710; VI-NEXT:    s_waitcnt vmcnt(0)
711; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
712; VI-NEXT:    flat_store_dword v[0:1], v2
713; VI-NEXT:    s_endpgm
714;
715; CI-LABEL: v_insertelement_v2i16_1:
716; CI:       ; %bb.0:
717; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
718; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
719; CI-NEXT:    s_waitcnt lgkmcnt(0)
720; CI-NEXT:    v_mov_b32_e32 v1, s3
721; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
722; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
723; CI-NEXT:    flat_load_dword v3, v[0:1]
724; CI-NEXT:    v_mov_b32_e32 v1, s1
725; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
726; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
727; CI-NEXT:    s_waitcnt vmcnt(0)
728; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
729; CI-NEXT:    v_or_b32_e32 v2, 0x3e70000, v2
730; CI-NEXT:    flat_store_dword v[0:1], v2
731; CI-NEXT:    s_endpgm
  ; Index the input and output arrays by the workitem id.
732  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
733  %tid.ext = sext i32 %tid to i64
734  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
735  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
736  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
737  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
738  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
739  ret void
740}
741
; Per-workitem variant that replaces element 1 with the small negative constant
; -15. The gfx900 checks expect -15 to appear directly as the first operand of
; v_lshl_or_b32 (shift 16); the fiji and hawaii checks expect the pre-shifted
; 32-bit pattern 0xfff10000 to be OR'd with the low 16 bits of the loaded value.
742define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
743; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm:
744; GFX9:       ; %bb.0:
745; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
746; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
747; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
748; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
749; GFX9-NEXT:    s_waitcnt vmcnt(0)
750; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
751; GFX9-NEXT:    v_lshl_or_b32 v1, -15, 16, v1
752; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
753; GFX9-NEXT:    s_endpgm
754;
755; VI-LABEL: v_insertelement_v2i16_1_inlineimm:
756; VI:       ; %bb.0:
757; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
758; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
759; VI-NEXT:    s_waitcnt lgkmcnt(0)
760; VI-NEXT:    v_mov_b32_e32 v1, s3
761; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
762; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
763; VI-NEXT:    flat_load_dword v3, v[0:1]
764; VI-NEXT:    v_mov_b32_e32 v1, s1
765; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
766; VI-NEXT:    v_mov_b32_e32 v2, 0xfff10000
767; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
768; VI-NEXT:    s_waitcnt vmcnt(0)
769; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
770; VI-NEXT:    flat_store_dword v[0:1], v2
771; VI-NEXT:    s_endpgm
772;
773; CI-LABEL: v_insertelement_v2i16_1_inlineimm:
774; CI:       ; %bb.0:
775; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
776; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
777; CI-NEXT:    s_waitcnt lgkmcnt(0)
778; CI-NEXT:    v_mov_b32_e32 v1, s3
779; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
780; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
781; CI-NEXT:    flat_load_dword v3, v[0:1]
782; CI-NEXT:    v_mov_b32_e32 v1, s1
783; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
784; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
785; CI-NEXT:    s_waitcnt vmcnt(0)
786; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
787; CI-NEXT:    v_or_b32_e32 v2, 0xfff10000, v2
788; CI-NEXT:    flat_store_dword v[0:1], v2
789; CI-NEXT:    s_endpgm
  ; Index the input and output arrays by the workitem id.
790  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
791  %tid.ext = sext i32 %tid to i64
792  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
793  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
794  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
795  %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1
796  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
797  ret void
798}
799
; Each work-item (indexed by llvm.amdgcn.workitem.id.x) loads a <2 x half>
; from %in, replaces element 0 with half 5.0 (bit pattern 0x4500) and stores
; the result to the same index of %out.
; Expected code per the generated checks below: gfx900 shifts the preserved
; high half down and recombines it with a VGPR holding 0x4500 through
; v_lshl_or_b32; fiji and hawaii both clear the low half with AND 0xffff0000
; and OR in 0x4500.
800define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
801; GFX9-LABEL: v_insertelement_v2f16_0:
802; GFX9:       ; %bb.0:
803; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
804; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
805; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4500
806; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
807; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
808; GFX9-NEXT:    s_waitcnt vmcnt(0)
809; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
810; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
811; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
812; GFX9-NEXT:    s_endpgm
813;
814; VI-LABEL: v_insertelement_v2f16_0:
815; VI:       ; %bb.0:
816; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
817; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
818; VI-NEXT:    s_waitcnt lgkmcnt(0)
819; VI-NEXT:    v_mov_b32_e32 v1, s3
820; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
821; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
822; VI-NEXT:    flat_load_dword v3, v[0:1]
823; VI-NEXT:    v_mov_b32_e32 v1, s1
824; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
825; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
826; VI-NEXT:    s_waitcnt vmcnt(0)
827; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
828; VI-NEXT:    v_or_b32_e32 v2, 0x4500, v2
829; VI-NEXT:    flat_store_dword v[0:1], v2
830; VI-NEXT:    s_endpgm
831;
832; CI-LABEL: v_insertelement_v2f16_0:
833; CI:       ; %bb.0:
834; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
835; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
836; CI-NEXT:    s_waitcnt lgkmcnt(0)
837; CI-NEXT:    v_mov_b32_e32 v1, s3
838; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
839; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
840; CI-NEXT:    flat_load_dword v3, v[0:1]
841; CI-NEXT:    v_mov_b32_e32 v1, s1
842; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
843; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
844; CI-NEXT:    s_waitcnt vmcnt(0)
845; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
846; CI-NEXT:    v_or_b32_e32 v2, 0x4500, v2
847; CI-NEXT:    flat_store_dword v[0:1], v2
848; CI-NEXT:    s_endpgm
849  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
850  %tid.ext = sext i32 %tid to i64
851  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
852  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
853  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
854  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
855  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
856  ret void
857}
858
; Each work-item (indexed by llvm.amdgcn.workitem.id.x) loads a <2 x half>
; from %in, replaces element 0 with the half whose bit pattern is 0x0035
; (decimal 53) and stores the result to the same index of %out.
; Unlike the 5.0 variant of this test, the generated checks below expect the
; constant to appear directly as the operand 53 on every target rather than
; being moved into a register first: as the last operand of v_lshl_or_b32 on
; gfx900, and as the OR operand after AND 0xffff0000 on fiji and hawaii.
859define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
860; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm:
861; GFX9:       ; %bb.0:
862; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
863; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
864; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
865; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
866; GFX9-NEXT:    s_waitcnt vmcnt(0)
867; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
868; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, 53
869; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
870; GFX9-NEXT:    s_endpgm
871;
872; VI-LABEL: v_insertelement_v2f16_0_inlineimm:
873; VI:       ; %bb.0:
874; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
875; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
876; VI-NEXT:    s_waitcnt lgkmcnt(0)
877; VI-NEXT:    v_mov_b32_e32 v1, s3
878; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
879; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
880; VI-NEXT:    flat_load_dword v3, v[0:1]
881; VI-NEXT:    v_mov_b32_e32 v1, s1
882; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
883; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
884; VI-NEXT:    s_waitcnt vmcnt(0)
885; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
886; VI-NEXT:    v_or_b32_e32 v2, 53, v2
887; VI-NEXT:    flat_store_dword v[0:1], v2
888; VI-NEXT:    s_endpgm
889;
890; CI-LABEL: v_insertelement_v2f16_0_inlineimm:
891; CI:       ; %bb.0:
892; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
893; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
894; CI-NEXT:    s_waitcnt lgkmcnt(0)
895; CI-NEXT:    v_mov_b32_e32 v1, s3
896; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
897; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
898; CI-NEXT:    flat_load_dword v3, v[0:1]
899; CI-NEXT:    v_mov_b32_e32 v1, s1
900; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
901; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
902; CI-NEXT:    s_waitcnt vmcnt(0)
903; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
904; CI-NEXT:    v_or_b32_e32 v2, 53, v2
905; CI-NEXT:    flat_store_dword v[0:1], v2
906; CI-NEXT:    s_endpgm
907  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
908  %tid.ext = sext i32 %tid to i64
909  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
910  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
911  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
912  %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0
913  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
914  ret void
915}
916
; Each work-item (indexed by llvm.amdgcn.workitem.id.x) loads a <2 x half>
; from %in, replaces element 1 with half 5.0 (bit pattern 0x4500) and stores
; the result to the same index of %out.
; Expected code per the generated checks below: gfx900 puts 0x4500 in an SGPR
; with s_movk_i32, masks the loaded value to its low half and combines both
; with v_lshl_or_b32; fiji materializes the pre-shifted 0x45000000 in a VGPR
; and ORs it in through an SDWA WORD_0 select; hawaii uses AND 0xffff followed
; by OR 0x45000000.
917define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
918; GFX9-LABEL: v_insertelement_v2f16_1:
919; GFX9:       ; %bb.0:
920; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
921; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
922; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
923; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
924; GFX9-NEXT:    s_movk_i32 s2, 0x4500
925; GFX9-NEXT:    s_waitcnt vmcnt(0)
926; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
927; GFX9-NEXT:    v_lshl_or_b32 v1, s2, 16, v1
928; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
929; GFX9-NEXT:    s_endpgm
930;
931; VI-LABEL: v_insertelement_v2f16_1:
932; VI:       ; %bb.0:
933; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
934; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
935; VI-NEXT:    s_waitcnt lgkmcnt(0)
936; VI-NEXT:    v_mov_b32_e32 v1, s3
937; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
938; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
939; VI-NEXT:    flat_load_dword v3, v[0:1]
940; VI-NEXT:    v_mov_b32_e32 v1, s1
941; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
942; VI-NEXT:    v_mov_b32_e32 v2, 0x45000000
943; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
944; VI-NEXT:    s_waitcnt vmcnt(0)
945; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
946; VI-NEXT:    flat_store_dword v[0:1], v2
947; VI-NEXT:    s_endpgm
948;
949; CI-LABEL: v_insertelement_v2f16_1:
950; CI:       ; %bb.0:
951; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
952; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
953; CI-NEXT:    s_waitcnt lgkmcnt(0)
954; CI-NEXT:    v_mov_b32_e32 v1, s3
955; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
956; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
957; CI-NEXT:    flat_load_dword v3, v[0:1]
958; CI-NEXT:    v_mov_b32_e32 v1, s1
959; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
960; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
961; CI-NEXT:    s_waitcnt vmcnt(0)
962; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
963; CI-NEXT:    v_or_b32_e32 v2, 0x45000000, v2
964; CI-NEXT:    flat_store_dword v[0:1], v2
965; CI-NEXT:    s_endpgm
966  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
967  %tid.ext = sext i32 %tid to i64
968  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
969  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
970  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
971  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
972  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
973  ret void
974}
975
; Each work-item (indexed by llvm.amdgcn.workitem.id.x) loads a <2 x half>
; from %in, replaces element 1 with the half whose bit pattern is 0x0023
; (decimal 35) and stores the result to the same index of %out.
; Expected code per the generated checks below: gfx900 masks the loaded value
; to its low half and passes 35 directly as the shifted operand of
; v_lshl_or_b32; fiji materializes the pre-shifted 0x230000 in a VGPR and ORs
; it in through an SDWA WORD_0 select; hawaii uses AND 0xffff followed by
; OR 0x230000.
976define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
977; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm:
978; GFX9:       ; %bb.0:
979; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
980; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
981; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
982; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
983; GFX9-NEXT:    s_waitcnt vmcnt(0)
984; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
985; GFX9-NEXT:    v_lshl_or_b32 v1, 35, 16, v1
986; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
987; GFX9-NEXT:    s_endpgm
988;
989; VI-LABEL: v_insertelement_v2f16_1_inlineimm:
990; VI:       ; %bb.0:
991; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
992; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
993; VI-NEXT:    s_waitcnt lgkmcnt(0)
994; VI-NEXT:    v_mov_b32_e32 v1, s3
995; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
996; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
997; VI-NEXT:    flat_load_dword v3, v[0:1]
998; VI-NEXT:    v_mov_b32_e32 v1, s1
999; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1000; VI-NEXT:    v_mov_b32_e32 v2, 0x230000
1001; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1002; VI-NEXT:    s_waitcnt vmcnt(0)
1003; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1004; VI-NEXT:    flat_store_dword v[0:1], v2
1005; VI-NEXT:    s_endpgm
1006;
1007; CI-LABEL: v_insertelement_v2f16_1_inlineimm:
1008; CI:       ; %bb.0:
1009; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1010; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1011; CI-NEXT:    s_waitcnt lgkmcnt(0)
1012; CI-NEXT:    v_mov_b32_e32 v1, s3
1013; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1014; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1015; CI-NEXT:    flat_load_dword v3, v[0:1]
1016; CI-NEXT:    v_mov_b32_e32 v1, s1
1017; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1018; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1019; CI-NEXT:    s_waitcnt vmcnt(0)
1020; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
1021; CI-NEXT:    v_or_b32_e32 v2, 0x230000, v2
1022; CI-NEXT:    flat_store_dword v[0:1], v2
1023; CI-NEXT:    s_endpgm
1024  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1025  %tid.ext = sext i32 %tid to i64
1026  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1027  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1028  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
1029  %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1
1030  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
1031  ret void
1032}
1033
1034; FIXME: Enable for others when argument load not split
; Loads a <2 x i16> and an i32 index through addrspace(4) pointers (the index
; load is volatile), inserts the constant 999 (0x3e7) at that run-time index
; and stores the result to %out. No work-item id is involved.
; On all three targets the generated checks below expect the whole insert to
; stay on the scalar unit: the index is multiplied by 16 (s_lshl_b32 by 4), a
; 0xffff mask is shifted to the selected half, the old vector is cleared with
; s_andn2_b32, the splat 0x3e703e7 is masked with s_and_b32 and the two are
; merged with s_or_b32. The targets differ only in the kernarg offset used for
; %idx.ptr (0x10 on gfx900 and fiji, 0x4 on hawaii) and in the store
; instruction (global_store_dword on gfx900, flat_store_dword otherwise).
1035define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 {
1036; GFX9-LABEL: s_insertelement_v2i16_dynamic:
1037; GFX9:       ; %bb.0:
1038; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1039; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1040; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1041; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1042; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
1043; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
1044; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1045; GFX9-NEXT:    s_lshl_b32 s2, s4, 4
1046; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s2
1047; GFX9-NEXT:    s_andn2_b32 s3, s5, s2
1048; GFX9-NEXT:    s_and_b32 s2, s2, 0x3e703e7
1049; GFX9-NEXT:    s_or_b32 s2, s2, s3
1050; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1051; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1052; GFX9-NEXT:    s_endpgm
1053;
1054; VI-LABEL: s_insertelement_v2i16_dynamic:
1055; VI:       ; %bb.0:
1056; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1057; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1058; VI-NEXT:    s_waitcnt lgkmcnt(0)
1059; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
1060; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1061; VI-NEXT:    v_mov_b32_e32 v0, s0
1062; VI-NEXT:    v_mov_b32_e32 v1, s1
1063; VI-NEXT:    s_waitcnt lgkmcnt(0)
1064; VI-NEXT:    s_lshl_b32 s0, s4, 4
1065; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1066; VI-NEXT:    s_andn2_b32 s1, s2, s0
1067; VI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
1068; VI-NEXT:    s_or_b32 s0, s0, s1
1069; VI-NEXT:    v_mov_b32_e32 v2, s0
1070; VI-NEXT:    flat_store_dword v[0:1], v2
1071; VI-NEXT:    s_endpgm
1072;
1073; CI-LABEL: s_insertelement_v2i16_dynamic:
1074; CI:       ; %bb.0:
1075; CI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4
1076; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1077; CI-NEXT:    s_waitcnt lgkmcnt(0)
1078; CI-NEXT:    s_load_dword s4, s[6:7], 0x0
1079; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
1080; CI-NEXT:    v_mov_b32_e32 v0, s0
1081; CI-NEXT:    v_mov_b32_e32 v1, s1
1082; CI-NEXT:    s_waitcnt lgkmcnt(0)
1083; CI-NEXT:    s_lshl_b32 s0, s4, 4
1084; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1085; CI-NEXT:    s_andn2_b32 s1, s2, s0
1086; CI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
1087; CI-NEXT:    s_or_b32 s0, s0, s1
1088; CI-NEXT:    v_mov_b32_e32 v2, s0
1089; CI-NEXT:    flat_store_dword v[0:1], v2
1090; CI-NEXT:    s_endpgm
1091  %idx = load volatile i32, i32 addrspace(4)* %idx.ptr
1092  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
1093  %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1094  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
1095  ret void
1096}
1097
; Each work-item (indexed by llvm.amdgcn.workitem.id.x) loads a <2 x i16> from
; %in, inserts the constant 999 (0x3e7) at the run-time index %idx and stores
; the result to the same index of %out. %idx is a plain i32 kernel argument,
; so it is the same value for every work-item while the vector is per-lane.
; On all three targets the generated checks below expect the mask to be built
; with scalar instructions (index shifted left by 4, then 0xffff shifted by
; that amount) and the merge to be a single v_bfi_b32 selecting between the
; splat constant 0x3e703e7 and the loaded vector. The kernarg offset used for
; %idx is 0x10 on gfx900 and fiji and 0x4 on hawaii.
1098define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 {
1099; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1100; GFX9:       ; %bb.0:
1101; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1102; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1103; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1104; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
1105; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1106; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1107; GFX9-NEXT:    s_lshl_b32 s2, s6, 4
1108; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s2
1109; GFX9-NEXT:    s_waitcnt vmcnt(0)
1110; GFX9-NEXT:    v_bfi_b32 v1, s2, v2, v1
1111; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1112; GFX9-NEXT:    s_endpgm
1113;
1114; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1115; VI:       ; %bb.0:
1116; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1117; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1118; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1119; VI-NEXT:    s_waitcnt lgkmcnt(0)
1120; VI-NEXT:    v_mov_b32_e32 v1, s3
1121; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1122; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1123; VI-NEXT:    flat_load_dword v3, v[0:1]
1124; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1125; VI-NEXT:    s_lshl_b32 s0, s4, 4
1126; VI-NEXT:    v_mov_b32_e32 v1, s1
1127; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1128; VI-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
1129; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1130; VI-NEXT:    s_waitcnt vmcnt(0)
1131; VI-NEXT:    v_bfi_b32 v2, s0, v2, v3
1132; VI-NEXT:    flat_store_dword v[0:1], v2
1133; VI-NEXT:    s_endpgm
1134;
1135; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1136; CI:       ; %bb.0:
1137; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1138; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1139; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1140; CI-NEXT:    s_waitcnt lgkmcnt(0)
1141; CI-NEXT:    v_mov_b32_e32 v1, s3
1142; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1143; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1144; CI-NEXT:    flat_load_dword v3, v[0:1]
1145; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1146; CI-NEXT:    s_lshl_b32 s0, s4, 4
1147; CI-NEXT:    v_mov_b32_e32 v1, s1
1148; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1149; CI-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
1150; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1151; CI-NEXT:    s_waitcnt vmcnt(0)
1152; CI-NEXT:    v_bfi_b32 v2, s0, v2, v3
1153; CI-NEXT:    flat_store_dword v[0:1], v2
1154; CI-NEXT:    s_endpgm
1155  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1156  %tid.ext = sext i32 %tid to i64
1157  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1158  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1159  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
1160  %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1161  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
1162  ret void
1163}
1164
; Each work-item (indexed by llvm.amdgcn.workitem.id.x) loads both a
; <2 x half> from %in and its own i32 insert index from %idx.ptr, inserts the
; half with bit pattern 0x1234 at that index and stores the result to the same
; index of %out. Here the index is a per-lane value, in contrast to the
; kernel-argument index of the _dynamic_sgpr test.
; On all three targets the generated checks below expect the mask to be built
; with vector shifts (index shifted left by 4, then 0xffff shifted by that
; amount) and the merge to be a single v_bfi_b32 against an SGPR holding the
; splat 0x12341234. gfx900 and fiji keep 0xffff in an SGPR and use
; v_lshlrev_b32_e64; hawaii uses v_lshl_b32_e32 with 0xffff as a literal.
1165define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
1166; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1167; GFX9:       ; %bb.0:
1168; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1169; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1170; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1171; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1172; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
1173; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
1174; GFX9-NEXT:    s_mov_b32 s2, 0xffff
1175; GFX9-NEXT:    s_waitcnt vmcnt(1)
1176; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1177; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
1178; GFX9-NEXT:    s_mov_b32 s2, 0x12341234
1179; GFX9-NEXT:    s_waitcnt vmcnt(0)
1180; GFX9-NEXT:    v_bfi_b32 v1, v1, s2, v2
1181; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1182; GFX9-NEXT:    s_endpgm
1183;
1184; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1185; VI:       ; %bb.0:
1186; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1187; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
1188; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1189; VI-NEXT:    s_waitcnt lgkmcnt(0)
1190; VI-NEXT:    v_mov_b32_e32 v3, s3
1191; VI-NEXT:    v_mov_b32_e32 v1, s5
1192; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1193; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1194; VI-NEXT:    flat_load_dword v4, v[0:1]
1195; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1196; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1197; VI-NEXT:    flat_load_dword v3, v[0:1]
1198; VI-NEXT:    s_mov_b32 s2, 0xffff
1199; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1200; VI-NEXT:    v_mov_b32_e32 v1, s1
1201; VI-NEXT:    s_mov_b32 s0, 0x12341234
1202; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1203; VI-NEXT:    s_waitcnt vmcnt(1)
1204; VI-NEXT:    v_lshlrev_b32_e32 v2, 4, v4
1205; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, s2
1206; VI-NEXT:    s_waitcnt vmcnt(0)
1207; VI-NEXT:    v_bfi_b32 v2, v2, s0, v3
1208; VI-NEXT:    flat_store_dword v[0:1], v2
1209; VI-NEXT:    s_endpgm
1210;
1211; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1212; CI:       ; %bb.0:
1213; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1214; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
1215; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1216; CI-NEXT:    s_waitcnt lgkmcnt(0)
1217; CI-NEXT:    v_mov_b32_e32 v3, s3
1218; CI-NEXT:    v_mov_b32_e32 v1, s5
1219; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
1220; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1221; CI-NEXT:    flat_load_dword v4, v[0:1]
1222; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1223; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1224; CI-NEXT:    flat_load_dword v3, v[0:1]
1225; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1226; CI-NEXT:    v_mov_b32_e32 v1, s1
1227; CI-NEXT:    s_mov_b32 s0, 0x12341234
1228; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1229; CI-NEXT:    s_waitcnt vmcnt(1)
1230; CI-NEXT:    v_lshlrev_b32_e32 v2, 4, v4
1231; CI-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
1232; CI-NEXT:    s_waitcnt vmcnt(0)
1233; CI-NEXT:    v_bfi_b32 v2, v2, s0, v3
1234; CI-NEXT:    flat_store_dword v[0:1], v2
1235; CI-NEXT:    s_endpgm
1236  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1237  %tid.ext = sext i32 %tid to i64
1238  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1239  %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
1240  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1241  %idx = load i32, i32 addrspace(1)* %idx.gep
1242  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
1243  %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx
1244  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
1245  ret void
1246}
1247
; Each work-item (indexed by llvm.amdgcn.workitem.id.x) loads a <4 x half>
; from %in, replaces element 0 with the low 16 bits of the kernel argument
; %val (truncated to i16 and bitcast to half) and stores the result to the
; same index of %out. An unnamed [8 x i32] argument sits between %in and %val;
; the generated checks below read %val from kernarg offset 0x30 on gfx900 and
; fiji and 0xc on hawaii.
; Expected code on all three targets: the vector is moved as one dwordx2
; load/store pair and only the first dword (v0) is modified, by a v_bfi_b32
; with mask 0xffff that takes the low half from %val and keeps the high half.
1248define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
1249; GFX9-LABEL: v_insertelement_v4f16_0:
1250; GFX9:       ; %bb.0:
1251; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1252; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
1253; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1254; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
1255; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1256; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1257; GFX9-NEXT:    s_waitcnt vmcnt(0)
1258; GFX9-NEXT:    v_bfi_b32 v0, v3, s6, v0
1259; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1260; GFX9-NEXT:    s_endpgm
1261;
1262; VI-LABEL: v_insertelement_v4f16_0:
1263; VI:       ; %bb.0:
1264; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1265; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
1266; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1267; VI-NEXT:    s_waitcnt lgkmcnt(0)
1268; VI-NEXT:    v_mov_b32_e32 v1, s3
1269; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1270; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1271; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1272; VI-NEXT:    v_mov_b32_e32 v3, s1
1273; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1274; VI-NEXT:    s_mov_b32 s0, 0xffff
1275; VI-NEXT:    v_mov_b32_e32 v4, s4
1276; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1277; VI-NEXT:    s_waitcnt vmcnt(0)
1278; VI-NEXT:    v_bfi_b32 v0, s0, v4, v0
1279; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1280; VI-NEXT:    s_endpgm
1281;
1282; CI-LABEL: v_insertelement_v4f16_0:
1283; CI:       ; %bb.0:
1284; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1285; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
1286; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1287; CI-NEXT:    s_waitcnt lgkmcnt(0)
1288; CI-NEXT:    v_mov_b32_e32 v1, s3
1289; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1290; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1291; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1292; CI-NEXT:    v_mov_b32_e32 v3, s1
1293; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1294; CI-NEXT:    s_mov_b32 s0, 0xffff
1295; CI-NEXT:    v_mov_b32_e32 v4, s4
1296; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1297; CI-NEXT:    s_waitcnt vmcnt(0)
1298; CI-NEXT:    v_bfi_b32 v0, s0, v4, v0
1299; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1300; CI-NEXT:    s_endpgm
1301  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1302  %tid.ext = sext i32 %tid to i64
1303  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1304  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1305  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
1306  %val.trunc = trunc i32 %val to i16
1307  %val.cvt = bitcast i16 %val.trunc to half
1308  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0
1309  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1310  ret void
1311}
1312
; Each work-item (indexed by llvm.amdgcn.workitem.id.x) loads a <4 x half>
; from %in, replaces element 1 with the low 16 bits of the kernel argument
; %val (truncated to i16 and bitcast to half) and stores the result to the
; same index of %out. %val directly follows the two pointers; the generated
; checks below read it from kernarg offset 0x10 on gfx900 and fiji and 0x4 on
; hawaii.
; Expected code: only the first dword (v0) of the dwordx2 value is modified.
; gfx900 masks it to its low half and uses v_lshl_or_b32 to shift %val into
; the high half; fiji pre-shifts %val with s_lshl_b32 and ORs it in through an
; SDWA WORD_0 select; hawaii pre-shifts %val and uses AND 0xffff followed by
; a plain OR.
1313define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
1314; GFX9-LABEL: v_insertelement_v4f16_1:
1315; GFX9:       ; %bb.0:
1316; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1317; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1318; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1319; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1320; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1321; GFX9-NEXT:    s_waitcnt vmcnt(0)
1322; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1323; GFX9-NEXT:    v_lshl_or_b32 v0, s6, 16, v0
1324; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1325; GFX9-NEXT:    s_endpgm
1326;
1327; VI-LABEL: v_insertelement_v4f16_1:
1328; VI:       ; %bb.0:
1329; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1330; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1331; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1332; VI-NEXT:    s_waitcnt lgkmcnt(0)
1333; VI-NEXT:    v_mov_b32_e32 v1, s3
1334; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1335; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1336; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1337; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1338; VI-NEXT:    s_lshl_b32 s0, s4, 16
1339; VI-NEXT:    v_mov_b32_e32 v3, s1
1340; VI-NEXT:    v_mov_b32_e32 v4, s0
1341; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1342; VI-NEXT:    s_waitcnt vmcnt(0)
1343; VI-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1344; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1345; VI-NEXT:    s_endpgm
1346;
1347; CI-LABEL: v_insertelement_v4f16_1:
1348; CI:       ; %bb.0:
1349; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1350; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1351; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1352; CI-NEXT:    s_waitcnt lgkmcnt(0)
1353; CI-NEXT:    v_mov_b32_e32 v1, s3
1354; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1355; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1356; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1357; CI-NEXT:    v_mov_b32_e32 v3, s1
1358; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1359; CI-NEXT:    s_lshl_b32 s0, s4, 16
1360; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1361; CI-NEXT:    s_waitcnt vmcnt(0)
1362; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1363; CI-NEXT:    v_or_b32_e32 v0, s0, v0
1364; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1365; CI-NEXT:    s_endpgm
1366  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1367  %tid.ext = sext i32 %tid to i64
1368  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1369  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1370  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
1371  %val.trunc = trunc i32 %val to i16
1372  %val.cvt = bitcast i16 %val.trunc to half
1373  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1
1374  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1375  ret void
1376}
1377
; Each work-item (indexed by llvm.amdgcn.workitem.id.x) loads a <4 x half>
; from %in, replaces element 2 with the low 16 bits of the kernel argument
; %val (truncated to i16 and bitcast to half) and stores the result to the
; same index of %out. An unnamed [8 x i32] argument sits between %in and %val;
; the generated checks below read %val from kernarg offset 0x30 on gfx900 and
; fiji and 0xc on hawaii.
; Expected code on all three targets: same shape as the element-0 test, except
; that the v_bfi_b32 with mask 0xffff is applied to the second dword (v1) of
; the dwordx2 value and the first dword is stored back unchanged.
1378define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
1379; GFX9-LABEL: v_insertelement_v4f16_2:
1380; GFX9:       ; %bb.0:
1381; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1382; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
1383; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1384; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
1385; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1386; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1387; GFX9-NEXT:    s_waitcnt vmcnt(0)
1388; GFX9-NEXT:    v_bfi_b32 v1, v3, s6, v1
1389; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1390; GFX9-NEXT:    s_endpgm
1391;
1392; VI-LABEL: v_insertelement_v4f16_2:
1393; VI:       ; %bb.0:
1394; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1395; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
1396; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1397; VI-NEXT:    s_waitcnt lgkmcnt(0)
1398; VI-NEXT:    v_mov_b32_e32 v1, s3
1399; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1400; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1401; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1402; VI-NEXT:    v_mov_b32_e32 v3, s1
1403; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1404; VI-NEXT:    s_mov_b32 s0, 0xffff
1405; VI-NEXT:    v_mov_b32_e32 v4, s4
1406; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1407; VI-NEXT:    s_waitcnt vmcnt(0)
1408; VI-NEXT:    v_bfi_b32 v1, s0, v4, v1
1409; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1410; VI-NEXT:    s_endpgm
1411;
1412; CI-LABEL: v_insertelement_v4f16_2:
1413; CI:       ; %bb.0:
1414; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1415; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
1416; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1417; CI-NEXT:    s_waitcnt lgkmcnt(0)
1418; CI-NEXT:    v_mov_b32_e32 v1, s3
1419; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1420; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1421; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1422; CI-NEXT:    v_mov_b32_e32 v3, s1
1423; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1424; CI-NEXT:    s_mov_b32 s0, 0xffff
1425; CI-NEXT:    v_mov_b32_e32 v4, s4
1426; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1427; CI-NEXT:    s_waitcnt vmcnt(0)
1428; CI-NEXT:    v_bfi_b32 v1, s0, v4, v1
1429; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1430; CI-NEXT:    s_endpgm
1431  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1432  %tid.ext = sext i32 %tid to i64
1433  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1434  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1435  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
1436  %val.trunc = trunc i32 %val to i16
1437  %val.cvt = bitcast i16 %val.trunc to half
1438  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2
1439  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1440  ret void
1441}
1442
; Each work-item (indexed by llvm.amdgcn.workitem.id.x) loads a <4 x half>
; from %in, replaces element 3 with the low 16 bits of the kernel argument
; %val (truncated to i16 and bitcast to half) and stores the result to the
; same index of %out. %val directly follows the two pointers; the generated
; checks below read it from kernarg offset 0x10 on gfx900 and fiji and 0x4 on
; hawaii.
; Expected code: same shape as the element-1 test, but applied to the second
; dword (v1) of the dwordx2 value. gfx900 masks v1 to its low half and uses
; v_lshl_or_b32 to shift %val into the high half; fiji pre-shifts %val with
; s_lshl_b32 and ORs it in through an SDWA WORD_0 select; hawaii pre-shifts
; %val and uses AND 0xffff followed by a plain OR.
1443define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
1444; GFX9-LABEL: v_insertelement_v4f16_3:
1445; GFX9:       ; %bb.0:
1446; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1447; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1448; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1449; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1450; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1451; GFX9-NEXT:    s_waitcnt vmcnt(0)
1452; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1453; GFX9-NEXT:    v_lshl_or_b32 v1, s6, 16, v1
1454; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1455; GFX9-NEXT:    s_endpgm
1456;
1457; VI-LABEL: v_insertelement_v4f16_3:
1458; VI:       ; %bb.0:
1459; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1460; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1461; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1462; VI-NEXT:    s_waitcnt lgkmcnt(0)
1463; VI-NEXT:    v_mov_b32_e32 v1, s3
1464; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1465; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1466; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1467; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1468; VI-NEXT:    s_lshl_b32 s0, s4, 16
1469; VI-NEXT:    v_mov_b32_e32 v3, s1
1470; VI-NEXT:    v_mov_b32_e32 v4, s0
1471; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1472; VI-NEXT:    s_waitcnt vmcnt(0)
1473; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1474; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1475; VI-NEXT:    s_endpgm
1476;
1477; CI-LABEL: v_insertelement_v4f16_3:
1478; CI:       ; %bb.0:
1479; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1480; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1481; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1482; CI-NEXT:    s_waitcnt lgkmcnt(0)
1483; CI-NEXT:    v_mov_b32_e32 v1, s3
1484; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1485; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1486; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1487; CI-NEXT:    v_mov_b32_e32 v3, s1
1488; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1489; CI-NEXT:    s_lshl_b32 s0, s4, 16
1490; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1491; CI-NEXT:    s_waitcnt vmcnt(0)
1492; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1493; CI-NEXT:    v_or_b32_e32 v1, s0, v1
1494; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1495; CI-NEXT:    s_endpgm
1496  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1497  %tid.ext = sext i32 %tid to i64
1498  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1499  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1500  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
1501  %val.trunc = trunc i32 %val to i16
1502  %val.cvt = bitcast i16 %val.trunc to half
1503  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
1504  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1505  ret void
1506}
1507
; Constant-index insert into <4 x i16>, element 2.
; Each work-item loads a <4 x i16> from %in at its workitem.id.x slot, replaces
; element 2 with the low 16 bits of %val, and stores the vector to the same slot
; of %out.
; In all three check blocks the insert is a single v_bfi_b32 on the second
; loaded dword (v1) using a 0xffff mask; v0 is stored unchanged.
1508define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
1509; GFX9-LABEL: v_insertelement_v4i16_2:
1510; GFX9:       ; %bb.0:
1511; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1512; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1513; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1514; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
1515; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1516; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1517; GFX9-NEXT:    s_waitcnt vmcnt(0)
1518; GFX9-NEXT:    v_bfi_b32 v1, v3, s6, v1
1519; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1520; GFX9-NEXT:    s_endpgm
1521;
1522; VI-LABEL: v_insertelement_v4i16_2:
1523; VI:       ; %bb.0:
1524; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1525; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1526; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1527; VI-NEXT:    s_waitcnt lgkmcnt(0)
1528; VI-NEXT:    v_mov_b32_e32 v1, s3
1529; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1530; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1531; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1532; VI-NEXT:    v_mov_b32_e32 v3, s1
1533; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1534; VI-NEXT:    s_mov_b32 s0, 0xffff
1535; VI-NEXT:    v_mov_b32_e32 v4, s4
1536; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1537; VI-NEXT:    s_waitcnt vmcnt(0)
1538; VI-NEXT:    v_bfi_b32 v1, s0, v4, v1
1539; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1540; VI-NEXT:    s_endpgm
1541;
1542; CI-LABEL: v_insertelement_v4i16_2:
1543; CI:       ; %bb.0:
1544; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1545; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1546; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1547; CI-NEXT:    s_waitcnt lgkmcnt(0)
1548; CI-NEXT:    v_mov_b32_e32 v1, s3
1549; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1550; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1551; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1552; CI-NEXT:    v_mov_b32_e32 v3, s1
1553; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1554; CI-NEXT:    s_mov_b32 s0, 0xffff
1555; CI-NEXT:    v_mov_b32_e32 v4, s4
1556; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1557; CI-NEXT:    s_waitcnt vmcnt(0)
1558; CI-NEXT:    v_bfi_b32 v1, s0, v4, v1
1559; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1560; CI-NEXT:    s_endpgm
  ; Address this work-item's vector in both %in and %out.
1561  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1562  %tid.ext = sext i32 %tid to i64
1563  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
1564  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
1565  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  ; Keep the low 16 bits of %val. The i16-to-i16 bitcast changes nothing;
  ; NOTE(review): presumably retained to mirror the half variants - confirm.
1566  %val.trunc = trunc i32 %val to i16
1567  %val.cvt = bitcast i16 %val.trunc to i16
1568  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
1569  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
1570  ret void
1571}
1572
1573; FIXME: Better code on CI?
; Dynamic-index insert into <4 x i16> where the index is not a kernel argument.
; Each work-item loads a <4 x i16> from %in at its workitem.id.x slot, inserts
; the low 16 bits of %val at an index obtained from a volatile i32 load, and
; stores the vector to the same slot of %out.
; In the check blocks the index arrives in a VGPR, is multiplied by 16 (shift
; left by 4) and used to shift a 64-bit 0xffff mask; the two halves of that mask
; then drive one v_bfi_b32 per loaded dword. The inserted value is first
; replicated into both 16-bit halves of one scalar register.
1574define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
1575; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1576; GFX9:       ; %bb.0:
1577; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1578; GFX9-NEXT:    global_load_dword v2, v[0:1], off glc
1579; GFX9-NEXT:    s_waitcnt vmcnt(0)
1580; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1581; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
1582; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1583; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
1584; GFX9-NEXT:    s_mov_b64 s[2:3], 0xffff
1585; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
1586; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, s[2:3]
1587; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s4, s4
1588; GFX9-NEXT:    s_waitcnt vmcnt(0)
1589; GFX9-NEXT:    v_bfi_b32 v1, v3, s2, v1
1590; GFX9-NEXT:    v_bfi_b32 v0, v2, s2, v0
1591; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
1592; GFX9-NEXT:    s_endpgm
1593;
1594; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1595; VI:       ; %bb.0:
1596; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1597; VI-NEXT:    flat_load_dword v4, v[0:1] glc
1598; VI-NEXT:    s_waitcnt vmcnt(0)
1599; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1600; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1601; VI-NEXT:    s_waitcnt lgkmcnt(0)
1602; VI-NEXT:    v_mov_b32_e32 v1, s3
1603; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1604; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1605; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1606; VI-NEXT:    s_mov_b64 s[2:3], 0xffff
1607; VI-NEXT:    v_mov_b32_e32 v3, s1
1608; VI-NEXT:    s_lshl_b32 s1, s4, 16
1609; VI-NEXT:    s_and_b32 s4, s4, s2
1610; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1611; VI-NEXT:    s_or_b32 s0, s4, s1
1612; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1613; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
1614; VI-NEXT:    v_lshlrev_b64 v[4:5], v4, s[2:3]
1615; VI-NEXT:    s_waitcnt vmcnt(0)
1616; VI-NEXT:    v_bfi_b32 v1, v5, s0, v1
1617; VI-NEXT:    v_bfi_b32 v0, v4, s0, v0
1618; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1619; VI-NEXT:    s_endpgm
1620;
1621; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1622; CI:       ; %bb.0:
1623; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1624; CI-NEXT:    flat_load_dword v4, v[0:1] glc
1625; CI-NEXT:    s_waitcnt vmcnt(0)
1626; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1627; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1628; CI-NEXT:    s_waitcnt lgkmcnt(0)
1629; CI-NEXT:    v_mov_b32_e32 v1, s3
1630; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1631; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1632; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1633; CI-NEXT:    s_mov_b64 s[2:3], 0xffff
1634; CI-NEXT:    v_mov_b32_e32 v3, s1
1635; CI-NEXT:    s_lshl_b32 s1, s4, 16
1636; CI-NEXT:    s_and_b32 s4, s4, s2
1637; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1638; CI-NEXT:    s_or_b32 s0, s4, s1
1639; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1640; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
1641; CI-NEXT:    v_lshl_b64 v[4:5], s[2:3], v4
1642; CI-NEXT:    s_waitcnt vmcnt(0)
1643; CI-NEXT:    v_bfi_b32 v1, v5, s0, v1
1644; CI-NEXT:    v_bfi_b32 v0, v4, s0, v0
1645; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1646; CI-NEXT:    s_endpgm
  ; Address this work-item's vector in both %in and %out.
1647  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1648  %tid.ext = sext i32 %tid to i64
1649  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
1650  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  ; The insert index is read with a volatile load from an undef pointer, so it
  ; is unknown at compile time and the load cannot be dropped.
1651  %idx.val = load volatile i32, i32 addrspace(1)* undef
1652  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  ; Keep the low 16 bits of %val; the i16-to-i16 bitcast changes nothing.
1653  %val.trunc = trunc i32 %val to i16
1654  %val.cvt = bitcast i16 %val.trunc to i16
1655  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
1656  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
1657  ret void
1658}
1659
; Dynamic-index insert into <4 x half> where the index is a kernel argument.
; Each work-item loads a <4 x half> from %in at its workitem.id.x slot, inserts
; the low 16 bits of %val (reinterpreted as half) at index %idxval, and stores
; the vector to the same slot of %out.
; In the check blocks %val and %idxval are fetched together with one
; s_load_dwordx2, and the 64-bit 0xffff mask is shifted by index*16 entirely
; with scalar instructions (s_lshl_b32 by 4, then s_lshl_b64) before one
; v_bfi_b32 per loaded dword applies it.
1660define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {
1661; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1662; GFX9:       ; %bb.0:
1663; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1664; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1665; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1666; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1667; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1668; GFX9-NEXT:    s_mov_b64 s[2:3], 0xffff
1669; GFX9-NEXT:    s_lshl_b32 s4, s7, 4
1670; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s6, s6
1671; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
1672; GFX9-NEXT:    v_mov_b32_e32 v3, s5
1673; GFX9-NEXT:    v_mov_b32_e32 v4, s5
1674; GFX9-NEXT:    s_waitcnt vmcnt(0)
1675; GFX9-NEXT:    v_bfi_b32 v1, s3, v3, v1
1676; GFX9-NEXT:    v_bfi_b32 v0, s2, v4, v0
1677; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1678; GFX9-NEXT:    s_endpgm
1679;
1680; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1681; VI:       ; %bb.0:
1682; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1683; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
1684; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1685; VI-NEXT:    s_waitcnt lgkmcnt(0)
1686; VI-NEXT:    v_mov_b32_e32 v1, s3
1687; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1688; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1689; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1690; VI-NEXT:    s_mov_b64 s[2:3], 0xffff
1691; VI-NEXT:    v_mov_b32_e32 v3, s1
1692; VI-NEXT:    s_lshl_b32 s1, s5, 4
1693; VI-NEXT:    s_lshl_b32 s5, s4, 16
1694; VI-NEXT:    s_and_b32 s4, s4, s2
1695; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1696; VI-NEXT:    s_lshl_b64 s[0:1], s[2:3], s1
1697; VI-NEXT:    s_or_b32 s2, s4, s5
1698; VI-NEXT:    v_mov_b32_e32 v4, s2
1699; VI-NEXT:    v_mov_b32_e32 v5, s2
1700; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1701; VI-NEXT:    s_waitcnt vmcnt(0)
1702; VI-NEXT:    v_bfi_b32 v1, s1, v4, v1
1703; VI-NEXT:    v_bfi_b32 v0, s0, v5, v0
1704; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1705; VI-NEXT:    s_endpgm
1706;
1707; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1708; CI:       ; %bb.0:
1709; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1710; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
1711; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1712; CI-NEXT:    s_waitcnt lgkmcnt(0)
1713; CI-NEXT:    v_mov_b32_e32 v1, s3
1714; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1715; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1716; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1717; CI-NEXT:    s_mov_b64 s[2:3], 0xffff
1718; CI-NEXT:    v_mov_b32_e32 v3, s1
1719; CI-NEXT:    s_and_b32 s6, s4, s2
1720; CI-NEXT:    s_lshl_b32 s1, s5, 4
1721; CI-NEXT:    s_lshl_b32 s4, s4, 16
1722; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1723; CI-NEXT:    s_lshl_b64 s[0:1], s[2:3], s1
1724; CI-NEXT:    s_or_b32 s2, s6, s4
1725; CI-NEXT:    v_mov_b32_e32 v4, s2
1726; CI-NEXT:    v_mov_b32_e32 v5, s2
1727; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1728; CI-NEXT:    s_waitcnt vmcnt(0)
1729; CI-NEXT:    v_bfi_b32 v1, s1, v4, v1
1730; CI-NEXT:    v_bfi_b32 v0, s0, v5, v0
1731; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1732; CI-NEXT:    s_endpgm
  ; Address this work-item's vector in both %in and %out.
1733  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1734  %tid.ext = sext i32 %tid to i64
1735  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1736  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1737  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Keep the low 16 bits of %val and reinterpret them as a half.
1738  %val.trunc = trunc i32 %val to i16
1739  %val.cvt = bitcast i16 %val.trunc to half
  ; The insert position is the %idxval kernel argument, not a constant.
1740  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
1741  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1742  ret void
1743}
1744
; Constant-index insert into <8 x half>, element 3.
; Each work-item loads an <8 x half> from %in at its workitem.id.x slot,
; replaces element 3 with the low 16 bits of %val reinterpreted as half, and
; stores the vector to the same slot of %out.
; In the check blocks the new value lands in the upper 16 bits of the second
; loaded dword (v1).
; NOTE(review): the fiji checks also contain "v_bfi_b32 v3, s2, v3, v3", whose
; two data operands are the same register - this looks like a redundant
; instruction in the expected output; confirm before relying on it.
; NOTE(review): unlike the <4 x ...> kernels in this file, this definition
; carries no #0 attribute group - confirm whether that is intentional.
1745define amdgpu_kernel void @v_insertelement_v8f16_3(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val) {
1746; GFX9-LABEL: v_insertelement_v8f16_3:
1747; GFX9:       ; %bb.0:
1748; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1749; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1750; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1751; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1752; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
1753; GFX9-NEXT:    s_waitcnt vmcnt(0)
1754; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1755; GFX9-NEXT:    v_lshl_or_b32 v1, s6, 16, v1
1756; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1757; GFX9-NEXT:    s_endpgm
1758;
1759; VI-LABEL: v_insertelement_v8f16_3:
1760; VI:       ; %bb.0:
1761; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1762; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1763; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1764; VI-NEXT:    s_waitcnt lgkmcnt(0)
1765; VI-NEXT:    v_mov_b32_e32 v1, s3
1766; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1767; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1768; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1769; VI-NEXT:    v_mov_b32_e32 v5, s1
1770; VI-NEXT:    s_lshl_b32 s1, s4, 16
1771; VI-NEXT:    s_mov_b32 s2, 0xffff
1772; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
1773; VI-NEXT:    v_mov_b32_e32 v6, s1
1774; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1775; VI-NEXT:    s_waitcnt vmcnt(0)
1776; VI-NEXT:    v_bfi_b32 v3, s2, v3, v3
1777; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1778; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1779; VI-NEXT:    s_endpgm
1780;
1781; CI-LABEL: v_insertelement_v8f16_3:
1782; CI:       ; %bb.0:
1783; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1784; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1785; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1786; CI-NEXT:    s_waitcnt lgkmcnt(0)
1787; CI-NEXT:    v_mov_b32_e32 v1, s3
1788; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
1789; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1790; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1791; CI-NEXT:    v_mov_b32_e32 v5, s1
1792; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
1793; CI-NEXT:    s_lshl_b32 s0, s4, 16
1794; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1795; CI-NEXT:    s_waitcnt vmcnt(0)
1796; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1797; CI-NEXT:    v_or_b32_e32 v1, s0, v1
1798; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1799; CI-NEXT:    s_endpgm
  ; Address this work-item's vector in both %in and %out.
1800  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1801  %tid.ext = sext i32 %tid to i64
1802  %in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext
1803  %out.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %out, i64 %tid.ext
1804  %vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep
  ; Keep the low 16 bits of %val and reinterpret them as a half.
1805  %val.trunc = trunc i32 %val to i16
1806  %val.cvt = bitcast i16 %val.trunc to half
1807  %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 3
1808  store <8 x half> %vecins, <8 x half> addrspace(1)* %out.gep
1809  ret void
1810}
1811
; Constant-index insert into <8 x i16>, element 6.
; Each work-item loads an <8 x i16> from %in at its workitem.id.x slot, replaces
; element 6 with the low 16 bits of %val, and stores the vector to the same slot
; of %out.
; In the check blocks the insert is a v_bfi_b32 with a 0xffff mask on the fourth
; loaded dword (v3).
; NOTE(review): the fiji checks also contain "v_bfi_b32 v1, s2, v1, v1", whose
; two data operands are the same register - this looks like a redundant
; instruction in the expected output; confirm before relying on it.
; NOTE(review): unlike the <4 x ...> kernels in this file, this definition
; carries no #0 attribute group - confirm whether that is intentional.
1812define amdgpu_kernel void @v_insertelement_v8i16_6(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in, i32 %val) {
1813; GFX9-LABEL: v_insertelement_v8i16_6:
1814; GFX9:       ; %bb.0:
1815; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1816; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1817; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1818; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
1819; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1820; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
1821; GFX9-NEXT:    s_waitcnt vmcnt(0)
1822; GFX9-NEXT:    v_bfi_b32 v3, v5, s6, v3
1823; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1824; GFX9-NEXT:    s_endpgm
1825;
1826; VI-LABEL: v_insertelement_v8i16_6:
1827; VI:       ; %bb.0:
1828; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1829; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1830; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1831; VI-NEXT:    s_waitcnt lgkmcnt(0)
1832; VI-NEXT:    v_mov_b32_e32 v1, s3
1833; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1834; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1835; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1836; VI-NEXT:    s_mov_b32 s2, 0xffff
1837; VI-NEXT:    v_mov_b32_e32 v5, s1
1838; VI-NEXT:    v_mov_b32_e32 v6, s4
1839; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
1840; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1841; VI-NEXT:    s_waitcnt vmcnt(0)
1842; VI-NEXT:    v_bfi_b32 v3, s2, v6, v3
1843; VI-NEXT:    v_bfi_b32 v1, s2, v1, v1
1844; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1845; VI-NEXT:    s_endpgm
1846;
1847; CI-LABEL: v_insertelement_v8i16_6:
1848; CI:       ; %bb.0:
1849; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1850; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1851; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1852; CI-NEXT:    s_waitcnt lgkmcnt(0)
1853; CI-NEXT:    v_mov_b32_e32 v1, s3
1854; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
1855; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1856; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1857; CI-NEXT:    v_mov_b32_e32 v5, s1
1858; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
1859; CI-NEXT:    s_mov_b32 s0, 0xffff
1860; CI-NEXT:    v_mov_b32_e32 v6, s4
1861; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1862; CI-NEXT:    s_waitcnt vmcnt(0)
1863; CI-NEXT:    v_bfi_b32 v3, s0, v6, v3
1864; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1865; CI-NEXT:    s_endpgm
  ; Address this work-item's vector in both %in and %out.
1866  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1867  %tid.ext = sext i32 %tid to i64
1868  %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext
1869  %out.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %out, i64 %tid.ext
1870  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep
  ; Keep the low 16 bits of %val; the i16-to-i16 bitcast changes nothing.
1871  %val.trunc = trunc i32 %val to i16
1872  %val.cvt = bitcast i16 %val.trunc to i16
1873  %vecins = insertelement <8 x i16> %vec, i16 %val.cvt, i32 6
1874  store <8 x i16> %vecins, <8 x i16> addrspace(1)* %out.gep
1875  ret void
1876}
1877
; Dynamic-index insert into <8 x half> where the index is a kernel argument.
; Each work-item loads an <8 x half> from %in at its workitem.id.x slot, inserts
; the low 16 bits of %val (reinterpreted as half) at index %n, and stores the
; vector to the same slot of %out.
; In the check blocks the index is compared against each of 0..7 with
; s_cmp_eq_u32, and each comparison drives one v_cndmask_b32 that picks either
; the original 16-bit element or the new value; the eight results are then
; repacked into four dwords. The hawaii checks additionally convert every
; element to f32 before the select and back to f16 afterwards.
; NOTE(review): unlike the <4 x ...> kernels in this file, this definition
; carries no #0 attribute group - confirm whether that is intentional.
1878define amdgpu_kernel void @v_insertelement_v8f16_dynamic(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val, i32 %n) {
1879; GFX9-LABEL: v_insertelement_v8f16_dynamic:
1880; GFX9:       ; %bb.0:
1881; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1882; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1883; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1884; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
1885; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1886; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
1887; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
1888; GFX9-NEXT:    v_mov_b32_e32 v6, s6
1889; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
1890; GFX9-NEXT:    s_cmp_eq_u32 s7, 6
1891; GFX9-NEXT:    s_waitcnt vmcnt(0)
1892; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
1893; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
1894; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
1895; GFX9-NEXT:    s_cmp_eq_u32 s7, 5
1896; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
1897; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1898; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
1899; GFX9-NEXT:    s_cmp_eq_u32 s7, 4
1900; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v6, vcc
1901; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
1902; GFX9-NEXT:    s_cmp_eq_u32 s7, 3
1903; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
1904; GFX9-NEXT:    v_and_b32_e32 v3, v5, v3
1905; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1906; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
1907; GFX9-NEXT:    s_cmp_eq_u32 s7, 2
1908; GFX9-NEXT:    v_lshl_or_b32 v3, v7, 16, v3
1909; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
1910; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
1911; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
1912; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
1913; GFX9-NEXT:    v_and_b32_e32 v2, v5, v2
1914; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1915; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
1916; GFX9-NEXT:    s_cmp_eq_u32 s7, 0
1917; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 16, v2
1918; GFX9-NEXT:    v_cndmask_b32_e32 v8, v10, v6, vcc
1919; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
1920; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1921; GFX9-NEXT:    v_and_b32_e32 v1, v5, v1
1922; GFX9-NEXT:    v_and_b32_e32 v0, v5, v0
1923; GFX9-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
1924; GFX9-NEXT:    v_lshl_or_b32 v0, v8, 16, v0
1925; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1926; GFX9-NEXT:    s_endpgm
1927;
1928; VI-LABEL: v_insertelement_v8f16_dynamic:
1929; VI:       ; %bb.0:
1930; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1931; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
1932; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1933; VI-NEXT:    s_waitcnt lgkmcnt(0)
1934; VI-NEXT:    v_mov_b32_e32 v1, s3
1935; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1936; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1937; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1938; VI-NEXT:    v_mov_b32_e32 v5, s1
1939; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
1940; VI-NEXT:    s_cmp_eq_u32 s5, 6
1941; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1942; VI-NEXT:    v_mov_b32_e32 v6, s4
1943; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1944; VI-NEXT:    s_cmp_eq_u32 s5, 7
1945; VI-NEXT:    s_waitcnt vmcnt(0)
1946; VI-NEXT:    v_cndmask_b32_e32 v7, v3, v6, vcc
1947; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1948; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1949; VI-NEXT:    s_cmp_eq_u32 s5, 4
1950; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1951; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1952; VI-NEXT:    s_cmp_eq_u32 s5, 5
1953; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
1954; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1955; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1956; VI-NEXT:    s_cmp_eq_u32 s5, 2
1957; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1958; VI-NEXT:    v_cndmask_b32_e32 v8, v8, v6, vcc
1959; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1960; VI-NEXT:    s_cmp_eq_u32 s5, 3
1961; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
1962; VI-NEXT:    v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1963; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
1964; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1965; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1966; VI-NEXT:    s_cmp_eq_u32 s5, 0
1967; VI-NEXT:    v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1968; VI-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
1969; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1970; VI-NEXT:    s_cmp_eq_u32 s5, 1
1971; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
1972; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1973; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1974; VI-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
1975; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
1976; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
1977; VI-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1978; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1979; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1980; VI-NEXT:    s_endpgm
1981;
1982; CI-LABEL: v_insertelement_v8f16_dynamic:
1983; CI:       ; %bb.0:
1984; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1985; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
1986; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1987; CI-NEXT:    s_waitcnt lgkmcnt(0)
1988; CI-NEXT:    v_mov_b32_e32 v1, s3
1989; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
1990; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1991; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1992; CI-NEXT:    v_mov_b32_e32 v5, s1
1993; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
1994; CI-NEXT:    v_cvt_f32_f16_e32 v6, s4
1995; CI-NEXT:    s_cmp_eq_u32 s5, 7
1996; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1997; CI-NEXT:    s_cselect_b64 vcc, -1, 0
1998; CI-NEXT:    s_cmp_eq_u32 s5, 6
1999; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2000; CI-NEXT:    s_cmp_eq_u32 s5, 5
2001; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
2002; CI-NEXT:    s_cmp_eq_u32 s5, 4
2003; CI-NEXT:    s_waitcnt vmcnt(0)
2004; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
2005; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
2006; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
2007; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
2008; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
2009; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
2010; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2011; CI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
2012; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
2013; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
2014; CI-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
2015; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2016; CI-NEXT:    s_cmp_eq_u32 s5, 3
2017; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
2018; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2019; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
2020; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2021; CI-NEXT:    s_cmp_eq_u32 s5, 2
2022; CI-NEXT:    v_cndmask_b32_e32 v9, v9, v6, vcc
2023; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2024; CI-NEXT:    s_cmp_eq_u32 s5, 1
2025; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2026; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2027; CI-NEXT:    s_cmp_eq_u32 s5, 0
2028; CI-NEXT:    v_cndmask_b32_e64 v8, v8, v6, s[2:3]
2029; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2030; CI-NEXT:    v_cndmask_b32_e32 v10, v10, v6, vcc
2031; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2032; CI-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
2033; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2034; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
2035; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
2036; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
2037; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
2038; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2039; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2040; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2041; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
2042; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
2043; CI-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
2044; CI-NEXT:    v_or_b32_e32 v3, v3, v6
2045; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
2046; CI-NEXT:    v_or_b32_e32 v2, v2, v7
2047; CI-NEXT:    v_or_b32_e32 v1, v1, v8
2048; CI-NEXT:    v_or_b32_e32 v0, v0, v6
2049; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2050; CI-NEXT:    s_endpgm
  ; Address this work-item's vector in both %in and %out.
2051  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2052  %tid.ext = sext i32 %tid to i64
2053  %in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext
2054  %out.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %out, i64 %tid.ext
2055  %vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep
  ; Keep the low 16 bits of %val and reinterpret them as a half.
2056  %val.trunc = trunc i32 %val to i16
2057  %val.cvt = bitcast i16 %val.trunc to half
  ; The insert position is the %n kernel argument, not a constant.
2058  %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 %n
2059  store <8 x half> %vecins, <8 x half> addrspace(1)* %out.gep
2060  ret void
2061}
2062
; Intrinsic called by the kernels above to obtain the per-work-item index used
; to address %in and %out.
2063declare i32 @llvm.amdgcn.workitem.id.x() #1
2064
; Attribute group #0 is attached to kernel definitions; #1 is attached to the
; workitem.id.x declaration and to its call sites.
2065attributes #0 = { nounwind }
2066attributes #1 = { nounwind readnone }
2067