1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CIVI,VI %s
4; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CIVI,CI %s
5; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
6
; Loads a <2 x i16> through the addrspace(4) pointer %vec.ptr, replaces
; element 0 with the constant 999 (printed as 0x3e7 in the checked output)
; and stores the result through %out. Element 1 of the loaded vector is
; carried over unchanged.
7define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
8; GFX9-LABEL: s_insertelement_v2i16_0:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
11; GFX9-NEXT:    v_mov_b32_e32 v0, 0
12; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
13; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
14; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX9-NEXT:    s_pack_lh_b32_b16 s2, 0x3e7, s2
16; GFX9-NEXT:    v_mov_b32_e32 v1, s2
17; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
18; GFX9-NEXT:    s_endpgm
19;
20; CIVI-LABEL: s_insertelement_v2i16_0:
21; CIVI:       ; %bb.0:
22; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
23; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
24; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
25; CIVI-NEXT:    v_mov_b32_e32 v0, s0
26; CIVI-NEXT:    v_mov_b32_e32 v1, s1
27; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
28; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff0000
29; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e7
30; CIVI-NEXT:    v_mov_b32_e32 v2, s0
31; CIVI-NEXT:    flat_store_dword v[0:1], v2
32; CIVI-NEXT:    s_endpgm
33;
34; GFX11-LABEL: s_insertelement_v2i16_0:
35; GFX11:       ; %bb.0:
36; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
37; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
38; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
39; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX11-NEXT:    s_pack_lh_b32_b16 s2, 0x3e7, s2
41; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
42; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
43; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
44; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
45; GFX11-NEXT:    s_endpgm
46  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
47  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0 ; overwrite element 0 (low 16 bits) with 999
48  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
49  ret void
50}
51
52
; Same shape as the constant-insert case, but the value written into element 0
; is the i16 kernel argument %elt. The unnamed [8 x i32] argument sits between
; the two pointers and %elt; the checked output loads %elt from kernarg offset
; 0x30 (0xc on the hawaii run, presumably a dword offset - TODO confirm).
53define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
54; GFX9-LABEL: s_insertelement_v2i16_0_reg:
55; GFX9:       ; %bb.0:
56; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
57; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
58; GFX9-NEXT:    v_mov_b32_e32 v0, 0
59; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
60; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
61; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
62; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s6, s2
63; GFX9-NEXT:    v_mov_b32_e32 v1, s2
64; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
65; GFX9-NEXT:    s_endpgm
66;
67; VI-LABEL: s_insertelement_v2i16_0_reg:
68; VI:       ; %bb.0:
69; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
70; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
71; VI-NEXT:    s_waitcnt lgkmcnt(0)
72; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
73; VI-NEXT:    v_mov_b32_e32 v0, s0
74; VI-NEXT:    v_mov_b32_e32 v1, s1
75; VI-NEXT:    s_and_b32 s0, s4, 0xffff
76; VI-NEXT:    s_waitcnt lgkmcnt(0)
77; VI-NEXT:    s_and_b32 s1, s2, 0xffff0000
78; VI-NEXT:    s_or_b32 s0, s0, s1
79; VI-NEXT:    v_mov_b32_e32 v2, s0
80; VI-NEXT:    flat_store_dword v[0:1], v2
81; VI-NEXT:    s_endpgm
82;
83; CI-LABEL: s_insertelement_v2i16_0_reg:
84; CI:       ; %bb.0:
85; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
86; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
87; CI-NEXT:    s_waitcnt lgkmcnt(0)
88; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
89; CI-NEXT:    v_mov_b32_e32 v0, s0
90; CI-NEXT:    v_mov_b32_e32 v1, s1
91; CI-NEXT:    s_and_b32 s1, s4, 0xffff
92; CI-NEXT:    s_waitcnt lgkmcnt(0)
93; CI-NEXT:    s_and_b32 s0, s2, 0xffff0000
94; CI-NEXT:    s_or_b32 s0, s1, s0
95; CI-NEXT:    v_mov_b32_e32 v2, s0
96; CI-NEXT:    flat_store_dword v[0:1], v2
97; CI-NEXT:    s_endpgm
98;
99; GFX11-LABEL: s_insertelement_v2i16_0_reg:
100; GFX11:       ; %bb.0:
101; GFX11-NEXT:    s_clause 0x1
102; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
103; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x30
104; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
105; GFX11-NEXT:    s_load_b32 s1, s[6:7], 0x0
106; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
107; GFX11-NEXT:    s_pack_lh_b32_b16 s0, s0, s1
108; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
109; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
110; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
111; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
112; GFX11-NEXT:    s_endpgm
113  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
114  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 ; element 0 comes from the kernel argument
115  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
116  ret void
117}
118
; Inserts the i16 kernel argument %elt into element 0 of a vector loaded
; through %vec.ptr, while element 1 of the original vector has a second use:
; it is extracted, zero-extended to i32 and passed to a side-effecting inline
; asm statement after the store.
119define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
120; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
121; GFX9:       ; %bb.0:
122; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
123; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
124; GFX9-NEXT:    v_mov_b32_e32 v0, 0
125; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
126; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
127; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
129; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s6, s2
130; GFX9-NEXT:    v_mov_b32_e32 v1, s3
131; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
132; GFX9-NEXT:    ;;#ASMSTART
133; GFX9-NEXT:    ; use s2
134; GFX9-NEXT:    ;;#ASMEND
135; GFX9-NEXT:    s_endpgm
136;
137; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
138; VI:       ; %bb.0:
139; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
140; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
141; VI-NEXT:    s_waitcnt lgkmcnt(0)
142; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
143; VI-NEXT:    v_mov_b32_e32 v0, s0
144; VI-NEXT:    v_mov_b32_e32 v1, s1
145; VI-NEXT:    s_and_b32 s0, s4, 0xffff
146; VI-NEXT:    s_waitcnt lgkmcnt(0)
147; VI-NEXT:    s_lshr_b32 s1, s2, 16
148; VI-NEXT:    s_and_b32 s2, s2, 0xffff0000
149; VI-NEXT:    s_or_b32 s0, s0, s2
150; VI-NEXT:    v_mov_b32_e32 v2, s0
151; VI-NEXT:    flat_store_dword v[0:1], v2
152; VI-NEXT:    ;;#ASMSTART
153; VI-NEXT:    ; use s1
154; VI-NEXT:    ;;#ASMEND
155; VI-NEXT:    s_endpgm
156;
157; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
158; CI:       ; %bb.0:
159; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
160; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
161; CI-NEXT:    s_waitcnt lgkmcnt(0)
162; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
163; CI-NEXT:    v_mov_b32_e32 v0, s0
164; CI-NEXT:    v_mov_b32_e32 v1, s1
165; CI-NEXT:    s_and_b32 s0, s4, 0xffff
166; CI-NEXT:    s_waitcnt lgkmcnt(0)
167; CI-NEXT:    s_and_b32 s1, s2, 0xffff0000
168; CI-NEXT:    s_or_b32 s0, s0, s1
169; CI-NEXT:    v_mov_b32_e32 v2, s0
170; CI-NEXT:    s_lshr_b32 s2, s2, 16
171; CI-NEXT:    flat_store_dword v[0:1], v2
172; CI-NEXT:    ;;#ASMSTART
173; CI-NEXT:    ; use s2
174; CI-NEXT:    ;;#ASMEND
175; CI-NEXT:    s_endpgm
176;
177; GFX11-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
178; GFX11:       ; %bb.0:
179; GFX11-NEXT:    s_clause 0x1
180; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
181; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x30
182; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX11-NEXT:    s_load_b32 s1, s[6:7], 0x0
184; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
185; GFX11-NEXT:    s_lshr_b32 s1, s1, 16
186; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
187; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
188; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
189; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
190; GFX11-NEXT:    ;;#ASMSTART
191; GFX11-NEXT:    ; use s1
192; GFX11-NEXT:    ;;#ASMEND
193; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
194; GFX11-NEXT:    s_endpgm
195  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
196  %elt1 = extractelement <2 x i16> %vec, i32 1 ; extra use of the original element 1
197  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
198  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
199  %use1 = zext i16 %elt1 to i32
200  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
201  ret void
202}
203
; Inserts the upper 16 bits of the i32 kernel argument %elt.arg (lshr by 16,
; then trunc to i16) into element 0 of a vector loaded through %vec.ptr, and
; stores the result through %out.
204define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
205; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
206; GFX9:       ; %bb.0:
207; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
208; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
209; GFX9-NEXT:    v_mov_b32_e32 v0, 0
210; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
211; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
212; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
213; GFX9-NEXT:    s_pack_hh_b32_b16 s2, s6, s2
214; GFX9-NEXT:    v_mov_b32_e32 v1, s2
215; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
216; GFX9-NEXT:    s_endpgm
217;
218; VI-LABEL: s_insertelement_v2i16_0_reghi:
219; VI:       ; %bb.0:
220; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
221; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
222; VI-NEXT:    s_waitcnt lgkmcnt(0)
223; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
224; VI-NEXT:    v_mov_b32_e32 v0, s0
225; VI-NEXT:    v_mov_b32_e32 v2, s4
226; VI-NEXT:    v_mov_b32_e32 v1, s1
227; VI-NEXT:    s_waitcnt lgkmcnt(0)
228; VI-NEXT:    s_lshr_b32 s0, s2, 16
229; VI-NEXT:    v_alignbit_b32 v2, s0, v2, 16
230; VI-NEXT:    flat_store_dword v[0:1], v2
231; VI-NEXT:    s_endpgm
232;
233; CI-LABEL: s_insertelement_v2i16_0_reghi:
234; CI:       ; %bb.0:
235; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
236; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
237; CI-NEXT:    s_waitcnt lgkmcnt(0)
238; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
239; CI-NEXT:    v_mov_b32_e32 v0, s0
240; CI-NEXT:    v_mov_b32_e32 v1, s1
241; CI-NEXT:    s_lshr_b32 s1, s4, 16
242; CI-NEXT:    s_waitcnt lgkmcnt(0)
243; CI-NEXT:    s_and_b32 s0, s2, 0xffff0000
244; CI-NEXT:    s_or_b32 s0, s1, s0
245; CI-NEXT:    v_mov_b32_e32 v2, s0
246; CI-NEXT:    flat_store_dword v[0:1], v2
247; CI-NEXT:    s_endpgm
248;
249; GFX11-LABEL: s_insertelement_v2i16_0_reghi:
250; GFX11:       ; %bb.0:
251; GFX11-NEXT:    s_clause 0x1
252; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
253; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x30
254; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
255; GFX11-NEXT:    s_load_b32 s1, s[6:7], 0x0
256; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
257; GFX11-NEXT:    s_pack_hh_b32_b16 s0, s0, s1
258; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
259; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
260; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
261; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
262; GFX11-NEXT:    s_endpgm
263  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
264  %elt.hi = lshr i32 %elt.arg, 16 ; move the upper 16 bits of the argument down
265  %elt = trunc i32 %elt.hi to i16
266  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
267  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
268  ret void
269}
270
; Inserts the upper 16 bits of %elt.arg into element 0 of a vector loaded
; through %vec.ptr, and additionally zero-extends that same inserted value and
; passes it to a side-effecting inline asm statement, so the shifted value has
; two uses. There is no padding argument here; the checked output loads
; %elt.arg from kernarg offset 0x10 (0x4 on the hawaii run).
271define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
272; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
273; GFX9:       ; %bb.0:
274; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
275; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
276; GFX9-NEXT:    v_mov_b32_e32 v0, 0
277; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
278; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
279; GFX9-NEXT:    s_lshr_b32 s3, s6, 16
280; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s3, s2
282; GFX9-NEXT:    v_mov_b32_e32 v1, s2
283; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
284; GFX9-NEXT:    ;;#ASMSTART
285; GFX9-NEXT:    ; use s3
286; GFX9-NEXT:    ;;#ASMEND
287; GFX9-NEXT:    s_endpgm
288;
289; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
290; VI:       ; %bb.0:
291; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
292; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
293; VI-NEXT:    s_waitcnt lgkmcnt(0)
294; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
295; VI-NEXT:    v_mov_b32_e32 v1, s1
296; VI-NEXT:    v_mov_b32_e32 v2, s4
297; VI-NEXT:    v_mov_b32_e32 v0, s0
298; VI-NEXT:    s_lshr_b32 s0, s4, 16
299; VI-NEXT:    s_waitcnt lgkmcnt(0)
300; VI-NEXT:    s_lshr_b32 s1, s2, 16
301; VI-NEXT:    v_alignbit_b32 v2, s1, v2, 16
302; VI-NEXT:    flat_store_dword v[0:1], v2
303; VI-NEXT:    ;;#ASMSTART
304; VI-NEXT:    ; use s0
305; VI-NEXT:    ;;#ASMEND
306; VI-NEXT:    s_endpgm
307;
308; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
309; CI:       ; %bb.0:
310; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
311; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
312; CI-NEXT:    s_waitcnt lgkmcnt(0)
313; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
314; CI-NEXT:    v_mov_b32_e32 v0, s0
315; CI-NEXT:    v_mov_b32_e32 v1, s1
316; CI-NEXT:    s_lshr_b32 s0, s4, 16
317; CI-NEXT:    s_waitcnt lgkmcnt(0)
318; CI-NEXT:    s_and_b32 s1, s2, 0xffff0000
319; CI-NEXT:    s_or_b32 s1, s0, s1
320; CI-NEXT:    v_mov_b32_e32 v2, s1
321; CI-NEXT:    flat_store_dword v[0:1], v2
322; CI-NEXT:    ;;#ASMSTART
323; CI-NEXT:    ; use s0
324; CI-NEXT:    ;;#ASMEND
325; CI-NEXT:    s_endpgm
326;
327; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
328; GFX11:       ; %bb.0:
329; GFX11-NEXT:    s_clause 0x1
330; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
331; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x10
332; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
333; GFX11-NEXT:    s_load_b32 s1, s[6:7], 0x0
334; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
335; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
336; GFX11-NEXT:    s_pack_lh_b32_b16 s1, s0, s1
337; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
338; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1
339; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
340; GFX11-NEXT:    ;;#ASMSTART
341; GFX11-NEXT:    ; use s0
342; GFX11-NEXT:    ;;#ASMEND
343; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
344; GFX11-NEXT:    s_endpgm
345  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
346  %elt.hi = lshr i32 %elt.arg, 16
347  %elt = trunc i32 %elt.hi to i16
348  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
349  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
350  %use1 = zext i16 %elt to i32 ; second use of the inserted value
351  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
352  ret void
353}
354
; Inserts the upper 16 bits of %elt.arg into element 0 of a vector loaded
; through %vec.ptr. Both inputs of the resulting vector have an extra use:
; the inserted value and the original element 1 are each zero-extended to i32
; and passed to their own side-effecting inline asm statement after the store.
355define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
356; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
357; GFX9:       ; %bb.0:
358; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
359; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
360; GFX9-NEXT:    v_mov_b32_e32 v0, 0
361; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
362; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
363; GFX9-NEXT:    s_lshr_b32 s3, s6, 16
364; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
365; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
366; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s3, s2
367; GFX9-NEXT:    v_mov_b32_e32 v1, s4
368; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
369; GFX9-NEXT:    ;;#ASMSTART
370; GFX9-NEXT:    ; use s3
371; GFX9-NEXT:    ;;#ASMEND
372; GFX9-NEXT:    ;;#ASMSTART
373; GFX9-NEXT:    ; use s2
374; GFX9-NEXT:    ;;#ASMEND
375; GFX9-NEXT:    s_endpgm
376;
377; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
378; VI:       ; %bb.0:
379; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
380; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
381; VI-NEXT:    s_waitcnt lgkmcnt(0)
382; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
383; VI-NEXT:    v_mov_b32_e32 v1, s1
384; VI-NEXT:    v_mov_b32_e32 v2, s4
385; VI-NEXT:    v_mov_b32_e32 v0, s0
386; VI-NEXT:    s_lshr_b32 s0, s4, 16
387; VI-NEXT:    s_waitcnt lgkmcnt(0)
388; VI-NEXT:    s_lshr_b32 s1, s2, 16
389; VI-NEXT:    v_alignbit_b32 v2, s1, v2, 16
390; VI-NEXT:    flat_store_dword v[0:1], v2
391; VI-NEXT:    ;;#ASMSTART
392; VI-NEXT:    ; use s0
393; VI-NEXT:    ;;#ASMEND
394; VI-NEXT:    ;;#ASMSTART
395; VI-NEXT:    ; use s1
396; VI-NEXT:    ;;#ASMEND
397; VI-NEXT:    s_endpgm
398;
399; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
400; CI:       ; %bb.0:
401; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
402; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
403; CI-NEXT:    s_waitcnt lgkmcnt(0)
404; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
405; CI-NEXT:    v_mov_b32_e32 v1, s1
406; CI-NEXT:    v_mov_b32_e32 v2, s4
407; CI-NEXT:    v_mov_b32_e32 v0, s0
408; CI-NEXT:    s_lshr_b32 s0, s4, 16
409; CI-NEXT:    s_waitcnt lgkmcnt(0)
410; CI-NEXT:    s_lshr_b32 s1, s2, 16
411; CI-NEXT:    v_alignbit_b32 v2, s1, v2, 16
412; CI-NEXT:    flat_store_dword v[0:1], v2
413; CI-NEXT:    ;;#ASMSTART
414; CI-NEXT:    ; use s0
415; CI-NEXT:    ;;#ASMEND
416; CI-NEXT:    ;;#ASMSTART
417; CI-NEXT:    ; use s1
418; CI-NEXT:    ;;#ASMEND
419; CI-NEXT:    s_endpgm
420;
421; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
422; GFX11:       ; %bb.0:
423; GFX11-NEXT:    s_clause 0x1
424; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
425; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x10
426; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
427; GFX11-NEXT:    s_load_b32 s1, s[6:7], 0x0
428; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
429; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
430; GFX11-NEXT:    s_lshr_b32 s1, s1, 16
431; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
432; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s0, s1
433; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
434; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
435; GFX11-NEXT:    ;;#ASMSTART
436; GFX11-NEXT:    ; use s0
437; GFX11-NEXT:    ;;#ASMEND
438; GFX11-NEXT:    ;;#ASMSTART
439; GFX11-NEXT:    ; use s1
440; GFX11-NEXT:    ;;#ASMEND
441; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
442; GFX11-NEXT:    s_endpgm
443  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
444  %elt.hi = lshr i32 %elt.arg, 16
445  %elt = trunc i32 %elt.hi to i16
446  %vec.hi = extractelement <2 x i16> %vec, i32 1 ; extra use of the original element 1
447  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
448  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
449  %use1 = zext i16 %elt to i32 ; extra use of the inserted value
450  %vec.hi.use1 = zext i16 %vec.hi to i32
451
452  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
453  call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
454  ret void
455}
456
; Loads a <2 x i16> through the addrspace(4) pointer %vec.ptr, replaces
; element 1 with the constant 999 and stores the result through %out.
; In the checked output the constant appears either as 0x3e7 in the second
; pack operand or pre-shifted as 0x3e70000.
457define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
458; GFX9-LABEL: s_insertelement_v2i16_1:
459; GFX9:       ; %bb.0:
460; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
461; GFX9-NEXT:    v_mov_b32_e32 v0, 0
462; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
463; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
464; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
465; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x3e7
466; GFX9-NEXT:    v_mov_b32_e32 v1, s2
467; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
468; GFX9-NEXT:    s_endpgm
469;
470; CIVI-LABEL: s_insertelement_v2i16_1:
471; CIVI:       ; %bb.0:
472; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
473; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
474; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
475; CIVI-NEXT:    v_mov_b32_e32 v0, s0
476; CIVI-NEXT:    v_mov_b32_e32 v1, s1
477; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
478; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff
479; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e70000
480; CIVI-NEXT:    v_mov_b32_e32 v2, s0
481; CIVI-NEXT:    flat_store_dword v[0:1], v2
482; CIVI-NEXT:    s_endpgm
483;
484; GFX11-LABEL: s_insertelement_v2i16_1:
485; GFX11:       ; %bb.0:
486; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
487; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
488; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
489; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
490; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x3e7
491; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
492; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
493; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
494; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
495; GFX11-NEXT:    s_endpgm
496  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
497  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1 ; overwrite element 1 (high 16 bits) with 999
498  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
499  ret void
500}
501
; Loads a <2 x i16> through %vec.ptr, replaces element 1 with the i16 kernel
; argument %elt and stores the result through %out. The unnamed [8 x i32]
; argument sits between the pointers and %elt; the checked output loads %elt
; from kernarg offset 0x30 (0xc on the hawaii run).
502define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
503; GFX9-LABEL: s_insertelement_v2i16_1_reg:
504; GFX9:       ; %bb.0:
505; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
506; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
507; GFX9-NEXT:    v_mov_b32_e32 v0, 0
508; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
509; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
510; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
511; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s6
512; GFX9-NEXT:    v_mov_b32_e32 v1, s2
513; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
514; GFX9-NEXT:    s_endpgm
515;
516; VI-LABEL: s_insertelement_v2i16_1_reg:
517; VI:       ; %bb.0:
518; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
519; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
520; VI-NEXT:    s_waitcnt lgkmcnt(0)
521; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
522; VI-NEXT:    v_mov_b32_e32 v0, s0
523; VI-NEXT:    v_mov_b32_e32 v1, s1
524; VI-NEXT:    s_lshl_b32 s0, s4, 16
525; VI-NEXT:    s_waitcnt lgkmcnt(0)
526; VI-NEXT:    s_and_b32 s1, s2, 0xffff
527; VI-NEXT:    s_or_b32 s0, s1, s0
528; VI-NEXT:    v_mov_b32_e32 v2, s0
529; VI-NEXT:    flat_store_dword v[0:1], v2
530; VI-NEXT:    s_endpgm
531;
532; CI-LABEL: s_insertelement_v2i16_1_reg:
533; CI:       ; %bb.0:
534; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
535; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
536; CI-NEXT:    s_waitcnt lgkmcnt(0)
537; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
538; CI-NEXT:    v_mov_b32_e32 v0, s0
539; CI-NEXT:    v_mov_b32_e32 v1, s1
540; CI-NEXT:    s_lshl_b32 s1, s4, 16
541; CI-NEXT:    s_waitcnt lgkmcnt(0)
542; CI-NEXT:    s_and_b32 s0, s2, 0xffff
543; CI-NEXT:    s_or_b32 s0, s0, s1
544; CI-NEXT:    v_mov_b32_e32 v2, s0
545; CI-NEXT:    flat_store_dword v[0:1], v2
546; CI-NEXT:    s_endpgm
547;
548; GFX11-LABEL: s_insertelement_v2i16_1_reg:
549; GFX11:       ; %bb.0:
550; GFX11-NEXT:    s_clause 0x1
551; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
552; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x30
553; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
554; GFX11-NEXT:    s_load_b32 s1, s[6:7], 0x0
555; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
556; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s1, s0
557; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
558; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
559; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
560; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
561; GFX11-NEXT:    s_endpgm
562  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
563  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1 ; element 1 comes from the kernel argument
564  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
565  ret void
566}
567
; Half-precision variant: loads a <2 x half> through the addrspace(4) pointer
; %vec.ptr, replaces element 0 with the constant 5.0 (printed as 0x4500 in the
; checked output) and stores the result through %out.
568define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
569; GFX9-LABEL: s_insertelement_v2f16_0:
570; GFX9:       ; %bb.0:
571; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
572; GFX9-NEXT:    v_mov_b32_e32 v0, 0
573; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
574; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
575; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
576; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
577; GFX9-NEXT:    s_pack_ll_b32_b16 s2, 0x4500, s2
578; GFX9-NEXT:    v_mov_b32_e32 v1, s2
579; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
580; GFX9-NEXT:    s_endpgm
581;
582; CIVI-LABEL: s_insertelement_v2f16_0:
583; CIVI:       ; %bb.0:
584; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
585; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
586; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
587; CIVI-NEXT:    v_mov_b32_e32 v0, s0
588; CIVI-NEXT:    v_mov_b32_e32 v1, s1
589; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
590; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff0000
591; CIVI-NEXT:    s_or_b32 s0, s0, 0x4500
592; CIVI-NEXT:    v_mov_b32_e32 v2, s0
593; CIVI-NEXT:    flat_store_dword v[0:1], v2
594; CIVI-NEXT:    s_endpgm
595;
596; GFX11-LABEL: s_insertelement_v2f16_0:
597; GFX11:       ; %bb.0:
598; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
599; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
600; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
601; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
602; GFX11-NEXT:    s_lshr_b32 s2, s2, 16
603; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
604; GFX11-NEXT:    s_pack_ll_b32_b16 s2, 0x4500, s2
605; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
606; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
607; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
608; GFX11-NEXT:    s_endpgm
609  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
610  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0 ; overwrite element 0 with 5.0
611  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
612  ret void
613}
614
; Half-precision variant: loads a <2 x half> through the addrspace(4) pointer
; %vec.ptr, replaces element 1 with the constant 5.0 and stores the result
; through %out. In the checked output the constant appears either as 0x4500
; in the second pack operand or pre-shifted as 0x45000000.
615define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
616; GFX9-LABEL: s_insertelement_v2f16_1:
617; GFX9:       ; %bb.0:
618; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
619; GFX9-NEXT:    v_mov_b32_e32 v0, 0
620; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
621; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
622; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
623; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x4500
624; GFX9-NEXT:    v_mov_b32_e32 v1, s2
625; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
626; GFX9-NEXT:    s_endpgm
627;
628; CIVI-LABEL: s_insertelement_v2f16_1:
629; CIVI:       ; %bb.0:
630; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
631; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
632; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
633; CIVI-NEXT:    v_mov_b32_e32 v0, s0
634; CIVI-NEXT:    v_mov_b32_e32 v1, s1
635; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
636; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff
637; CIVI-NEXT:    s_or_b32 s0, s0, 0x45000000
638; CIVI-NEXT:    v_mov_b32_e32 v2, s0
639; CIVI-NEXT:    flat_store_dword v[0:1], v2
640; CIVI-NEXT:    s_endpgm
641;
642; GFX11-LABEL: s_insertelement_v2f16_1:
643; GFX11:       ; %bb.0:
644; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
645; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
646; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
647; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
648; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x4500
649; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
650; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
651; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
652; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
653; GFX11-NEXT:    s_endpgm
654  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
655  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 ; overwrite element 1 with 5.0
656  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
657  ret void
658}
659
; Per-workitem variant: the workitem id indexes both %in and %out, so the
; vector is loaded from addrspace(1) at an address that differs per workitem
; (the checked output uses global/flat loads rather than scalar loads).
; Element 0 of the loaded <2 x i16> is replaced with the constant 999 and the
; result is stored to the matching slot of %out.
660define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
661; GFX9-LABEL: v_insertelement_v2i16_0:
662; GFX9:       ; %bb.0:
663; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
664; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
665; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
666; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
667; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
668; GFX9-NEXT:    s_movk_i32 s2, 0x3e7
669; GFX9-NEXT:    s_waitcnt vmcnt(0)
670; GFX9-NEXT:    v_bfi_b32 v1, v2, s2, v1
671; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
672; GFX9-NEXT:    s_endpgm
673;
674; VI-LABEL: v_insertelement_v2i16_0:
675; VI:       ; %bb.0:
676; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
677; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
678; VI-NEXT:    s_waitcnt lgkmcnt(0)
679; VI-NEXT:    v_mov_b32_e32 v1, s3
680; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
681; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
682; VI-NEXT:    flat_load_dword v3, v[0:1]
683; VI-NEXT:    v_mov_b32_e32 v1, s1
684; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
685; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
686; VI-NEXT:    s_waitcnt vmcnt(0)
687; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
688; VI-NEXT:    v_or_b32_e32 v2, 0x3e7, v2
689; VI-NEXT:    flat_store_dword v[0:1], v2
690; VI-NEXT:    s_endpgm
691;
692; CI-LABEL: v_insertelement_v2i16_0:
693; CI:       ; %bb.0:
694; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
695; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
696; CI-NEXT:    s_waitcnt lgkmcnt(0)
697; CI-NEXT:    v_mov_b32_e32 v1, s3
698; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
699; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
700; CI-NEXT:    flat_load_dword v3, v[0:1]
701; CI-NEXT:    v_mov_b32_e32 v1, s1
702; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
703; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
704; CI-NEXT:    s_waitcnt vmcnt(0)
705; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
706; CI-NEXT:    v_or_b32_e32 v2, 0x3e7, v2
707; CI-NEXT:    flat_store_dword v[0:1], v2
708; CI-NEXT:    s_endpgm
709;
710; GFX11-LABEL: v_insertelement_v2i16_0:
711; GFX11:       ; %bb.0:
712; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
713; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
714; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
715; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
716; GFX11-NEXT:    s_movk_i32 s2, 0x3e7
717; GFX11-NEXT:    s_waitcnt vmcnt(0)
718; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, s2, v1
719; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
720; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
721; GFX11-NEXT:    s_endpgm
722  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
723  %tid.ext = sext i32 %tid to i64 ; widen the workitem id to use it as a 64-bit GEP index
724  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
725  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
726  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
727  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0 ; overwrite element 0 (low 16 bits) with 999
728  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
729  ret void
730}
731
; Per-workitem variant of the "reghi" case: the workitem id indexes both %in
; and %out, and element 0 of the loaded <2 x i16> is replaced with the upper
; 16 bits of the i32 kernel argument %elt.arg (lshr by 16, then trunc to i16).
; The checked output loads %elt.arg from kernarg offset 0x10 (0x4 on the
; hawaii run).
732define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
733; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
734; GFX9:       ; %bb.0:
735; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
736; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
737; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
738; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff0000
739; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
740; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
741; GFX9-NEXT:    v_lshrrev_b32_e64 v2, 16, s6
742; GFX9-NEXT:    s_waitcnt vmcnt(0)
743; GFX9-NEXT:    v_and_or_b32 v1, v1, v3, v2
744; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
745; GFX9-NEXT:    s_endpgm
746;
747; VI-LABEL: v_insertelement_v2i16_0_reghi:
748; VI:       ; %bb.0:
749; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
750; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
751; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
752; VI-NEXT:    s_waitcnt lgkmcnt(0)
753; VI-NEXT:    v_mov_b32_e32 v1, s3
754; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
755; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
756; VI-NEXT:    flat_load_dword v3, v[0:1]
757; VI-NEXT:    v_mov_b32_e32 v1, s1
758; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
759; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
760; VI-NEXT:    s_waitcnt vmcnt(0)
761; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
762; VI-NEXT:    v_alignbit_b32 v2, v2, s4, 16
763; VI-NEXT:    flat_store_dword v[0:1], v2
764; VI-NEXT:    s_endpgm
765;
766; CI-LABEL: v_insertelement_v2i16_0_reghi:
767; CI:       ; %bb.0:
768; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
769; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
770; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
771; CI-NEXT:    s_waitcnt lgkmcnt(0)
772; CI-NEXT:    v_mov_b32_e32 v1, s3
773; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
774; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
775; CI-NEXT:    flat_load_dword v3, v[0:1]
776; CI-NEXT:    v_mov_b32_e32 v1, s1
777; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
778; CI-NEXT:    s_lshr_b32 s0, s4, 16
779; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
780; CI-NEXT:    s_waitcnt vmcnt(0)
781; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
782; CI-NEXT:    v_or_b32_e32 v2, s0, v2
783; CI-NEXT:    flat_store_dword v[0:1], v2
784; CI-NEXT:    s_endpgm
785;
786; GFX11-LABEL: v_insertelement_v2i16_0_reghi:
787; GFX11:       ; %bb.0:
788; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
789; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
790; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x10
791; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
792; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
793; GFX11-NEXT:    v_lshrrev_b32_e64 v2, 16, s0
794; GFX11-NEXT:    s_waitcnt vmcnt(0)
795; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
796; GFX11-NEXT:    v_and_or_b32 v1, 0xffff0000, v1, v2
797; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
798; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
799; GFX11-NEXT:    s_endpgm
800  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
801  %tid.ext = sext i32 %tid to i64 ; widen the workitem id to use it as a 64-bit GEP index
802  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
803  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
804  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
805  %elt.hi = lshr i32 %elt.arg, 16 ; move the upper 16 bits of the argument down
806  %elt = trunc i32 %elt.hi to i16
807  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
808  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
809  ret void
810}
811
; Inserts the constant i16 53 into element 0 of a per-work-item <2 x i16>.
; Each work-item loads a vector from %in (indexed by workitem.id.x), overwrites
; element 0, and stores the result to the matching slot of %out.
; In the expected output below, 53 appears directly as an instruction operand
; rather than being materialized into a register first.
; The assertions are autogenerated; regenerate them instead of hand-editing.
812define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
813; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm:
814; GFX9:       ; %bb.0:
815; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
816; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
817; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
818; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
819; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
820; GFX9-NEXT:    s_waitcnt vmcnt(0)
821; GFX9-NEXT:    v_bfi_b32 v1, v2, 53, v1
822; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
823; GFX9-NEXT:    s_endpgm
824;
825; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
826; VI:       ; %bb.0:
827; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
828; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
829; VI-NEXT:    s_waitcnt lgkmcnt(0)
830; VI-NEXT:    v_mov_b32_e32 v1, s3
831; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
832; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
833; VI-NEXT:    flat_load_dword v3, v[0:1]
834; VI-NEXT:    v_mov_b32_e32 v1, s1
835; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
836; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
837; VI-NEXT:    s_waitcnt vmcnt(0)
838; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
839; VI-NEXT:    v_or_b32_e32 v2, 53, v2
840; VI-NEXT:    flat_store_dword v[0:1], v2
841; VI-NEXT:    s_endpgm
842;
843; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
844; CI:       ; %bb.0:
845; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
846; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
847; CI-NEXT:    s_waitcnt lgkmcnt(0)
848; CI-NEXT:    v_mov_b32_e32 v1, s3
849; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
850; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
851; CI-NEXT:    flat_load_dword v3, v[0:1]
852; CI-NEXT:    v_mov_b32_e32 v1, s1
853; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
854; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
855; CI-NEXT:    s_waitcnt vmcnt(0)
856; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
857; CI-NEXT:    v_or_b32_e32 v2, 53, v2
858; CI-NEXT:    flat_store_dword v[0:1], v2
859; CI-NEXT:    s_endpgm
860;
861; GFX11-LABEL: v_insertelement_v2i16_0_inlineimm:
862; GFX11:       ; %bb.0:
863; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
864; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
865; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
866; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
867; GFX11-NEXT:    s_waitcnt vmcnt(0)
868; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, 53, v1
869; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
870; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
871; GFX11-NEXT:    s_endpgm
; Per-work-item addressing: the work-item id selects the vector in %in/%out.
872  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
873  %tid.ext = sext i32 %tid to i64
874  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
875  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
876  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
; Replace the low element with the constant; element 1 is left as loaded.
877  %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
878  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
879  ret void
880}
881
882; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
; Inserts the constant i16 999 (0x3e7) into element 1 of a per-work-item
; <2 x i16>. Each work-item loads a vector from %in (indexed by
; workitem.id.x), overwrites the high element, and stores the result to the
; matching slot of %out.
; The expected gfx900 and gfx1100 output still shifts the constant with
; v_lshl_or_b32, while the fiji and hawaii output uses the pre-shifted
; constant 0x3e70000.
; The assertions are autogenerated; regenerate them instead of hand-editing.
883define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
884; GFX9-LABEL: v_insertelement_v2i16_1:
885; GFX9:       ; %bb.0:
886; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
887; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
888; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
889; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
890; GFX9-NEXT:    s_movk_i32 s2, 0x3e7
891; GFX9-NEXT:    s_waitcnt vmcnt(0)
892; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
893; GFX9-NEXT:    v_lshl_or_b32 v1, s2, 16, v1
894; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
895; GFX9-NEXT:    s_endpgm
896;
897; VI-LABEL: v_insertelement_v2i16_1:
898; VI:       ; %bb.0:
899; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
900; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
901; VI-NEXT:    s_waitcnt lgkmcnt(0)
902; VI-NEXT:    v_mov_b32_e32 v1, s3
903; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
904; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
905; VI-NEXT:    flat_load_dword v3, v[0:1]
906; VI-NEXT:    v_mov_b32_e32 v1, s1
907; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
908; VI-NEXT:    v_mov_b32_e32 v2, 0x3e70000
909; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
910; VI-NEXT:    s_waitcnt vmcnt(0)
911; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
912; VI-NEXT:    flat_store_dword v[0:1], v2
913; VI-NEXT:    s_endpgm
914;
915; CI-LABEL: v_insertelement_v2i16_1:
916; CI:       ; %bb.0:
917; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
918; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
919; CI-NEXT:    s_waitcnt lgkmcnt(0)
920; CI-NEXT:    v_mov_b32_e32 v1, s3
921; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
922; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
923; CI-NEXT:    flat_load_dword v3, v[0:1]
924; CI-NEXT:    v_mov_b32_e32 v1, s1
925; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
926; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
927; CI-NEXT:    s_waitcnt vmcnt(0)
928; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
929; CI-NEXT:    v_or_b32_e32 v2, 0x3e70000, v2
930; CI-NEXT:    flat_store_dword v[0:1], v2
931; CI-NEXT:    s_endpgm
932;
933; GFX11-LABEL: v_insertelement_v2i16_1:
934; GFX11:       ; %bb.0:
935; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
936; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
937; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
938; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
939; GFX11-NEXT:    s_waitcnt vmcnt(0)
940; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
941; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
942; GFX11-NEXT:    v_lshl_or_b32 v1, 0x3e7, 16, v1
943; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
944; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
945; GFX11-NEXT:    s_endpgm
; Per-work-item addressing: the work-item id selects the vector in %in/%out.
946  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
947  %tid.ext = sext i32 %tid to i64
948  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
949  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
950  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
; Replace the high element with the constant; element 0 is left as loaded.
951  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
952  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
953  ret void
954}
955
; Inserts the constant i16 -15 into element 1 of a per-work-item <2 x i16>.
; Each work-item loads a vector from %in (indexed by workitem.id.x), overwrites
; the high element, and stores the result to the matching slot of %out.
; In the expected gfx900 and gfx1100 output, -15 appears directly as an operand
; of v_lshl_or_b32; the fiji and hawaii output uses the pre-shifted 32-bit
; constant 0xfff10000 instead.
; The assertions are autogenerated; regenerate them instead of hand-editing.
956define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
957; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm:
958; GFX9:       ; %bb.0:
959; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
960; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
961; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
962; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
963; GFX9-NEXT:    s_waitcnt vmcnt(0)
964; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
965; GFX9-NEXT:    v_lshl_or_b32 v1, -15, 16, v1
966; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
967; GFX9-NEXT:    s_endpgm
968;
969; VI-LABEL: v_insertelement_v2i16_1_inlineimm:
970; VI:       ; %bb.0:
971; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
972; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
973; VI-NEXT:    s_waitcnt lgkmcnt(0)
974; VI-NEXT:    v_mov_b32_e32 v1, s3
975; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
976; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
977; VI-NEXT:    flat_load_dword v3, v[0:1]
978; VI-NEXT:    v_mov_b32_e32 v1, s1
979; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
980; VI-NEXT:    v_mov_b32_e32 v2, 0xfff10000
981; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
982; VI-NEXT:    s_waitcnt vmcnt(0)
983; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
984; VI-NEXT:    flat_store_dword v[0:1], v2
985; VI-NEXT:    s_endpgm
986;
987; CI-LABEL: v_insertelement_v2i16_1_inlineimm:
988; CI:       ; %bb.0:
989; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
990; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
991; CI-NEXT:    s_waitcnt lgkmcnt(0)
992; CI-NEXT:    v_mov_b32_e32 v1, s3
993; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
994; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
995; CI-NEXT:    flat_load_dword v3, v[0:1]
996; CI-NEXT:    v_mov_b32_e32 v1, s1
997; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
998; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
999; CI-NEXT:    s_waitcnt vmcnt(0)
1000; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
1001; CI-NEXT:    v_or_b32_e32 v2, 0xfff10000, v2
1002; CI-NEXT:    flat_store_dword v[0:1], v2
1003; CI-NEXT:    s_endpgm
1004;
1005; GFX11-LABEL: v_insertelement_v2i16_1_inlineimm:
1006; GFX11:       ; %bb.0:
1007; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
1008; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1009; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1010; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1011; GFX11-NEXT:    s_waitcnt vmcnt(0)
1012; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1013; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1014; GFX11-NEXT:    v_lshl_or_b32 v1, -15, 16, v1
1015; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1016; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1017; GFX11-NEXT:    s_endpgm
; Per-work-item addressing: the work-item id selects the vector in %in/%out.
1018  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1019  %tid.ext = sext i32 %tid to i64
1020  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1021  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1022  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
; Replace the high element with the constant; element 0 is left as loaded.
1023  %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1
1024  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
1025  ret void
1026}
1027
; Inserts the constant half 5.0 into element 0 of a per-work-item <2 x half>.
; Each work-item loads a vector from %in (indexed by workitem.id.x), overwrites
; the low element, and stores the result to the matching slot of %out.
; The constant shows up in the expected output as the 16-bit pattern 0x4500.
; The assertions are autogenerated; regenerate them instead of hand-editing.
1028define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
1029; GFX9-LABEL: v_insertelement_v2f16_0:
1030; GFX9:       ; %bb.0:
1031; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1032; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1033; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4500
1034; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1035; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1036; GFX9-NEXT:    s_waitcnt vmcnt(0)
1037; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1038; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1039; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1040; GFX9-NEXT:    s_endpgm
1041;
1042; VI-LABEL: v_insertelement_v2f16_0:
1043; VI:       ; %bb.0:
1044; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1045; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1046; VI-NEXT:    s_waitcnt lgkmcnt(0)
1047; VI-NEXT:    v_mov_b32_e32 v1, s3
1048; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1049; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1050; VI-NEXT:    flat_load_dword v3, v[0:1]
1051; VI-NEXT:    v_mov_b32_e32 v1, s1
1052; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1053; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1054; VI-NEXT:    s_waitcnt vmcnt(0)
1055; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
1056; VI-NEXT:    v_or_b32_e32 v2, 0x4500, v2
1057; VI-NEXT:    flat_store_dword v[0:1], v2
1058; VI-NEXT:    s_endpgm
1059;
1060; CI-LABEL: v_insertelement_v2f16_0:
1061; CI:       ; %bb.0:
1062; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1063; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1064; CI-NEXT:    s_waitcnt lgkmcnt(0)
1065; CI-NEXT:    v_mov_b32_e32 v1, s3
1066; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1067; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1068; CI-NEXT:    flat_load_dword v3, v[0:1]
1069; CI-NEXT:    v_mov_b32_e32 v1, s1
1070; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1071; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1072; CI-NEXT:    s_waitcnt vmcnt(0)
1073; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
1074; CI-NEXT:    v_or_b32_e32 v2, 0x4500, v2
1075; CI-NEXT:    flat_store_dword v[0:1], v2
1076; CI-NEXT:    s_endpgm
1077;
1078; GFX11-LABEL: v_insertelement_v2f16_0:
1079; GFX11:       ; %bb.0:
1080; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
1081; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1082; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1083; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1084; GFX11-NEXT:    s_waitcnt vmcnt(0)
1085; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1086; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1087; GFX11-NEXT:    v_lshl_or_b32 v1, v1, 16, 0x4500
1088; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1089; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1090; GFX11-NEXT:    s_endpgm
; Per-work-item addressing: the work-item id selects the vector in %in/%out.
1091  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1092  %tid.ext = sext i32 %tid to i64
1093  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1094  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1095  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
; Replace the low element with 5.0; element 1 is left as loaded.
1096  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
1097  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
1098  ret void
1099}
1100
; Inserts the half constant with bit pattern 0xH0035 into element 0 of a
; per-work-item <2 x half>. Each work-item loads a vector from %in (indexed by
; workitem.id.x), overwrites the low element, and stores the result to the
; matching slot of %out.
; In the expected output the constant appears as the integer operand 53
; (0x35) of the final or / lshl_or instruction.
; The assertions are autogenerated; regenerate them instead of hand-editing.
1101define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
1102; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm:
1103; GFX9:       ; %bb.0:
1104; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1105; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1106; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1107; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1108; GFX9-NEXT:    s_waitcnt vmcnt(0)
1109; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1110; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, 53
1111; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1112; GFX9-NEXT:    s_endpgm
1113;
1114; VI-LABEL: v_insertelement_v2f16_0_inlineimm:
1115; VI:       ; %bb.0:
1116; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1117; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1118; VI-NEXT:    s_waitcnt lgkmcnt(0)
1119; VI-NEXT:    v_mov_b32_e32 v1, s3
1120; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1121; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1122; VI-NEXT:    flat_load_dword v3, v[0:1]
1123; VI-NEXT:    v_mov_b32_e32 v1, s1
1124; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1125; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1126; VI-NEXT:    s_waitcnt vmcnt(0)
1127; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
1128; VI-NEXT:    v_or_b32_e32 v2, 53, v2
1129; VI-NEXT:    flat_store_dword v[0:1], v2
1130; VI-NEXT:    s_endpgm
1131;
1132; CI-LABEL: v_insertelement_v2f16_0_inlineimm:
1133; CI:       ; %bb.0:
1134; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1135; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1136; CI-NEXT:    s_waitcnt lgkmcnt(0)
1137; CI-NEXT:    v_mov_b32_e32 v1, s3
1138; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1139; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1140; CI-NEXT:    flat_load_dword v3, v[0:1]
1141; CI-NEXT:    v_mov_b32_e32 v1, s1
1142; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1143; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1144; CI-NEXT:    s_waitcnt vmcnt(0)
1145; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
1146; CI-NEXT:    v_or_b32_e32 v2, 53, v2
1147; CI-NEXT:    flat_store_dword v[0:1], v2
1148; CI-NEXT:    s_endpgm
1149;
1150; GFX11-LABEL: v_insertelement_v2f16_0_inlineimm:
1151; GFX11:       ; %bb.0:
1152; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
1153; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1154; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1155; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1156; GFX11-NEXT:    s_waitcnt vmcnt(0)
1157; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1158; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1159; GFX11-NEXT:    v_lshl_or_b32 v1, v1, 16, 53
1160; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1161; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1162; GFX11-NEXT:    s_endpgm
; Per-work-item addressing: the work-item id selects the vector in %in/%out.
1163  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1164  %tid.ext = sext i32 %tid to i64
1165  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1166  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1167  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
; The half constant is given by raw bit pattern (0xH0035 == integer 53).
1168  %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0
1169  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
1170  ret void
1171}
1172
; Inserts the constant half 5.0 into element 1 of a per-work-item <2 x half>.
; Each work-item loads a vector from %in (indexed by workitem.id.x), overwrites
; the high element, and stores the result to the matching slot of %out.
; The constant shows up in the expected output either as 0x4500 shifted left
; by 16 (gfx900, gfx1100) or as the pre-shifted 0x45000000 (fiji, hawaii).
; The assertions are autogenerated; regenerate them instead of hand-editing.
1173define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
1174; GFX9-LABEL: v_insertelement_v2f16_1:
1175; GFX9:       ; %bb.0:
1176; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1177; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1178; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1179; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1180; GFX9-NEXT:    s_movk_i32 s2, 0x4500
1181; GFX9-NEXT:    s_waitcnt vmcnt(0)
1182; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1183; GFX9-NEXT:    v_lshl_or_b32 v1, s2, 16, v1
1184; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1185; GFX9-NEXT:    s_endpgm
1186;
1187; VI-LABEL: v_insertelement_v2f16_1:
1188; VI:       ; %bb.0:
1189; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1190; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1191; VI-NEXT:    s_waitcnt lgkmcnt(0)
1192; VI-NEXT:    v_mov_b32_e32 v1, s3
1193; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1194; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1195; VI-NEXT:    flat_load_dword v3, v[0:1]
1196; VI-NEXT:    v_mov_b32_e32 v1, s1
1197; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1198; VI-NEXT:    v_mov_b32_e32 v2, 0x45000000
1199; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1200; VI-NEXT:    s_waitcnt vmcnt(0)
1201; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1202; VI-NEXT:    flat_store_dword v[0:1], v2
1203; VI-NEXT:    s_endpgm
1204;
1205; CI-LABEL: v_insertelement_v2f16_1:
1206; CI:       ; %bb.0:
1207; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1208; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1209; CI-NEXT:    s_waitcnt lgkmcnt(0)
1210; CI-NEXT:    v_mov_b32_e32 v1, s3
1211; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1212; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1213; CI-NEXT:    flat_load_dword v3, v[0:1]
1214; CI-NEXT:    v_mov_b32_e32 v1, s1
1215; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1216; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1217; CI-NEXT:    s_waitcnt vmcnt(0)
1218; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
1219; CI-NEXT:    v_or_b32_e32 v2, 0x45000000, v2
1220; CI-NEXT:    flat_store_dword v[0:1], v2
1221; CI-NEXT:    s_endpgm
1222;
1223; GFX11-LABEL: v_insertelement_v2f16_1:
1224; GFX11:       ; %bb.0:
1225; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
1226; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1227; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1228; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1229; GFX11-NEXT:    s_waitcnt vmcnt(0)
1230; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1231; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1232; GFX11-NEXT:    v_lshl_or_b32 v1, 0x4500, 16, v1
1233; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1234; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1235; GFX11-NEXT:    s_endpgm
; Per-work-item addressing: the work-item id selects the vector in %in/%out.
1236  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1237  %tid.ext = sext i32 %tid to i64
1238  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1239  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1240  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
; Replace the high element with 5.0; element 0 is left as loaded.
1241  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
1242  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
1243  ret void
1244}
1245
; Inserts the half constant with bit pattern 0xH0023 into element 1 of a
; per-work-item <2 x half>. Each work-item loads a vector from %in (indexed by
; workitem.id.x), overwrites the high element, and stores the result to the
; matching slot of %out.
; In the expected output the constant appears either as the integer operand 35
; (0x23) shifted left by 16 (gfx900, gfx1100) or as the pre-shifted 0x230000
; (fiji, hawaii).
; The assertions are autogenerated; regenerate them instead of hand-editing.
1246define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
1247; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm:
1248; GFX9:       ; %bb.0:
1249; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1250; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1251; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1252; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1253; GFX9-NEXT:    s_waitcnt vmcnt(0)
1254; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1255; GFX9-NEXT:    v_lshl_or_b32 v1, 35, 16, v1
1256; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1257; GFX9-NEXT:    s_endpgm
1258;
1259; VI-LABEL: v_insertelement_v2f16_1_inlineimm:
1260; VI:       ; %bb.0:
1261; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1262; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1263; VI-NEXT:    s_waitcnt lgkmcnt(0)
1264; VI-NEXT:    v_mov_b32_e32 v1, s3
1265; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1266; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1267; VI-NEXT:    flat_load_dword v3, v[0:1]
1268; VI-NEXT:    v_mov_b32_e32 v1, s1
1269; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1270; VI-NEXT:    v_mov_b32_e32 v2, 0x230000
1271; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1272; VI-NEXT:    s_waitcnt vmcnt(0)
1273; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1274; VI-NEXT:    flat_store_dword v[0:1], v2
1275; VI-NEXT:    s_endpgm
1276;
1277; CI-LABEL: v_insertelement_v2f16_1_inlineimm:
1278; CI:       ; %bb.0:
1279; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1280; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1281; CI-NEXT:    s_waitcnt lgkmcnt(0)
1282; CI-NEXT:    v_mov_b32_e32 v1, s3
1283; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1284; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1285; CI-NEXT:    flat_load_dword v3, v[0:1]
1286; CI-NEXT:    v_mov_b32_e32 v1, s1
1287; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1288; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1289; CI-NEXT:    s_waitcnt vmcnt(0)
1290; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
1291; CI-NEXT:    v_or_b32_e32 v2, 0x230000, v2
1292; CI-NEXT:    flat_store_dword v[0:1], v2
1293; CI-NEXT:    s_endpgm
1294;
1295; GFX11-LABEL: v_insertelement_v2f16_1_inlineimm:
1296; GFX11:       ; %bb.0:
1297; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
1298; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1299; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1300; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1301; GFX11-NEXT:    s_waitcnt vmcnt(0)
1302; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1303; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1304; GFX11-NEXT:    v_lshl_or_b32 v1, 35, 16, v1
1305; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1306; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1307; GFX11-NEXT:    s_endpgm
; Per-work-item addressing: the work-item id selects the vector in %in/%out.
1308  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1309  %tid.ext = sext i32 %tid to i64
1310  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1311  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1312  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
; The half constant is given by raw bit pattern (0xH0023 == integer 35).
1313  %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1
1314  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
1315  ret void
1316}
1317
1318; FIXME: Enable for others when argument load not split
; Inserts the constant i16 999 into a <2 x i16> at an index that is only known
; at run time. Both the vector (%vec.ptr) and the index (%idx.ptr) are loaded
; through addrspace(4) pointers, with no per-work-item addressing, and the
; result is stored to %out.
; The expected output builds a 16-bit mask shifted left by idx * 16
; (0xffff << (idx << 4)), clears that lane of the loaded dword, and ORs in the
; matching lane of 0x3e703e7 (999 replicated in both halves), all with scalar
; instructions.
; The assertions are autogenerated; regenerate them instead of hand-editing.
1319define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 {
1320; GFX9-LABEL: s_insertelement_v2i16_dynamic:
1321; GFX9:       ; %bb.0:
1322; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1323; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1324; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1325; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1326; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
1327; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
1328; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1329; GFX9-NEXT:    s_lshl_b32 s2, s4, 4
1330; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s2
1331; GFX9-NEXT:    s_andn2_b32 s3, s5, s2
1332; GFX9-NEXT:    s_and_b32 s2, s2, 0x3e703e7
1333; GFX9-NEXT:    s_or_b32 s2, s2, s3
1334; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1335; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1336; GFX9-NEXT:    s_endpgm
1337;
1338; VI-LABEL: s_insertelement_v2i16_dynamic:
1339; VI:       ; %bb.0:
1340; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1341; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1342; VI-NEXT:    s_waitcnt lgkmcnt(0)
1343; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
1344; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1345; VI-NEXT:    v_mov_b32_e32 v0, s0
1346; VI-NEXT:    v_mov_b32_e32 v1, s1
1347; VI-NEXT:    s_waitcnt lgkmcnt(0)
1348; VI-NEXT:    s_lshl_b32 s0, s4, 4
1349; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1350; VI-NEXT:    s_andn2_b32 s1, s2, s0
1351; VI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
1352; VI-NEXT:    s_or_b32 s0, s0, s1
1353; VI-NEXT:    v_mov_b32_e32 v2, s0
1354; VI-NEXT:    flat_store_dword v[0:1], v2
1355; VI-NEXT:    s_endpgm
1356;
1357; CI-LABEL: s_insertelement_v2i16_dynamic:
1358; CI:       ; %bb.0:
1359; CI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4
1360; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1361; CI-NEXT:    s_waitcnt lgkmcnt(0)
1362; CI-NEXT:    s_load_dword s4, s[6:7], 0x0
1363; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
1364; CI-NEXT:    v_mov_b32_e32 v0, s0
1365; CI-NEXT:    v_mov_b32_e32 v1, s1
1366; CI-NEXT:    s_waitcnt lgkmcnt(0)
1367; CI-NEXT:    s_lshl_b32 s0, s4, 4
1368; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1369; CI-NEXT:    s_andn2_b32 s1, s2, s0
1370; CI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
1371; CI-NEXT:    s_or_b32 s0, s0, s1
1372; CI-NEXT:    v_mov_b32_e32 v2, s0
1373; CI-NEXT:    flat_store_dword v[0:1], v2
1374; CI-NEXT:    s_endpgm
1375;
1376; GFX11-LABEL: s_insertelement_v2i16_dynamic:
1377; GFX11:       ; %bb.0:
1378; GFX11-NEXT:    s_clause 0x1
1379; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x10
1380; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
1381; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1382; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0
1383; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
1384; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1385; GFX11-NEXT:    s_lshl_b32 s3, s4, 4
1386; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1387; GFX11-NEXT:    s_lshl_b32 s3, 0xffff, s3
1388; GFX11-NEXT:    s_and_not1_b32 s2, s2, s3
1389; GFX11-NEXT:    s_and_b32 s3, s3, 0x3e703e7
1390; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1391; GFX11-NEXT:    s_or_b32 s2, s3, s2
1392; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
1393; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1394; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1395; GFX11-NEXT:    s_endpgm
; The index load is volatile and comes from memory, so the insert position is
; not a compile-time constant.
1396  %idx = load volatile i32, i32 addrspace(4)* %idx.ptr
1397  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
1398  %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1399  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
1400  ret void
1401}
1402
; Inserts the constant i16 999 into a per-work-item <2 x i16> at a run-time
; index supplied as the i32 kernel argument %idx. Each work-item loads a vector
; from %in (indexed by workitem.id.x) and stores the result to the matching
; slot of %out.
; The expected output computes the lane mask 0xffff << (idx << 4) with scalar
; instructions and then uses v_bfi_b32 to select between 0x3e703e7 (999
; replicated in both halves) and the loaded dword.
; The assertions are autogenerated; regenerate them instead of hand-editing.
1403define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 {
1404; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1405; GFX9:       ; %bb.0:
1406; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1407; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1408; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1409; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
1410; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1411; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1412; GFX9-NEXT:    s_lshl_b32 s2, s6, 4
1413; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s2
1414; GFX9-NEXT:    s_waitcnt vmcnt(0)
1415; GFX9-NEXT:    v_bfi_b32 v1, s2, v2, v1
1416; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1417; GFX9-NEXT:    s_endpgm
1418;
1419; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1420; VI:       ; %bb.0:
1421; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1422; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1423; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1424; VI-NEXT:    s_waitcnt lgkmcnt(0)
1425; VI-NEXT:    v_mov_b32_e32 v1, s3
1426; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1427; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1428; VI-NEXT:    flat_load_dword v3, v[0:1]
1429; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1430; VI-NEXT:    s_lshl_b32 s0, s4, 4
1431; VI-NEXT:    v_mov_b32_e32 v1, s1
1432; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1433; VI-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
1434; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1435; VI-NEXT:    s_waitcnt vmcnt(0)
1436; VI-NEXT:    v_bfi_b32 v2, s0, v2, v3
1437; VI-NEXT:    flat_store_dword v[0:1], v2
1438; VI-NEXT:    s_endpgm
1439;
1440; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1441; CI:       ; %bb.0:
1442; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1443; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1444; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1445; CI-NEXT:    s_waitcnt lgkmcnt(0)
1446; CI-NEXT:    v_mov_b32_e32 v1, s3
1447; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1448; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1449; CI-NEXT:    flat_load_dword v3, v[0:1]
1450; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1451; CI-NEXT:    s_lshl_b32 s0, s4, 4
1452; CI-NEXT:    v_mov_b32_e32 v1, s1
1453; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1454; CI-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
1455; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1456; CI-NEXT:    s_waitcnt vmcnt(0)
1457; CI-NEXT:    v_bfi_b32 v2, s0, v2, v3
1458; CI-NEXT:    flat_store_dword v[0:1], v2
1459; CI-NEXT:    s_endpgm
1460;
1461; GFX11-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1462; GFX11:       ; %bb.0:
1463; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
1464; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1465; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x10
1466; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1467; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
1468; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
1469; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1470; GFX11-NEXT:    s_lshl_b32 s0, 0xffff, s0
1471; GFX11-NEXT:    s_waitcnt vmcnt(0)
1472; GFX11-NEXT:    v_bfi_b32 v1, s0, 0x3e703e7, v1
1473; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
1474; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1475; GFX11-NEXT:    s_endpgm
; Per-work-item addressing: the work-item id selects the vector in %in/%out.
1476  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1477  %tid.ext = sext i32 %tid to i64
1478  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1479  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1480  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
; The insert position is the kernel argument %idx, the same for every work-item.
1481  %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1482  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
1483  ret void
1484}
1485
; Dynamic-index insert into <2 x half>, with the index loaded per work-item.
; Each work-item loads an i32 index from %idx.ptr and a <2 x half> vector from
; %in (both addressed by the work-item id), inserts the constant half 0xH1234
; at that index, and stores the result to the matching slot of %out.
; The expected code on all four targets shifts the loaded index left by 4,
; shifts 0xffff left by that amount to form a mask, and merges the constant
; 0x12341234 into the loaded dword with a single v_bfi_b32.
1486define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
1487; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1488; GFX9:       ; %bb.0:
1489; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1490; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1491; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1492; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1493; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
1494; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
1495; GFX9-NEXT:    s_mov_b32 s2, 0xffff
1496; GFX9-NEXT:    s_waitcnt vmcnt(1)
1497; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1498; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
1499; GFX9-NEXT:    s_mov_b32 s2, 0x12341234
1500; GFX9-NEXT:    s_waitcnt vmcnt(0)
1501; GFX9-NEXT:    v_bfi_b32 v1, v1, s2, v2
1502; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1503; GFX9-NEXT:    s_endpgm
1504;
1505; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1506; VI:       ; %bb.0:
1507; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1508; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
1509; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1510; VI-NEXT:    s_waitcnt lgkmcnt(0)
1511; VI-NEXT:    v_mov_b32_e32 v3, s3
1512; VI-NEXT:    v_mov_b32_e32 v1, s5
1513; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1514; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1515; VI-NEXT:    flat_load_dword v4, v[0:1]
1516; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1517; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1518; VI-NEXT:    flat_load_dword v3, v[0:1]
1519; VI-NEXT:    s_mov_b32 s2, 0xffff
1520; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1521; VI-NEXT:    v_mov_b32_e32 v1, s1
1522; VI-NEXT:    s_mov_b32 s0, 0x12341234
1523; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1524; VI-NEXT:    s_waitcnt vmcnt(1)
1525; VI-NEXT:    v_lshlrev_b32_e32 v2, 4, v4
1526; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, s2
1527; VI-NEXT:    s_waitcnt vmcnt(0)
1528; VI-NEXT:    v_bfi_b32 v2, v2, s0, v3
1529; VI-NEXT:    flat_store_dword v[0:1], v2
1530; VI-NEXT:    s_endpgm
1531;
1532; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1533; CI:       ; %bb.0:
1534; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1535; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
1536; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1537; CI-NEXT:    s_waitcnt lgkmcnt(0)
1538; CI-NEXT:    v_mov_b32_e32 v3, s3
1539; CI-NEXT:    v_mov_b32_e32 v1, s5
1540; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
1541; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1542; CI-NEXT:    flat_load_dword v4, v[0:1]
1543; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1544; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1545; CI-NEXT:    flat_load_dword v3, v[0:1]
1546; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1547; CI-NEXT:    v_mov_b32_e32 v1, s1
1548; CI-NEXT:    s_mov_b32 s0, 0x12341234
1549; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1550; CI-NEXT:    s_waitcnt vmcnt(1)
1551; CI-NEXT:    v_lshlrev_b32_e32 v2, 4, v4
1552; CI-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
1553; CI-NEXT:    s_waitcnt vmcnt(0)
1554; CI-NEXT:    v_bfi_b32 v2, v2, s0, v3
1555; CI-NEXT:    flat_store_dword v[0:1], v2
1556; CI-NEXT:    s_endpgm
1557;
1558; GFX11-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1559; GFX11:       ; %bb.0:
1560; GFX11-NEXT:    s_clause 0x1
1561; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x10
1562; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
1563; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1564; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1565; GFX11-NEXT:    s_clause 0x1
1566; GFX11-NEXT:    global_load_b32 v1, v0, s[4:5]
1567; GFX11-NEXT:    global_load_b32 v2, v0, s[2:3]
1568; GFX11-NEXT:    s_waitcnt vmcnt(1)
1569; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1570; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1571; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v1, 0xffff
1572; GFX11-NEXT:    s_waitcnt vmcnt(0)
1573; GFX11-NEXT:    v_bfi_b32 v1, v1, 0x12341234, v2
1574; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1575; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1576; GFX11-NEXT:    s_endpgm
  ; All three pointers are indexed by the sign-extended work-item id.
1577  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1578  %tid.ext = sext i32 %tid to i64
1579  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1580  %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
1581  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  ; The insert index is read from memory, so it is not a compile-time constant.
1582  %idx = load i32, i32 addrspace(1)* %idx.gep
1583  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
1584  %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx
1585  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
1586  ret void
1587}
1588
; Constant-index insert into element 0 of a <4 x half>.
; The low 16 bits of the i32 kernel argument %val are reinterpreted as a half
; and inserted into the vector loaded from %in; the result is stored to %out.
; An unnamed [8 x i32] argument sits between the pointers and %val; the
; gfx900, fiji and gfx1100 checks load %val from kernarg offset 0x30.
; Every target is expected to use one v_bfi_b32 with mask 0xffff on the first
; loaded dword and to leave the second dword untouched.
1589define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
1590; GFX9-LABEL: v_insertelement_v4f16_0:
1591; GFX9:       ; %bb.0:
1592; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1593; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
1594; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1595; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
1596; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1597; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1598; GFX9-NEXT:    s_waitcnt vmcnt(0)
1599; GFX9-NEXT:    v_bfi_b32 v0, v3, s6, v0
1600; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1601; GFX9-NEXT:    s_endpgm
1602;
1603; VI-LABEL: v_insertelement_v4f16_0:
1604; VI:       ; %bb.0:
1605; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1606; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
1607; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1608; VI-NEXT:    s_waitcnt lgkmcnt(0)
1609; VI-NEXT:    v_mov_b32_e32 v1, s3
1610; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1611; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1612; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1613; VI-NEXT:    v_mov_b32_e32 v3, s1
1614; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1615; VI-NEXT:    s_mov_b32 s0, 0xffff
1616; VI-NEXT:    v_mov_b32_e32 v4, s4
1617; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1618; VI-NEXT:    s_waitcnt vmcnt(0)
1619; VI-NEXT:    v_bfi_b32 v0, s0, v4, v0
1620; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1621; VI-NEXT:    s_endpgm
1622;
1623; CI-LABEL: v_insertelement_v4f16_0:
1624; CI:       ; %bb.0:
1625; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1626; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
1627; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1628; CI-NEXT:    s_waitcnt lgkmcnt(0)
1629; CI-NEXT:    v_mov_b32_e32 v1, s3
1630; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1631; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1632; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1633; CI-NEXT:    v_mov_b32_e32 v3, s1
1634; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1635; CI-NEXT:    s_mov_b32 s0, 0xffff
1636; CI-NEXT:    v_mov_b32_e32 v4, s4
1637; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1638; CI-NEXT:    s_waitcnt vmcnt(0)
1639; CI-NEXT:    v_bfi_b32 v0, s0, v4, v0
1640; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1641; CI-NEXT:    s_endpgm
1642;
1643; GFX11-LABEL: v_insertelement_v4f16_0:
1644; GFX11:       ; %bb.0:
1645; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
1646; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1647; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x30
1648; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1649; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[6:7]
1650; GFX11-NEXT:    s_waitcnt vmcnt(0)
1651; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, s0, v0
1652; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
1653; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1654; GFX11-NEXT:    s_endpgm
  ; Input and output slots are selected by the sign-extended work-item id.
1655  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1656  %tid.ext = sext i32 %tid to i64
1657  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1658  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1659  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Reinterpret the low 16 bits of %val as a half without converting the value.
1660  %val.trunc = trunc i32 %val to i16
1661  %val.cvt = bitcast i16 %val.trunc to half
1662  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0
1663  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1664  ret void
1665}
1666
; Constant-index insert into element 1 of a <4 x half>.
; The low 16 bits of the i32 kernel argument %val are reinterpreted as a half
; and inserted into the vector loaded from %in; the result is stored to %out.
; %val directly follows the two pointers here (no padding argument); the
; gfx900, fiji and gfx1100 checks load it from kernarg offset 0x10.
; The expected code keeps the low 16 bits of the first loaded dword and ORs in
; %val shifted left by 16: and + v_lshl_or_b32 on gfx900/gfx1100, an SDWA
; v_or_b32 on fiji, and and + or on hawaii. The second dword is untouched.
1667define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
1668; GFX9-LABEL: v_insertelement_v4f16_1:
1669; GFX9:       ; %bb.0:
1670; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1671; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1672; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1673; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1674; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1675; GFX9-NEXT:    s_waitcnt vmcnt(0)
1676; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1677; GFX9-NEXT:    v_lshl_or_b32 v0, s6, 16, v0
1678; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1679; GFX9-NEXT:    s_endpgm
1680;
1681; VI-LABEL: v_insertelement_v4f16_1:
1682; VI:       ; %bb.0:
1683; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1684; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1685; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1686; VI-NEXT:    s_waitcnt lgkmcnt(0)
1687; VI-NEXT:    v_mov_b32_e32 v1, s3
1688; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1689; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1690; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1691; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1692; VI-NEXT:    s_lshl_b32 s0, s4, 16
1693; VI-NEXT:    v_mov_b32_e32 v3, s1
1694; VI-NEXT:    v_mov_b32_e32 v4, s0
1695; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1696; VI-NEXT:    s_waitcnt vmcnt(0)
1697; VI-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1698; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1699; VI-NEXT:    s_endpgm
1700;
1701; CI-LABEL: v_insertelement_v4f16_1:
1702; CI:       ; %bb.0:
1703; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1704; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1705; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1706; CI-NEXT:    s_waitcnt lgkmcnt(0)
1707; CI-NEXT:    v_mov_b32_e32 v1, s3
1708; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1709; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1710; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1711; CI-NEXT:    v_mov_b32_e32 v3, s1
1712; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1713; CI-NEXT:    s_lshl_b32 s0, s4, 16
1714; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1715; CI-NEXT:    s_waitcnt vmcnt(0)
1716; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1717; CI-NEXT:    v_or_b32_e32 v0, s0, v0
1718; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1719; CI-NEXT:    s_endpgm
1720;
1721; GFX11-LABEL: v_insertelement_v4f16_1:
1722; GFX11:       ; %bb.0:
1723; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
1724; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1725; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x10
1726; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1727; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[6:7]
1728; GFX11-NEXT:    s_waitcnt vmcnt(0)
1729; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1730; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1731; GFX11-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
1732; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
1733; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1734; GFX11-NEXT:    s_endpgm
  ; Input and output slots are selected by the sign-extended work-item id.
1735  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1736  %tid.ext = sext i32 %tid to i64
1737  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1738  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1739  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Reinterpret the low 16 bits of %val as a half without converting the value.
1740  %val.trunc = trunc i32 %val to i16
1741  %val.cvt = bitcast i16 %val.trunc to half
1742  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1
1743  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1744  ret void
1745}
1746
; Constant-index insert into element 2 of a <4 x half>.
; The low 16 bits of the i32 kernel argument %val are reinterpreted as a half
; and inserted into the vector loaded from %in; the result is stored to %out.
; An unnamed [8 x i32] argument sits between the pointers and %val; the
; gfx900, fiji and gfx1100 checks load %val from kernarg offset 0x30.
; Every target is expected to use one v_bfi_b32 with mask 0xffff on the second
; loaded dword and to leave the first dword untouched.
1747define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
1748; GFX9-LABEL: v_insertelement_v4f16_2:
1749; GFX9:       ; %bb.0:
1750; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1751; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
1752; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1753; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
1754; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1755; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1756; GFX9-NEXT:    s_waitcnt vmcnt(0)
1757; GFX9-NEXT:    v_bfi_b32 v1, v3, s6, v1
1758; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1759; GFX9-NEXT:    s_endpgm
1760;
1761; VI-LABEL: v_insertelement_v4f16_2:
1762; VI:       ; %bb.0:
1763; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1764; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
1765; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1766; VI-NEXT:    s_waitcnt lgkmcnt(0)
1767; VI-NEXT:    v_mov_b32_e32 v1, s3
1768; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1769; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1770; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1771; VI-NEXT:    v_mov_b32_e32 v3, s1
1772; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1773; VI-NEXT:    s_mov_b32 s0, 0xffff
1774; VI-NEXT:    v_mov_b32_e32 v4, s4
1775; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1776; VI-NEXT:    s_waitcnt vmcnt(0)
1777; VI-NEXT:    v_bfi_b32 v1, s0, v4, v1
1778; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1779; VI-NEXT:    s_endpgm
1780;
1781; CI-LABEL: v_insertelement_v4f16_2:
1782; CI:       ; %bb.0:
1783; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1784; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
1785; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1786; CI-NEXT:    s_waitcnt lgkmcnt(0)
1787; CI-NEXT:    v_mov_b32_e32 v1, s3
1788; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1789; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1790; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1791; CI-NEXT:    v_mov_b32_e32 v3, s1
1792; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1793; CI-NEXT:    s_mov_b32 s0, 0xffff
1794; CI-NEXT:    v_mov_b32_e32 v4, s4
1795; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1796; CI-NEXT:    s_waitcnt vmcnt(0)
1797; CI-NEXT:    v_bfi_b32 v1, s0, v4, v1
1798; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1799; CI-NEXT:    s_endpgm
1800;
1801; GFX11-LABEL: v_insertelement_v4f16_2:
1802; GFX11:       ; %bb.0:
1803; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
1804; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1805; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x30
1806; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1807; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[6:7]
1808; GFX11-NEXT:    s_waitcnt vmcnt(0)
1809; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, s0, v1
1810; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
1811; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1812; GFX11-NEXT:    s_endpgm
  ; Input and output slots are selected by the sign-extended work-item id.
1813  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1814  %tid.ext = sext i32 %tid to i64
1815  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1816  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1817  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Reinterpret the low 16 bits of %val as a half without converting the value.
1818  %val.trunc = trunc i32 %val to i16
1819  %val.cvt = bitcast i16 %val.trunc to half
1820  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2
1821  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1822  ret void
1823}
1824
; Constant-index insert into element 3 of a <4 x half>.
; The low 16 bits of the i32 kernel argument %val are reinterpreted as a half
; and inserted into the vector loaded from %in; the result is stored to %out.
; %val directly follows the two pointers here (no padding argument); the
; gfx900, fiji and gfx1100 checks load it from kernarg offset 0x10.
; The expected code keeps the low 16 bits of the second loaded dword and ORs
; in %val shifted left by 16: and + v_lshl_or_b32 on gfx900/gfx1100, an SDWA
; v_or_b32 on fiji, and and + or on hawaii. The first dword is untouched.
1825define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
1826; GFX9-LABEL: v_insertelement_v4f16_3:
1827; GFX9:       ; %bb.0:
1828; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1829; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1830; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1831; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1832; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1833; GFX9-NEXT:    s_waitcnt vmcnt(0)
1834; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1835; GFX9-NEXT:    v_lshl_or_b32 v1, s6, 16, v1
1836; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1837; GFX9-NEXT:    s_endpgm
1838;
1839; VI-LABEL: v_insertelement_v4f16_3:
1840; VI:       ; %bb.0:
1841; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1842; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1843; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1844; VI-NEXT:    s_waitcnt lgkmcnt(0)
1845; VI-NEXT:    v_mov_b32_e32 v1, s3
1846; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1847; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1848; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1849; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1850; VI-NEXT:    s_lshl_b32 s0, s4, 16
1851; VI-NEXT:    v_mov_b32_e32 v3, s1
1852; VI-NEXT:    v_mov_b32_e32 v4, s0
1853; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1854; VI-NEXT:    s_waitcnt vmcnt(0)
1855; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1856; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1857; VI-NEXT:    s_endpgm
1858;
1859; CI-LABEL: v_insertelement_v4f16_3:
1860; CI:       ; %bb.0:
1861; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1862; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1863; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1864; CI-NEXT:    s_waitcnt lgkmcnt(0)
1865; CI-NEXT:    v_mov_b32_e32 v1, s3
1866; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1867; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1868; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1869; CI-NEXT:    v_mov_b32_e32 v3, s1
1870; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1871; CI-NEXT:    s_lshl_b32 s0, s4, 16
1872; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1873; CI-NEXT:    s_waitcnt vmcnt(0)
1874; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1875; CI-NEXT:    v_or_b32_e32 v1, s0, v1
1876; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1877; CI-NEXT:    s_endpgm
1878;
1879; GFX11-LABEL: v_insertelement_v4f16_3:
1880; GFX11:       ; %bb.0:
1881; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
1882; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1883; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x10
1884; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1885; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[6:7]
1886; GFX11-NEXT:    s_waitcnt vmcnt(0)
1887; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1888; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1889; GFX11-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
1890; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
1891; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1892; GFX11-NEXT:    s_endpgm
  ; Input and output slots are selected by the sign-extended work-item id.
1893  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1894  %tid.ext = sext i32 %tid to i64
1895  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1896  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1897  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Reinterpret the low 16 bits of %val as a half without converting the value.
1898  %val.trunc = trunc i32 %val to i16
1899  %val.cvt = bitcast i16 %val.trunc to half
1900  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
1901  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1902  ret void
1903}
1904
; Constant-index insert into element 2 of a <4 x i16>.
; The i32 kernel argument %val is truncated to i16 and inserted into the
; vector loaded from %in; the result is stored to %out.
; %val directly follows the two pointers here (no padding argument); the
; gfx900, fiji and gfx1100 checks load it from kernarg offset 0x10.
; Every target is expected to use one v_bfi_b32 with mask 0xffff on the second
; loaded dword and to leave the first dword untouched.
1905define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
1906; GFX9-LABEL: v_insertelement_v4i16_2:
1907; GFX9:       ; %bb.0:
1908; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1909; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1910; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1911; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
1912; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1913; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1914; GFX9-NEXT:    s_waitcnt vmcnt(0)
1915; GFX9-NEXT:    v_bfi_b32 v1, v3, s6, v1
1916; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1917; GFX9-NEXT:    s_endpgm
1918;
1919; VI-LABEL: v_insertelement_v4i16_2:
1920; VI:       ; %bb.0:
1921; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1922; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1923; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1924; VI-NEXT:    s_waitcnt lgkmcnt(0)
1925; VI-NEXT:    v_mov_b32_e32 v1, s3
1926; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1927; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1928; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1929; VI-NEXT:    v_mov_b32_e32 v3, s1
1930; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1931; VI-NEXT:    s_mov_b32 s0, 0xffff
1932; VI-NEXT:    v_mov_b32_e32 v4, s4
1933; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1934; VI-NEXT:    s_waitcnt vmcnt(0)
1935; VI-NEXT:    v_bfi_b32 v1, s0, v4, v1
1936; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1937; VI-NEXT:    s_endpgm
1938;
1939; CI-LABEL: v_insertelement_v4i16_2:
1940; CI:       ; %bb.0:
1941; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1942; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1943; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1944; CI-NEXT:    s_waitcnt lgkmcnt(0)
1945; CI-NEXT:    v_mov_b32_e32 v1, s3
1946; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1947; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1948; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1949; CI-NEXT:    v_mov_b32_e32 v3, s1
1950; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1951; CI-NEXT:    s_mov_b32 s0, 0xffff
1952; CI-NEXT:    v_mov_b32_e32 v4, s4
1953; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1954; CI-NEXT:    s_waitcnt vmcnt(0)
1955; CI-NEXT:    v_bfi_b32 v1, s0, v4, v1
1956; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1957; CI-NEXT:    s_endpgm
1958;
1959; GFX11-LABEL: v_insertelement_v4i16_2:
1960; GFX11:       ; %bb.0:
1961; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
1962; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1963; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x10
1964; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1965; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[6:7]
1966; GFX11-NEXT:    s_waitcnt vmcnt(0)
1967; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, s0, v1
1968; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
1969; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1970; GFX11-NEXT:    s_endpgm
  ; Input and output slots are selected by the sign-extended work-item id.
1971  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1972  %tid.ext = sext i32 %tid to i64
1973  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
1974  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
1975  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
1976  %val.trunc = trunc i32 %val to i16
  ; NOTE(review): this bitcast is i16 -> i16, i.e. an identity cast; it mirrors
  ; the i16 -> half cast used by the v4f16 tests in this file.
1977  %val.cvt = bitcast i16 %val.trunc to i16
1978  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
1979  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
1980  ret void
1981}
1982
1983; FIXME: Better code on CI?
; Dynamic-index insert into <4 x i16>, with the index produced by a load.
; The index is read with a volatile load through an undef pointer, and the
; checks show it arriving in a vector register. The i32 kernel argument %val
; is truncated to i16 and inserted at that index into the vector loaded from
; %in; the result is stored to %out.
; The expected code shifts the loaded index left by 4, shifts the 64-bit
; constant 0xffff left by that amount to form a mask, replicates the low 16
; bits of %val into both halves of a scalar (s_pack_ll_b32_b16 on
; gfx900/gfx1100, and/shl/or on fiji/hawaii), and applies one v_bfi_b32 per
; dword of the vector.
1984define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
1985; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1986; GFX9:       ; %bb.0:
1987; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1988; GFX9-NEXT:    global_load_dword v2, v[0:1], off glc
1989; GFX9-NEXT:    s_waitcnt vmcnt(0)
1990; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1991; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
1992; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1993; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
1994; GFX9-NEXT:    s_mov_b64 s[2:3], 0xffff
1995; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
1996; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, s[2:3]
1997; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s4, s4
1998; GFX9-NEXT:    s_waitcnt vmcnt(0)
1999; GFX9-NEXT:    v_bfi_b32 v1, v3, s2, v1
2000; GFX9-NEXT:    v_bfi_b32 v0, v2, s2, v0
2001; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
2002; GFX9-NEXT:    s_endpgm
2003;
2004; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
2005; VI:       ; %bb.0:
2006; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2007; VI-NEXT:    flat_load_dword v4, v[0:1] glc
2008; VI-NEXT:    s_waitcnt vmcnt(0)
2009; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2010; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
2011; VI-NEXT:    s_waitcnt lgkmcnt(0)
2012; VI-NEXT:    v_mov_b32_e32 v1, s3
2013; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2014; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2015; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2016; VI-NEXT:    s_mov_b64 s[2:3], 0xffff
2017; VI-NEXT:    v_mov_b32_e32 v3, s1
2018; VI-NEXT:    s_lshl_b32 s1, s4, 16
2019; VI-NEXT:    s_and_b32 s4, s4, 0xffff
2020; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
2021; VI-NEXT:    s_or_b32 s0, s4, s1
2022; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2023; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
2024; VI-NEXT:    v_lshlrev_b64 v[4:5], v4, s[2:3]
2025; VI-NEXT:    s_waitcnt vmcnt(0)
2026; VI-NEXT:    v_bfi_b32 v1, v5, s0, v1
2027; VI-NEXT:    v_bfi_b32 v0, v4, s0, v0
2028; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2029; VI-NEXT:    s_endpgm
2030;
2031; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
2032; CI:       ; %bb.0:
2033; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2034; CI-NEXT:    flat_load_dword v4, v[0:1] glc
2035; CI-NEXT:    s_waitcnt vmcnt(0)
2036; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2037; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
2038; CI-NEXT:    s_waitcnt lgkmcnt(0)
2039; CI-NEXT:    v_mov_b32_e32 v1, s3
2040; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
2041; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2042; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2043; CI-NEXT:    s_mov_b64 s[2:3], 0xffff
2044; CI-NEXT:    v_mov_b32_e32 v3, s1
2045; CI-NEXT:    s_lshl_b32 s1, s4, 16
2046; CI-NEXT:    s_and_b32 s4, s4, 0xffff
2047; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
2048; CI-NEXT:    s_or_b32 s0, s4, s1
2049; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2050; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
2051; CI-NEXT:    v_lshl_b64 v[4:5], s[2:3], v4
2052; CI-NEXT:    s_waitcnt vmcnt(0)
2053; CI-NEXT:    v_bfi_b32 v1, v5, s0, v1
2054; CI-NEXT:    v_bfi_b32 v0, v4, s0, v0
2055; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2056; CI-NEXT:    s_endpgm
2057;
2058; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr:
2059; GFX11:       ; %bb.0:
2060; GFX11-NEXT:    s_clause 0x1
2061; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
2062; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x10
2063; GFX11-NEXT:    global_load_b32 v2, v[0:1], off glc dlc
2064; GFX11-NEXT:    s_waitcnt vmcnt(0)
2065; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
2066; GFX11-NEXT:    s_mov_b64 s[0:1], 0xffff
2067; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2068; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[6:7]
2069; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
2070; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2071; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, s[0:1]
2072; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s2, s2
2073; GFX11-NEXT:    s_waitcnt vmcnt(0)
2074; GFX11-NEXT:    v_bfi_b32 v1, v3, s0, v1
2075; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2076; GFX11-NEXT:    v_bfi_b32 v0, v2, s0, v0
2077; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
2078; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2079; GFX11-NEXT:    s_endpgm
  ; Input and output slots are selected by the sign-extended work-item id.
2080  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2081  %tid.ext = sext i32 %tid to i64
2082  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
2083  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  ; The index is a volatile load through an undef pointer; the checks above
  ; show it as a glc load from v[0:1], so changing this line changes the
  ; expected output.
2084  %idx.val = load volatile i32, i32 addrspace(1)* undef
2085  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
2086  %val.trunc = trunc i32 %val to i16
  ; NOTE(review): this bitcast is i16 -> i16, i.e. an identity cast.
2087  %val.cvt = bitcast i16 %val.trunc to i16
2088  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
2089  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
2090  ret void
2091}
2092
; Dynamic-index insert into <4 x half>, with the index passed as a kernel
; argument.
; The low 16 bits of the i32 kernel argument %val are reinterpreted as a half
; and inserted at index %idxval into the vector loaded from %in; the result is
; stored to %out. The checks load %val and %idxval together as one two-dword
; scalar load that follows the two pointers.
; The expected code computes the mask entirely with scalar instructions
; (s_lshl_b32 of the index by 4, then s_lshl_b64 of 0xffff by that amount),
; replicates the low 16 bits of %val into both halves of a scalar, and applies
; one v_bfi_b32 per dword of the vector.
2093define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {
2094; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2095; GFX9:       ; %bb.0:
2096; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2097; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
2098; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2099; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2100; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
2101; GFX9-NEXT:    s_mov_b64 s[2:3], 0xffff
2102; GFX9-NEXT:    s_lshl_b32 s4, s7, 4
2103; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s6, s6
2104; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
2105; GFX9-NEXT:    v_mov_b32_e32 v3, s5
2106; GFX9-NEXT:    v_mov_b32_e32 v4, s5
2107; GFX9-NEXT:    s_waitcnt vmcnt(0)
2108; GFX9-NEXT:    v_bfi_b32 v1, s3, v3, v1
2109; GFX9-NEXT:    v_bfi_b32 v0, s2, v4, v0
2110; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
2111; GFX9-NEXT:    s_endpgm
2112;
2113; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2114; VI:       ; %bb.0:
2115; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2116; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
2117; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2118; VI-NEXT:    s_waitcnt lgkmcnt(0)
2119; VI-NEXT:    v_mov_b32_e32 v1, s3
2120; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2121; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2122; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2123; VI-NEXT:    s_mov_b64 s[2:3], 0xffff
2124; VI-NEXT:    v_mov_b32_e32 v3, s1
2125; VI-NEXT:    s_lshl_b32 s1, s5, 4
2126; VI-NEXT:    s_lshl_b32 s5, s4, 16
2127; VI-NEXT:    s_and_b32 s4, s4, 0xffff
2128; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
2129; VI-NEXT:    s_lshl_b64 s[0:1], s[2:3], s1
2130; VI-NEXT:    s_or_b32 s2, s4, s5
2131; VI-NEXT:    v_mov_b32_e32 v4, s2
2132; VI-NEXT:    v_mov_b32_e32 v5, s2
2133; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2134; VI-NEXT:    s_waitcnt vmcnt(0)
2135; VI-NEXT:    v_bfi_b32 v1, s1, v4, v1
2136; VI-NEXT:    v_bfi_b32 v0, s0, v5, v0
2137; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2138; VI-NEXT:    s_endpgm
2139;
2140; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2141; CI:       ; %bb.0:
2142; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2143; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
2144; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2145; CI-NEXT:    s_waitcnt lgkmcnt(0)
2146; CI-NEXT:    v_mov_b32_e32 v1, s3
2147; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
2148; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2149; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2150; CI-NEXT:    s_mov_b64 s[2:3], 0xffff
2151; CI-NEXT:    v_mov_b32_e32 v3, s1
2152; CI-NEXT:    s_and_b32 s6, s4, 0xffff
2153; CI-NEXT:    s_lshl_b32 s1, s5, 4
2154; CI-NEXT:    s_lshl_b32 s4, s4, 16
2155; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
2156; CI-NEXT:    s_lshl_b64 s[0:1], s[2:3], s1
2157; CI-NEXT:    s_or_b32 s2, s6, s4
2158; CI-NEXT:    v_mov_b32_e32 v4, s2
2159; CI-NEXT:    v_mov_b32_e32 v5, s2
2160; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2161; CI-NEXT:    s_waitcnt vmcnt(0)
2162; CI-NEXT:    v_bfi_b32 v1, s1, v4, v1
2163; CI-NEXT:    v_bfi_b32 v0, s0, v5, v0
2164; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2165; CI-NEXT:    s_endpgm
2166;
2167; GFX11-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2168; GFX11:       ; %bb.0:
2169; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
2170; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2171; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x10
2172; GFX11-NEXT:    s_mov_b64 s[2:3], 0xffff
2173; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2174; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[6:7]
2175; GFX11-NEXT:    s_lshl_b32 s1, s1, 4
2176; GFX11-NEXT:    s_pack_ll_b32_b16 s6, s0, s0
2177; GFX11-NEXT:    s_lshl_b64 s[0:1], s[2:3], s1
2178; GFX11-NEXT:    s_waitcnt vmcnt(0)
2179; GFX11-NEXT:    v_bfi_b32 v1, s1, s6, v1
2180; GFX11-NEXT:    v_bfi_b32 v0, s0, s6, v0
2181; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
2182; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2183; GFX11-NEXT:    s_endpgm
  ; Input and output slots are selected by the sign-extended work-item id.
2184  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2185  %tid.ext = sext i32 %tid to i64
2186  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
2187  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
2188  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Reinterpret the low 16 bits of %val as a half without converting the value.
2189  %val.trunc = trunc i32 %val to i16
2190  %val.cvt = bitcast i16 %val.trunc to half
  ; The insert index is the kernel argument %idxval, not a constant.
2191  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
2192  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
2193  ret void
2194}
2195
; Constant-index insert into an <8 x half> vector.
; Each workitem loads the <8 x half> at %in[tid], replaces element 3 with the
; low 16 bits of %val reinterpreted as half, and stores the result to
; %out[tid], where tid is llvm.amdgcn.workitem.id.x.
; In the expected code for every target the new value is merged into the
; upper 16 bits of v1 (the second dword of the loaded vector).
; NOTE(review): the fiji checks also expect `v_bfi_b32 v3, s2, v3, v3`, whose
; two data operands are the same register -- this looks redundant; confirm
; against current llc output before relying on it.
2196define amdgpu_kernel void @v_insertelement_v8f16_3(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val) {
2197; GFX9-LABEL: v_insertelement_v8f16_3:
2198; GFX9:       ; %bb.0:
2199; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2200; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
2201; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2202; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2203; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
2204; GFX9-NEXT:    s_waitcnt vmcnt(0)
2205; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2206; GFX9-NEXT:    v_lshl_or_b32 v1, s6, 16, v1
2207; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2208; GFX9-NEXT:    s_endpgm
2209;
2210; VI-LABEL: v_insertelement_v8f16_3:
2211; VI:       ; %bb.0:
2212; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2213; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
2214; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2215; VI-NEXT:    s_waitcnt lgkmcnt(0)
2216; VI-NEXT:    v_mov_b32_e32 v1, s3
2217; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
2218; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2219; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2220; VI-NEXT:    v_mov_b32_e32 v5, s1
2221; VI-NEXT:    s_lshl_b32 s1, s4, 16
2222; VI-NEXT:    s_mov_b32 s2, 0xffff
2223; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
2224; VI-NEXT:    v_mov_b32_e32 v6, s1
2225; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2226; VI-NEXT:    s_waitcnt vmcnt(0)
2227; VI-NEXT:    v_bfi_b32 v3, s2, v3, v3
2228; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2229; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2230; VI-NEXT:    s_endpgm
2231;
2232; CI-LABEL: v_insertelement_v8f16_3:
2233; CI:       ; %bb.0:
2234; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2235; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
2236; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2237; CI-NEXT:    s_waitcnt lgkmcnt(0)
2238; CI-NEXT:    v_mov_b32_e32 v1, s3
2239; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
2240; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2241; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2242; CI-NEXT:    v_mov_b32_e32 v5, s1
2243; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
2244; CI-NEXT:    s_lshl_b32 s0, s4, 16
2245; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2246; CI-NEXT:    s_waitcnt vmcnt(0)
2247; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2248; CI-NEXT:    v_or_b32_e32 v1, s0, v1
2249; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2250; CI-NEXT:    s_endpgm
2251;
2252; GFX11-LABEL: v_insertelement_v8f16_3:
2253; GFX11:       ; %bb.0:
2254; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
2255; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2256; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x10
2257; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2258; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[6:7]
2259; GFX11-NEXT:    s_waitcnt vmcnt(0)
2260; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2261; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2262; GFX11-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
2263; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
2264; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2265; GFX11-NEXT:    s_endpgm
  ; Per-workitem element addresses: the same index is used for %in and %out.
2266  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2267  %tid.ext = sext i32 %tid to i64
2268  %in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext
2269  %out.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %out, i64 %tid.ext
2270  %vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep
  ; Keep only the low 16 bits of %val and reinterpret them as a half.
2271  %val.trunc = trunc i32 %val to i16
2272  %val.cvt = bitcast i16 %val.trunc to half
  ; The operation under test: insert at the constant index 3.
2273  %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 3
2274  store <8 x half> %vecins, <8 x half> addrspace(1)* %out.gep
2275  ret void
2276}
2277
; Constant-index insert into an <8 x i16> vector.
; Each workitem loads the <8 x i16> at %in[tid], replaces element 6 with the
; low 16 bits of %val, and stores the result to %out[tid].
; In the expected code for every target the new value is merged into v3 (the
; fourth dword of the loaded vector) with v_bfi_b32 under a 0xffff mask.
; NOTE(review): the fiji checks also expect `v_bfi_b32 v1, s2, v1, v1`, whose
; two data operands are the same register -- this looks redundant; confirm
; against current llc output before relying on it.
2278define amdgpu_kernel void @v_insertelement_v8i16_6(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in, i32 %val) {
2279; GFX9-LABEL: v_insertelement_v8i16_6:
2280; GFX9:       ; %bb.0:
2281; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2282; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
2283; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2284; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
2285; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2286; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
2287; GFX9-NEXT:    s_waitcnt vmcnt(0)
2288; GFX9-NEXT:    v_bfi_b32 v3, v5, s6, v3
2289; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2290; GFX9-NEXT:    s_endpgm
2291;
2292; VI-LABEL: v_insertelement_v8i16_6:
2293; VI:       ; %bb.0:
2294; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2295; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
2296; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2297; VI-NEXT:    s_waitcnt lgkmcnt(0)
2298; VI-NEXT:    v_mov_b32_e32 v1, s3
2299; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
2300; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2301; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2302; VI-NEXT:    s_mov_b32 s2, 0xffff
2303; VI-NEXT:    v_mov_b32_e32 v5, s1
2304; VI-NEXT:    v_mov_b32_e32 v6, s4
2305; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
2306; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2307; VI-NEXT:    s_waitcnt vmcnt(0)
2308; VI-NEXT:    v_bfi_b32 v3, s2, v6, v3
2309; VI-NEXT:    v_bfi_b32 v1, s2, v1, v1
2310; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2311; VI-NEXT:    s_endpgm
2312;
2313; CI-LABEL: v_insertelement_v8i16_6:
2314; CI:       ; %bb.0:
2315; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2316; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
2317; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2318; CI-NEXT:    s_waitcnt lgkmcnt(0)
2319; CI-NEXT:    v_mov_b32_e32 v1, s3
2320; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
2321; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2322; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2323; CI-NEXT:    v_mov_b32_e32 v5, s1
2324; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
2325; CI-NEXT:    s_mov_b32 s0, 0xffff
2326; CI-NEXT:    v_mov_b32_e32 v6, s4
2327; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2328; CI-NEXT:    s_waitcnt vmcnt(0)
2329; CI-NEXT:    v_bfi_b32 v3, s0, v6, v3
2330; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2331; CI-NEXT:    s_endpgm
2332;
2333; GFX11-LABEL: v_insertelement_v8i16_6:
2334; GFX11:       ; %bb.0:
2335; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
2336; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2337; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x10
2338; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2339; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[6:7]
2340; GFX11-NEXT:    s_waitcnt vmcnt(0)
2341; GFX11-NEXT:    v_bfi_b32 v3, 0xffff, s0, v3
2342; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
2343; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2344; GFX11-NEXT:    s_endpgm
  ; Per-workitem element addresses: the same index is used for %in and %out.
2345  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2346  %tid.ext = sext i32 %tid to i64
2347  %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext
2348  %out.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %out, i64 %tid.ext
2349  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep
  ; Keep only the low 16 bits of %val.
2350  %val.trunc = trunc i32 %val to i16
  ; Same-type bitcast (i16 to i16); presumably kept so the body has the same
  ; shape as the half variants of this test -- TODO confirm.
2351  %val.cvt = bitcast i16 %val.trunc to i16
  ; The operation under test: insert at the constant index 6.
2352  %vecins = insertelement <8 x i16> %vec, i16 %val.cvt, i32 6
2353  store <8 x i16> %vecins, <8 x i16> addrspace(1)* %out.gep
2354  ret void
2355}
2356
; Variable-index insert into an <8 x half> vector.
; Each workitem loads the <8 x half> at %in[tid], replaces element %n with the
; low 16 bits of %val reinterpreted as half, and stores the result to
; %out[tid]. %n is a kernel argument, so the index is not a compile-time
; constant.
; The expected code compares the index against each of 0..7 with
; s_cmp_eq_u32 and uses v_cndmask_b32 to choose, per 16-bit lane, between the
; loaded value and %val, then repacks the lanes into the four result dwords.
; The hawaii checks additionally convert every lane with v_cvt_f32_f16 before
; the selects and back with v_cvt_f16_f32 before repacking.
2357define amdgpu_kernel void @v_insertelement_v8f16_dynamic(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val, i32 %n) {
2358; GFX9-LABEL: v_insertelement_v8f16_dynamic:
2359; GFX9:       ; %bb.0:
2360; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2361; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
2362; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2363; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2364; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
2365; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
2366; GFX9-NEXT:    v_mov_b32_e32 v5, s6
2367; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2368; GFX9-NEXT:    s_cmp_eq_u32 s7, 6
2369; GFX9-NEXT:    s_waitcnt vmcnt(0)
2370; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
2371; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
2372; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2373; GFX9-NEXT:    s_cmp_eq_u32 s7, 5
2374; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
2375; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
2376; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2377; GFX9-NEXT:    s_cmp_eq_u32 s7, 4
2378; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v5, vcc
2379; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2380; GFX9-NEXT:    s_cmp_eq_u32 s7, 3
2381; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
2382; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v3
2383; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2384; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2385; GFX9-NEXT:    s_cmp_eq_u32 s7, 2
2386; GFX9-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
2387; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v5, vcc
2388; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2389; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
2390; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
2391; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2392; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
2393; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2394; GFX9-NEXT:    s_cmp_eq_u32 s7, 0
2395; GFX9-NEXT:    v_lshl_or_b32 v2, v7, 16, v2
2396; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v5, vcc
2397; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2398; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
2399; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2400; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2401; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
2402; GFX9-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
2403; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2404; GFX9-NEXT:    s_endpgm
2405;
2406; VI-LABEL: v_insertelement_v8f16_dynamic:
2407; VI:       ; %bb.0:
2408; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2409; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
2410; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2411; VI-NEXT:    s_waitcnt lgkmcnt(0)
2412; VI-NEXT:    v_mov_b32_e32 v1, s3
2413; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
2414; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2415; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2416; VI-NEXT:    v_mov_b32_e32 v5, s1
2417; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
2418; VI-NEXT:    s_cmp_eq_u32 s5, 6
2419; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2420; VI-NEXT:    v_mov_b32_e32 v6, s4
2421; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2422; VI-NEXT:    s_cmp_eq_u32 s5, 7
2423; VI-NEXT:    s_waitcnt vmcnt(0)
2424; VI-NEXT:    v_cndmask_b32_e32 v7, v3, v6, vcc
2425; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2426; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2427; VI-NEXT:    s_cmp_eq_u32 s5, 4
2428; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2429; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2430; VI-NEXT:    s_cmp_eq_u32 s5, 5
2431; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
2432; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2433; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2434; VI-NEXT:    s_cmp_eq_u32 s5, 2
2435; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2436; VI-NEXT:    v_cndmask_b32_e32 v8, v8, v6, vcc
2437; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2438; VI-NEXT:    s_cmp_eq_u32 s5, 3
2439; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
2440; VI-NEXT:    v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2441; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
2442; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2443; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2444; VI-NEXT:    s_cmp_eq_u32 s5, 0
2445; VI-NEXT:    v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2446; VI-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
2447; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2448; VI-NEXT:    s_cmp_eq_u32 s5, 1
2449; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
2450; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
2451; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2452; VI-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
2453; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
2454; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
2455; VI-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2456; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2457; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2458; VI-NEXT:    s_endpgm
2459;
2460; CI-LABEL: v_insertelement_v8f16_dynamic:
2461; CI:       ; %bb.0:
2462; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2463; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
2464; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2465; CI-NEXT:    s_waitcnt lgkmcnt(0)
2466; CI-NEXT:    v_mov_b32_e32 v1, s3
2467; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
2468; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2469; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2470; CI-NEXT:    v_mov_b32_e32 v5, s1
2471; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
2472; CI-NEXT:    v_cvt_f32_f16_e32 v6, s4
2473; CI-NEXT:    s_cmp_eq_u32 s5, 7
2474; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2475; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2476; CI-NEXT:    s_cmp_eq_u32 s5, 6
2477; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2478; CI-NEXT:    s_cmp_eq_u32 s5, 5
2479; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
2480; CI-NEXT:    s_cmp_eq_u32 s5, 4
2481; CI-NEXT:    s_waitcnt vmcnt(0)
2482; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
2483; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
2484; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
2485; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
2486; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
2487; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
2488; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2489; CI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
2490; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
2491; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
2492; CI-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
2493; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2494; CI-NEXT:    s_cmp_eq_u32 s5, 3
2495; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
2496; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2497; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
2498; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2499; CI-NEXT:    s_cmp_eq_u32 s5, 2
2500; CI-NEXT:    v_cndmask_b32_e32 v9, v9, v6, vcc
2501; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2502; CI-NEXT:    s_cmp_eq_u32 s5, 1
2503; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2504; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2505; CI-NEXT:    s_cmp_eq_u32 s5, 0
2506; CI-NEXT:    v_cndmask_b32_e64 v8, v8, v6, s[2:3]
2507; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2508; CI-NEXT:    v_cndmask_b32_e32 v10, v10, v6, vcc
2509; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2510; CI-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
2511; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2512; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
2513; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
2514; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
2515; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
2516; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2517; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2518; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2519; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
2520; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
2521; CI-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
2522; CI-NEXT:    v_or_b32_e32 v3, v3, v6
2523; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
2524; CI-NEXT:    v_or_b32_e32 v2, v2, v7
2525; CI-NEXT:    v_or_b32_e32 v1, v1, v8
2526; CI-NEXT:    v_or_b32_e32 v0, v0, v6
2527; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2528; CI-NEXT:    s_endpgm
2529;
2530; GFX11-LABEL: v_insertelement_v8f16_dynamic:
2531; GFX11:       ; %bb.0:
2532; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
2533; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2534; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x10
2535; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2536; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[6:7]
2537; GFX11-NEXT:    s_cmp_eq_u32 s1, 7
2538; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
2539; GFX11-NEXT:    s_cmp_eq_u32 s1, 6
2540; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
2541; GFX11-NEXT:    s_cmp_eq_u32 s1, 5
2542; GFX11-NEXT:    s_waitcnt vmcnt(0)
2543; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2544; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
2545; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s3
2546; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
2547; GFX11-NEXT:    s_cmp_eq_u32 s1, 4
2548; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, s0, s2
2549; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
2550; GFX11-NEXT:    s_cmp_eq_u32 s1, 3
2551; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, s0, s3
2552; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
2553; GFX11-NEXT:    s_cmp_eq_u32 s1, 2
2554; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s0, s2
2555; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
2556; GFX11-NEXT:    s_cmp_eq_u32 s1, 1
2557; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
2558; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
2559; GFX11-NEXT:    s_cmp_eq_u32 s1, 0
2560; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
2561; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
2562; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s0, s2
2563; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s1
2564; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
2565; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2566; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, s0, s3
2567; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2568; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, s0, s6
2569; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2570; GFX11-NEXT:    v_lshl_or_b32 v3, v5, 16, v3
2571; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v2
2572; GFX11-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
2573; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
2574; GFX11-NEXT:    v_lshl_or_b32 v0, v8, 16, v0
2575; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
2576; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2577; GFX11-NEXT:    s_endpgm
  ; Per-workitem element addresses: the same index is used for %in and %out.
2578  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2579  %tid.ext = sext i32 %tid to i64
2580  %in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext
2581  %out.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %out, i64 %tid.ext
2582  %vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep
  ; Keep only the low 16 bits of %val and reinterpret them as a half.
2583  %val.trunc = trunc i32 %val to i16
2584  %val.cvt = bitcast i16 %val.trunc to half
  ; The operation under test: the insert position comes from the %n argument.
2585  %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 %n
2586  store <8 x half> %vecins, <8 x half> addrspace(1)* %out.gep
2587  ret void
2588}
2589
; Constant-index insert into a <16 x half> vector.
; Each workitem loads the <16 x half> at %in[tid], replaces element 3 with the
; low 16 bits of %val reinterpreted as half, and stores the result to
; %out[tid].
; The expected code moves the vector as two 4-dword halves (the second one at
; an offset of 16 from the first). Only v1 from the first load is modified;
; the second half (v[4:7]) is stored back as loaded. The checks wait with
; vmcnt(1) before touching v1 and with vmcnt(0) before the stores.
2590define amdgpu_kernel void @v_insertelement_v16f16_3(<16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %in, i32 %val) {
2591; GFX9-LABEL: v_insertelement_v16f16_3:
2592; GFX9:       ; %bb.0:
2593; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2594; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
2595; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2596; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2597; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
2598; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
2599; GFX9-NEXT:    s_waitcnt vmcnt(1)
2600; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2601; GFX9-NEXT:    v_lshl_or_b32 v1, s6, 16, v1
2602; GFX9-NEXT:    s_waitcnt vmcnt(0)
2603; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
2604; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
2605; GFX9-NEXT:    s_endpgm
2606;
2607; VI-LABEL: v_insertelement_v16f16_3:
2608; VI:       ; %bb.0:
2609; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2610; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
2611; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2612; VI-NEXT:    s_waitcnt lgkmcnt(0)
2613; VI-NEXT:    v_mov_b32_e32 v1, s3
2614; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
2615; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2616; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
2617; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2618; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2619; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2620; VI-NEXT:    v_mov_b32_e32 v9, s1
2621; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
2622; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
2623; VI-NEXT:    s_lshl_b32 s1, s4, 16
2624; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
2625; VI-NEXT:    v_mov_b32_e32 v12, s1
2626; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
2627; VI-NEXT:    s_waitcnt vmcnt(1)
2628; VI-NEXT:    v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2629; VI-NEXT:    s_waitcnt vmcnt(0)
2630; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
2631; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2632; VI-NEXT:    s_endpgm
2633;
2634; CI-LABEL: v_insertelement_v16f16_3:
2635; CI:       ; %bb.0:
2636; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2637; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
2638; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2639; CI-NEXT:    s_waitcnt lgkmcnt(0)
2640; CI-NEXT:    v_mov_b32_e32 v0, s3
2641; CI-NEXT:    v_add_i32_e32 v4, vcc, s2, v8
2642; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
2643; CI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
2644; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
2645; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2646; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2647; CI-NEXT:    v_mov_b32_e32 v9, s1
2648; CI-NEXT:    v_add_i32_e32 v8, vcc, s0, v8
2649; CI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
2650; CI-NEXT:    v_add_i32_e32 v10, vcc, 16, v8
2651; CI-NEXT:    s_lshl_b32 s1, s4, 16
2652; CI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
2653; CI-NEXT:    s_waitcnt vmcnt(1)
2654; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2655; CI-NEXT:    v_or_b32_e32 v1, s1, v1
2656; CI-NEXT:    s_waitcnt vmcnt(0)
2657; CI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
2658; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2659; CI-NEXT:    s_endpgm
2660;
2661; GFX11-LABEL: v_insertelement_v16f16_3:
2662; GFX11:       ; %bb.0:
2663; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
2664; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2665; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x10
2666; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2667; GFX11-NEXT:    s_clause 0x1
2668; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[6:7]
2669; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[6:7] offset:16
2670; GFX11-NEXT:    s_waitcnt vmcnt(1)
2671; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2672; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2673; GFX11-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
2674; GFX11-NEXT:    s_waitcnt vmcnt(0)
2675; GFX11-NEXT:    s_clause 0x1
2676; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[4:5] offset:16
2677; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
2678; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2679; GFX11-NEXT:    s_endpgm
  ; Per-workitem element addresses: the same index is used for %in and %out.
2680  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2681  %tid.ext = sext i32 %tid to i64
2682  %in.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %in, i64 %tid.ext
2683  %out.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %out, i64 %tid.ext
2684  %vec = load <16 x half>, <16 x half> addrspace(1)* %in.gep
  ; Keep only the low 16 bits of %val and reinterpret them as a half.
2685  %val.trunc = trunc i32 %val to i16
2686  %val.cvt = bitcast i16 %val.trunc to half
  ; The operation under test: insert at the constant index 3.
2687  %vecins = insertelement <16 x half> %vec, half %val.cvt, i32 3
2688  store <16 x half> %vecins, <16 x half> addrspace(1)* %out.gep
2689  ret void
2690}
2691
; Constant-index insert into a <16 x i16> vector.
; Each workitem loads the <16 x i16> at %in[tid], replaces element 6 with the
; low 16 bits of %val, and stores the result to %out[tid].
; The expected code moves the vector as two 4-dword halves (the second one at
; an offset of 16 from the first). Only v3 from the first load is modified,
; via v_bfi_b32 with a 0xffff mask; the second half (v[4:7]) is stored back
; as loaded.
2692define amdgpu_kernel void @v_insertelement_v16i16_6(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in, i32 %val) {
2693; GFX9-LABEL: v_insertelement_v16i16_6:
2694; GFX9:       ; %bb.0:
2695; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2696; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
2697; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2698; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffff
2699; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2700; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
2701; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
2702; GFX9-NEXT:    s_waitcnt vmcnt(1)
2703; GFX9-NEXT:    v_bfi_b32 v3, v9, s6, v3
2704; GFX9-NEXT:    s_waitcnt vmcnt(0)
2705; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
2706; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
2707; GFX9-NEXT:    s_endpgm
2708;
2709; VI-LABEL: v_insertelement_v16i16_6:
2710; VI:       ; %bb.0:
2711; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2712; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
2713; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2714; VI-NEXT:    s_waitcnt lgkmcnt(0)
2715; VI-NEXT:    v_mov_b32_e32 v1, s3
2716; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
2717; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2718; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
2719; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2720; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2721; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2722; VI-NEXT:    v_mov_b32_e32 v9, s1
2723; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
2724; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
2725; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
2726; VI-NEXT:    s_mov_b32 s2, 0xffff
2727; VI-NEXT:    v_mov_b32_e32 v12, s4
2728; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
2729; VI-NEXT:    s_waitcnt vmcnt(1)
2730; VI-NEXT:    v_bfi_b32 v3, s2, v12, v3
2731; VI-NEXT:    s_waitcnt vmcnt(0)
2732; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
2733; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2734; VI-NEXT:    s_endpgm
2735;
2736; CI-LABEL: v_insertelement_v16i16_6:
2737; CI:       ; %bb.0:
2738; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2739; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
2740; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2741; CI-NEXT:    s_waitcnt lgkmcnt(0)
2742; CI-NEXT:    v_mov_b32_e32 v1, s3
2743; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v8
2744; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2745; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
2746; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2747; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2748; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2749; CI-NEXT:    v_mov_b32_e32 v9, s1
2750; CI-NEXT:    v_add_i32_e32 v8, vcc, s0, v8
2751; CI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
2752; CI-NEXT:    v_add_i32_e32 v10, vcc, 16, v8
2753; CI-NEXT:    s_mov_b32 s2, 0xffff
2754; CI-NEXT:    v_mov_b32_e32 v12, s4
2755; CI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
2756; CI-NEXT:    s_waitcnt vmcnt(1)
2757; CI-NEXT:    v_bfi_b32 v3, s2, v12, v3
2758; CI-NEXT:    s_waitcnt vmcnt(0)
2759; CI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
2760; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2761; CI-NEXT:    s_endpgm
2762;
2763; GFX11-LABEL: v_insertelement_v16i16_6:
2764; GFX11:       ; %bb.0:
2765; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
2766; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2767; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x10
2768; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2769; GFX11-NEXT:    s_clause 0x1
2770; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[6:7]
2771; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[6:7] offset:16
2772; GFX11-NEXT:    s_waitcnt vmcnt(1)
2773; GFX11-NEXT:    v_bfi_b32 v3, 0xffff, s0, v3
2774; GFX11-NEXT:    s_waitcnt vmcnt(0)
2775; GFX11-NEXT:    s_clause 0x1
2776; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[4:5] offset:16
2777; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
2778; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2779; GFX11-NEXT:    s_endpgm
  ; Per-workitem element addresses: the same index is used for %in and %out.
2780  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2781  %tid.ext = sext i32 %tid to i64
2782  %in.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %in, i64 %tid.ext
2783  %out.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %out, i64 %tid.ext
2784  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep
  ; Keep only the low 16 bits of %val.
2785  %val.trunc = trunc i32 %val to i16
  ; Same-type bitcast (i16 to i16); presumably kept so the body has the same
  ; shape as the half variants of this test -- TODO confirm.
2786  %val.cvt = bitcast i16 %val.trunc to i16
  ; The operation under test: insert at the constant index 6.
2787  %vecins = insertelement <16 x i16> %vec, i16 %val.cvt, i32 6
2788  store <16 x i16> %vecins, <16 x i16> addrspace(1)* %out.gep
2789  ret void
2790}
2791
2792define amdgpu_kernel void @v_insertelement_v16f16_dynamic(<16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %in, i32 %val, i32 %n) {
2793; GFX9-LABEL: v_insertelement_v16f16_dynamic:
2794; GFX9:       ; %bb.0:
2795; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2796; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
2797; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2798; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2799; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
2800; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
2801; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
2802; GFX9-NEXT:    v_mov_b32_e32 v9, s6
2803; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2804; GFX9-NEXT:    s_cmp_eq_u32 s7, 6
2805; GFX9-NEXT:    s_waitcnt vmcnt(1)
2806; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
2807; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
2808; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2809; GFX9-NEXT:    s_cmp_eq_u32 s7, 5
2810; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
2811; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
2812; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2813; GFX9-NEXT:    s_cmp_eq_u32 s7, 4
2814; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v9, vcc
2815; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2816; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v9, vcc
2817; GFX9-NEXT:    s_cmp_eq_u32 s7, 3
2818; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
2819; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2820; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2821; GFX9-NEXT:    s_cmp_eq_u32 s7, 2
2822; GFX9-NEXT:    v_lshl_or_b32 v2, v11, 16, v2
2823; GFX9-NEXT:    v_cndmask_b32_e32 v11, v12, v9, vcc
2824; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2825; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
2826; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
2827; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
2828; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2829; GFX9-NEXT:    s_cmp_eq_u32 s7, 0
2830; GFX9-NEXT:    v_cndmask_b32_e32 v12, v13, v9, vcc
2831; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2832; GFX9-NEXT:    s_cmp_eq_u32 s7, 15
2833; GFX9-NEXT:    s_waitcnt vmcnt(0)
2834; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
2835; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
2836; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2837; GFX9-NEXT:    s_cmp_eq_u32 s7, 14
2838; GFX9-NEXT:    v_cndmask_b32_e32 v13, v14, v9, vcc
2839; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2840; GFX9-NEXT:    s_cmp_eq_u32 s7, 13
2841; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v6
2842; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2843; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
2844; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2845; GFX9-NEXT:    s_cmp_eq_u32 s7, 12
2846; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v3
2847; GFX9-NEXT:    v_lshl_or_b32 v0, v12, 16, v0
2848; GFX9-NEXT:    v_cndmask_b32_e32 v12, v15, v9, vcc
2849; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2850; GFX9-NEXT:    s_cmp_eq_u32 s7, 11
2851; GFX9-NEXT:    v_lshl_or_b32 v3, v10, 16, v3
2852; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
2853; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
2854; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2855; GFX9-NEXT:    s_cmp_eq_u32 s7, 10
2856; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2857; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
2858; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2859; GFX9-NEXT:    s_cmp_eq_u32 s7, 9
2860; GFX9-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
2861; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
2862; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
2863; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2864; GFX9-NEXT:    s_cmp_eq_u32 s7, 8
2865; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v9, vcc
2866; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2867; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
2868; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff, v7
2869; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff, v6
2870; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff, v5
2871; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff, v4
2872; GFX9-NEXT:    v_lshl_or_b32 v7, v13, 16, v7
2873; GFX9-NEXT:    v_lshl_or_b32 v6, v12, 16, v6
2874; GFX9-NEXT:    v_lshl_or_b32 v5, v10, 16, v5
2875; GFX9-NEXT:    v_lshl_or_b32 v4, v11, 16, v4
2876; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
2877; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
2878; GFX9-NEXT:    s_endpgm
2879;
2880; VI-LABEL: v_insertelement_v16f16_dynamic:
2881; VI:       ; %bb.0:
2882; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2883; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
2884; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2885; VI-NEXT:    s_waitcnt lgkmcnt(0)
2886; VI-NEXT:    v_mov_b32_e32 v0, s3
2887; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v8
2888; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
2889; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
2890; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
2891; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2892; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2893; VI-NEXT:    v_mov_b32_e32 v9, s1
2894; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
2895; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
2896; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
2897; VI-NEXT:    s_cmp_eq_u32 s7, 14
2898; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
2899; VI-NEXT:    v_mov_b32_e32 v12, s6
2900; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2901; VI-NEXT:    s_cmp_eq_u32 s7, 15
2902; VI-NEXT:    s_waitcnt vmcnt(1)
2903; VI-NEXT:    v_cndmask_b32_e32 v13, v3, v12, vcc
2904; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2905; VI-NEXT:    s_cmp_eq_u32 s7, 12
2906; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2907; VI-NEXT:    s_cmp_eq_u32 s7, 13
2908; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
2909; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v12, s[0:1]
2910; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2911; VI-NEXT:    s_cmp_eq_u32 s7, 10
2912; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
2913; VI-NEXT:    s_cmp_eq_u32 s7, 11
2914; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
2915; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s[2:3]
2916; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
2917; VI-NEXT:    s_cmp_eq_u32 s7, 8
2918; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2919; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
2920; VI-NEXT:    v_cndmask_b32_e64 v15, v15, v12, s[2:3]
2921; VI-NEXT:    s_cmp_eq_u32 s7, 9
2922; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
2923; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc
2924; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
2925; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2926; VI-NEXT:    s_cmp_eq_u32 s7, 6
2927; VI-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2928; VI-NEXT:    v_cndmask_b32_e32 v15, v16, v12, vcc
2929; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2930; VI-NEXT:    s_cmp_eq_u32 s7, 7
2931; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[4:5]
2932; VI-NEXT:    s_waitcnt vmcnt(0)
2933; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
2934; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2935; VI-NEXT:    v_cndmask_b32_e64 v14, v14, v12, s[0:1]
2936; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
2937; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc
2938; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2939; VI-NEXT:    s_cmp_eq_u32 s7, 4
2940; VI-NEXT:    v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2941; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
2942; VI-NEXT:    v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2943; VI-NEXT:    v_cndmask_b32_e32 v15, v17, v12, vcc
2944; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2945; VI-NEXT:    s_cmp_eq_u32 s7, 5
2946; VI-NEXT:    v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2947; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
2948; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v12, vcc
2949; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2950; VI-NEXT:    s_cmp_eq_u32 s7, 2
2951; VI-NEXT:    v_cndmask_b32_e32 v13, v13, v12, vcc
2952; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2953; VI-NEXT:    s_cmp_eq_u32 s7, 3
2954; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
2955; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
2956; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2957; VI-NEXT:    s_cmp_eq_u32 s7, 0
2958; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
2959; VI-NEXT:    v_cndmask_b32_e32 v14, v14, v12, vcc
2960; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2961; VI-NEXT:    s_cmp_eq_u32 s7, 1
2962; VI-NEXT:    v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2963; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v4
2964; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
2965; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2966; VI-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
2967; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
2968; VI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
2969; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
2970; VI-NEXT:    v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2971; VI-NEXT:    v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2972; VI-NEXT:    v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2973; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
2974; VI-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
2975; VI-NEXT:    s_endpgm
2976;
2977; CI-LABEL: v_insertelement_v16f16_dynamic:
2978; CI:       ; %bb.0:
2979; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2980; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
2981; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2982; CI-NEXT:    s_waitcnt lgkmcnt(0)
2983; CI-NEXT:    v_mov_b32_e32 v0, s3
2984; CI-NEXT:    v_add_i32_e32 v4, vcc, s2, v8
2985; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
2986; CI-NEXT:    v_add_i32_e32 v0, vcc, 16, v4
2987; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
2988; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2989; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2990; CI-NEXT:    v_mov_b32_e32 v9, s1
2991; CI-NEXT:    v_add_i32_e32 v8, vcc, s0, v8
2992; CI-NEXT:    v_cvt_f32_f16_e32 v10, s4
2993; CI-NEXT:    s_cmp_eq_u32 s5, 15
2994; CI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
2995; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2996; CI-NEXT:    s_cmp_eq_u32 s5, 14
2997; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2998; CI-NEXT:    s_cmp_eq_u32 s5, 13
2999; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
3000; CI-NEXT:    s_cmp_eq_u32 s5, 12
3001; CI-NEXT:    s_waitcnt vmcnt(1)
3002; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
3003; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
3004; CI-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
3005; CI-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
3006; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
3007; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
3008; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
3009; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
3010; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
3011; CI-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[0:1]
3012; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
3013; CI-NEXT:    s_cmp_eq_u32 s5, 11
3014; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
3015; CI-NEXT:    v_cndmask_b32_e64 v12, v12, v10, s[2:3]
3016; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3017; CI-NEXT:    s_cmp_eq_u32 s5, 10
3018; CI-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
3019; CI-NEXT:    v_cndmask_b32_e32 v13, v13, v10, vcc
3020; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
3021; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3022; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
3023; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
3024; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
3025; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
3026; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
3027; CI-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
3028; CI-NEXT:    v_or_b32_e32 v2, v2, v12
3029; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
3030; CI-NEXT:    v_or_b32_e32 v1, v1, v12
3031; CI-NEXT:    v_cvt_f32_f16_e32 v12, v14
3032; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
3033; CI-NEXT:    s_waitcnt vmcnt(0)
3034; CI-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
3035; CI-NEXT:    v_cvt_f32_f16_e32 v13, v15
3036; CI-NEXT:    s_cmp_eq_u32 s5, 9
3037; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
3038; CI-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
3039; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3040; CI-NEXT:    s_cmp_eq_u32 s5, 8
3041; CI-NEXT:    v_cvt_f32_f16_e32 v14, v16
3042; CI-NEXT:    v_cndmask_b32_e32 v12, v12, v10, vcc
3043; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3044; CI-NEXT:    s_cmp_eq_u32 s5, 7
3045; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
3046; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
3047; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3048; CI-NEXT:    s_cmp_eq_u32 s5, 6
3049; CI-NEXT:    v_cndmask_b32_e32 v13, v13, v10, vcc
3050; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3051; CI-NEXT:    s_cmp_eq_u32 s5, 5
3052; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
3053; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
3054; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
3055; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3056; CI-NEXT:    s_cmp_eq_u32 s5, 4
3057; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
3058; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
3059; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
3060; CI-NEXT:    v_cndmask_b32_e32 v14, v14, v10, vcc
3061; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3062; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
3063; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
3064; CI-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
3065; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
3066; CI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
3067; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
3068; CI-NEXT:    v_or_b32_e32 v3, v3, v11
3069; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
3070; CI-NEXT:    v_or_b32_e32 v0, v0, v12
3071; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
3072; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
3073; CI-NEXT:    v_or_b32_e32 v7, v7, v12
3074; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
3075; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
3076; CI-NEXT:    v_or_b32_e32 v6, v6, v12
3077; CI-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
3078; CI-NEXT:    s_cmp_eq_u32 s5, 3
3079; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
3080; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3081; CI-NEXT:    s_cmp_eq_u32 s5, 2
3082; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
3083; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
3084; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3085; CI-NEXT:    s_cmp_eq_u32 s5, 1
3086; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
3087; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3088; CI-NEXT:    s_cmp_eq_u32 s5, 0
3089; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
3090; CI-NEXT:    v_cndmask_b32_e32 v12, v12, v10, vcc
3091; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3092; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
3093; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
3094; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
3095; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
3096; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
3097; CI-NEXT:    v_or_b32_e32 v5, v5, v10
3098; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
3099; CI-NEXT:    v_or_b32_e32 v4, v4, v10
3100; CI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
3101; CI-NEXT:    s_nop 0
3102; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v8
3103; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v9, vcc
3104; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3105; CI-NEXT:    s_endpgm
3106;
3107; GFX11-LABEL: v_insertelement_v16f16_dynamic:
3108; GFX11:       ; %bb.0:
3109; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
3110; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
3111; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x10
3112; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3113; GFX11-NEXT:    s_clause 0x1
3114; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[6:7]
3115; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[6:7] offset:16
3116; GFX11-NEXT:    s_cmp_eq_u32 s1, 7
3117; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3118; GFX11-NEXT:    s_cmp_eq_u32 s1, 6
3119; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
3120; GFX11-NEXT:    s_cmp_eq_u32 s1, 5
3121; GFX11-NEXT:    s_waitcnt vmcnt(1)
3122; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
3123; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s3
3124; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
3125; GFX11-NEXT:    s_cmp_eq_u32 s1, 4
3126; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
3127; GFX11-NEXT:    v_cndmask_b32_e64 v9, v9, s0, s2
3128; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3129; GFX11-NEXT:    s_cmp_eq_u32 s1, 3
3130; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
3131; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s0, s2
3132; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3133; GFX11-NEXT:    s_cmp_eq_u32 s1, 2
3134; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
3135; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
3136; GFX11-NEXT:    v_cndmask_b32_e64 v11, v11, s0, s2
3137; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3138; GFX11-NEXT:    s_cmp_eq_u32 s1, 1
3139; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s0, s2
3140; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3141; GFX11-NEXT:    s_cmp_eq_u32 s1, 0
3142; GFX11-NEXT:    s_waitcnt vmcnt(0)
3143; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v7
3144; GFX11-NEXT:    v_cndmask_b32_e64 v10, v10, s0, s3
3145; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3146; GFX11-NEXT:    v_lshl_or_b32 v3, v9, 16, v3
3147; GFX11-NEXT:    v_cndmask_b32_e64 v9, v12, s0, s2
3148; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3149; GFX11-NEXT:    s_cmp_eq_u32 s1, 15
3150; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s2
3151; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3152; GFX11-NEXT:    s_cmp_eq_u32 s1, 14
3153; GFX11-NEXT:    v_lshl_or_b32 v2, v10, 16, v2
3154; GFX11-NEXT:    v_cndmask_b32_e64 v10, v13, s0, s2
3155; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3156; GFX11-NEXT:    s_cmp_eq_u32 s1, 13
3157; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
3158; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, s0, s2
3159; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3160; GFX11-NEXT:    s_cmp_eq_u32 s1, 12
3161; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
3162; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
3163; GFX11-NEXT:    s_cmp_eq_u32 s1, 11
3164; GFX11-NEXT:    v_cndmask_b32_e64 v12, v14, s0, s2
3165; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3166; GFX11-NEXT:    s_cmp_eq_u32 s1, 10
3167; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, s0, s3
3168; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
3169; GFX11-NEXT:    s_cmp_eq_u32 s1, 9
3170; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
3171; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
3172; GFX11-NEXT:    s_cmp_eq_u32 s1, 8
3173; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, s0, s3
3174; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
3175; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
3176; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s0, s1
3177; GFX11-NEXT:    v_cndmask_b32_e64 v13, v15, s0, s2
3178; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
3179; GFX11-NEXT:    v_cndmask_b32_e64 v14, v16, s0, s6
3180; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
3181; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
3182; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3183; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3184; GFX11-NEXT:    v_lshl_or_b32 v7, v10, 16, v7
3185; GFX11-NEXT:    v_lshl_or_b32 v6, v12, 16, v6
3186; GFX11-NEXT:    v_lshl_or_b32 v5, v13, 16, v5
3187; GFX11-NEXT:    v_lshl_or_b32 v4, v14, 16, v4
3188; GFX11-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
3189; GFX11-NEXT:    v_lshl_or_b32 v0, v9, 16, v0
3190; GFX11-NEXT:    s_clause 0x1
3191; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[4:5] offset:16
3192; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
3193; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3194; GFX11-NEXT:    s_endpgm
3195  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
3196  %tid.ext = sext i32 %tid to i64
3197  %in.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %in, i64 %tid.ext
3198  %out.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %out, i64 %tid.ext
3199  %vec = load <16 x half>, <16 x half> addrspace(1)* %in.gep
3200  %val.trunc = trunc i32 %val to i16
3201  %val.cvt = bitcast i16 %val.trunc to half
3202  %vecins = insertelement <16 x half> %vec, half %val.cvt, i32 %n
3203  store <16 x half> %vecins, <16 x half> addrspace(1)* %out.gep
3204  ret void
3205}
3206
3207
; AMDGPU intrinsic declaration. The kernel above calls it to obtain %tid,
; sign-extends the result to i64 and uses it as the getelementptr index into
; both %in and %out.
3208declare i32 @llvm.amdgcn.workitem.id.x() #1
3209
; Attribute group #0 (nounwind only); referenced by the kernel definitions
; in this file, such as s_insertelement_v2i16_0 at the top.
3210attributes #0 = { nounwind }
; Attribute group #1 (nounwind + readnone); attached to the intrinsic
; declaration above and to its call site in the kernel body.
3211attributes #1 = { nounwind readnone }
3212