1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,CIVI,VI %s
4; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
5
; Loads a <2 x i16> through the addrspace(4) pointer %vec.ptr, replaces
; element 0 with the constant 999 (0x3e7) and stores the result to %out.
; Per the checks below, the gfx900 run combines 0x3e7 with the loaded dword
; using s_pack_lh_b32_b16, while the fiji and hawaii runs mask the dword with
; 0xffff0000 and OR in 0x3e7.
6define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
7; GFX9-LABEL: s_insertelement_v2i16_0:
8; GFX9:       ; %bb.0:
9; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
10; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
11; GFX9-NEXT:    v_mov_b32_e32 v0, s0
12; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
13; GFX9-NEXT:    v_mov_b32_e32 v1, s1
14; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX9-NEXT:    s_pack_lh_b32_b16 s0, 0x3e7, s0
16; GFX9-NEXT:    v_mov_b32_e32 v2, s0
17; GFX9-NEXT:    global_store_dword v[0:1], v2, off
18; GFX9-NEXT:    s_endpgm
19;
20; CIVI-LABEL: s_insertelement_v2i16_0:
21; CIVI:       ; %bb.0:
22; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
23; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
24; CIVI-NEXT:    v_mov_b32_e32 v0, s0
25; CIVI-NEXT:    s_load_dword s0, s[2:3], 0x0
26; CIVI-NEXT:    v_mov_b32_e32 v1, s1
27; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
28; CIVI-NEXT:    s_and_b32 s0, s0, 0xffff0000
29; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e7
30; CIVI-NEXT:    v_mov_b32_e32 v2, s0
31; CIVI-NEXT:    flat_store_dword v[0:1], v2
32; CIVI-NEXT:    s_endpgm
33  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
34  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
35  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
36  ret void
37}
38
39
; Same shape as the constant-insert test, but element 0 is replaced with the
; i16 kernel argument %elt. An unnamed [8 x i32] argument sits between the
; pointer arguments and %elt.
; Per the checks below, the gfx900 run uses s_pack_lh_b32_b16 on two SGPRs,
; while the fiji and hawaii runs mask one value with 0xffff, the loaded dword
; with 0xffff0000, and OR the two together.
40define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
41; GFX9-LABEL: s_insertelement_v2i16_0_reg:
42; GFX9:       ; %bb.0:
43; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
44; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
45; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX9-NEXT:    v_mov_b32_e32 v0, s0
47; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
48; GFX9-NEXT:    v_mov_b32_e32 v1, s1
49; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
50; GFX9-NEXT:    s_pack_lh_b32_b16 s0, s4, s0
51; GFX9-NEXT:    v_mov_b32_e32 v2, s0
52; GFX9-NEXT:    global_store_dword v[0:1], v2, off
53; GFX9-NEXT:    s_endpgm
54;
55; VI-LABEL: s_insertelement_v2i16_0_reg:
56; VI:       ; %bb.0:
57; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
58; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
59; VI-NEXT:    s_waitcnt lgkmcnt(0)
60; VI-NEXT:    v_mov_b32_e32 v0, s0
61; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
62; VI-NEXT:    v_mov_b32_e32 v1, s1
63; VI-NEXT:    s_and_b32 s1, s4, 0xffff
64; VI-NEXT:    s_waitcnt lgkmcnt(0)
65; VI-NEXT:    s_and_b32 s0, s0, 0xffff0000
66; VI-NEXT:    s_or_b32 s0, s1, s0
67; VI-NEXT:    v_mov_b32_e32 v2, s0
68; VI-NEXT:    flat_store_dword v[0:1], v2
69; VI-NEXT:    s_endpgm
70;
71; CI-LABEL: s_insertelement_v2i16_0_reg:
72; CI:       ; %bb.0:
73; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
74; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
75; CI-NEXT:    s_waitcnt lgkmcnt(0)
76; CI-NEXT:    v_mov_b32_e32 v0, s0
77; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
78; CI-NEXT:    v_mov_b32_e32 v1, s1
79; CI-NEXT:    s_and_b32 s1, s4, 0xffff
80; CI-NEXT:    s_waitcnt lgkmcnt(0)
81; CI-NEXT:    s_and_b32 s0, s0, 0xffff0000
82; CI-NEXT:    s_or_b32 s0, s1, s0
83; CI-NEXT:    v_mov_b32_e32 v2, s0
84; CI-NEXT:    flat_store_dword v[0:1], v2
85; CI-NEXT:    s_endpgm
86  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
87  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
88  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
89  ret void
90}
91
; Inserts the i16 argument %elt at index 0 of the loaded vector while the
; vector's element 1 has a second use: it is extracted, zero-extended to i32
; and passed to the "; use $0" inline asm through an "s" constraint.
; Per the checks below, the gfx900 and hawaii runs shift the loaded dword
; right by 16 and use that result both for the asm operand and to rebuild the
; stored dword; the fiji run keeps the shifted value in a separate SGPR for
; the asm and builds the stored dword with and/or.
92define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
93; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
94; GFX9:       ; %bb.0:
95; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
96; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
97; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
98; GFX9-NEXT:    v_mov_b32_e32 v0, s0
99; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
100; GFX9-NEXT:    v_mov_b32_e32 v1, s1
101; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
102; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
103; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s4, s0
104; GFX9-NEXT:    v_mov_b32_e32 v2, s1
105; GFX9-NEXT:    global_store_dword v[0:1], v2, off
106; GFX9-NEXT:    ;;#ASMSTART
107; GFX9-NEXT:    ; use s0
108; GFX9-NEXT:    ;;#ASMEND
109; GFX9-NEXT:    s_endpgm
110;
111; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
112; VI:       ; %bb.0:
113; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
114; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
115; VI-NEXT:    s_waitcnt lgkmcnt(0)
116; VI-NEXT:    v_mov_b32_e32 v0, s0
117; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
118; VI-NEXT:    v_mov_b32_e32 v1, s1
119; VI-NEXT:    s_and_b32 s1, s4, 0xffff
120; VI-NEXT:    s_waitcnt lgkmcnt(0)
121; VI-NEXT:    s_lshr_b32 s2, s0, 16
122; VI-NEXT:    s_and_b32 s0, s0, 0xffff0000
123; VI-NEXT:    s_or_b32 s0, s1, s0
124; VI-NEXT:    v_mov_b32_e32 v2, s0
125; VI-NEXT:    flat_store_dword v[0:1], v2
126; VI-NEXT:    ;;#ASMSTART
127; VI-NEXT:    ; use s2
128; VI-NEXT:    ;;#ASMEND
129; VI-NEXT:    s_endpgm
130;
131; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
132; CI:       ; %bb.0:
133; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
134; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
135; CI-NEXT:    s_waitcnt lgkmcnt(0)
136; CI-NEXT:    v_mov_b32_e32 v0, s0
137; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
138; CI-NEXT:    v_mov_b32_e32 v1, s1
139; CI-NEXT:    s_and_b32 s1, s4, 0xffff
140; CI-NEXT:    s_waitcnt lgkmcnt(0)
141; CI-NEXT:    s_lshr_b32 s0, s0, 16
142; CI-NEXT:    s_lshl_b32 s2, s0, 16
143; CI-NEXT:    s_or_b32 s1, s1, s2
144; CI-NEXT:    v_mov_b32_e32 v2, s1
145; CI-NEXT:    flat_store_dword v[0:1], v2
146; CI-NEXT:    ;;#ASMSTART
147; CI-NEXT:    ; use s0
148; CI-NEXT:    ;;#ASMEND
149; CI-NEXT:    s_endpgm
150  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
151  %elt1 = extractelement <2 x i16> %vec, i32 1
152  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
153  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
154  %use1 = zext i16 %elt1 to i32
155  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
156  ret void
157}
158
; Inserts the high 16 bits of the i32 argument %elt.arg (lshr by 16, then
; trunc to i16) at index 0 of the loaded vector.
; Per the checks below, the gfx900 run uses a single s_pack_hh_b32_b16, while
; the fiji and hawaii runs emit s_lshr_b32 by 16, mask the loaded dword with
; 0xffff0000, and OR the two.
159define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
160; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
161; GFX9:       ; %bb.0:
162; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
163; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
164; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
165; GFX9-NEXT:    v_mov_b32_e32 v0, s0
166; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
167; GFX9-NEXT:    v_mov_b32_e32 v1, s1
168; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
169; GFX9-NEXT:    s_pack_hh_b32_b16 s0, s4, s0
170; GFX9-NEXT:    v_mov_b32_e32 v2, s0
171; GFX9-NEXT:    global_store_dword v[0:1], v2, off
172; GFX9-NEXT:    s_endpgm
173;
174; VI-LABEL: s_insertelement_v2i16_0_reghi:
175; VI:       ; %bb.0:
176; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
177; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
178; VI-NEXT:    s_waitcnt lgkmcnt(0)
179; VI-NEXT:    v_mov_b32_e32 v0, s0
180; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
181; VI-NEXT:    v_mov_b32_e32 v1, s1
182; VI-NEXT:    s_lshr_b32 s1, s4, 16
183; VI-NEXT:    s_waitcnt lgkmcnt(0)
184; VI-NEXT:    s_and_b32 s0, s0, 0xffff0000
185; VI-NEXT:    s_or_b32 s0, s1, s0
186; VI-NEXT:    v_mov_b32_e32 v2, s0
187; VI-NEXT:    flat_store_dword v[0:1], v2
188; VI-NEXT:    s_endpgm
189;
190; CI-LABEL: s_insertelement_v2i16_0_reghi:
191; CI:       ; %bb.0:
192; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
193; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
194; CI-NEXT:    s_waitcnt lgkmcnt(0)
195; CI-NEXT:    v_mov_b32_e32 v0, s0
196; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
197; CI-NEXT:    v_mov_b32_e32 v1, s1
198; CI-NEXT:    s_lshr_b32 s1, s4, 16
199; CI-NEXT:    s_waitcnt lgkmcnt(0)
200; CI-NEXT:    s_and_b32 s0, s0, 0xffff0000
201; CI-NEXT:    s_or_b32 s0, s1, s0
202; CI-NEXT:    v_mov_b32_e32 v2, s0
203; CI-NEXT:    flat_store_dword v[0:1], v2
204; CI-NEXT:    s_endpgm
205  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
206  %elt.hi = lshr i32 %elt.arg, 16
207  %elt = trunc i32 %elt.hi to i16
208  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
209  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
210  ret void
211}
212
; Inserts the high 16 bits of %elt.arg at index 0, and additionally feeds the
; same truncated value (zero-extended back to i32) to the "; use $0" inline
; asm through an "s" constraint, so the shifted value has two uses.
; Per the checks below, every run materialises the shift with s_lshr_b32 and
; the asm operand is that shift result; the gfx900 run then uses
; s_pack_lh_b32_b16 rather than the s_pack_hh form.
213define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
214; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
215; GFX9:       ; %bb.0:
216; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
217; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
218; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
219; GFX9-NEXT:    v_mov_b32_e32 v0, s0
220; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
221; GFX9-NEXT:    v_mov_b32_e32 v1, s1
222; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
223; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
224; GFX9-NEXT:    s_pack_lh_b32_b16 s0, s1, s0
225; GFX9-NEXT:    v_mov_b32_e32 v2, s0
226; GFX9-NEXT:    global_store_dword v[0:1], v2, off
227; GFX9-NEXT:    ;;#ASMSTART
228; GFX9-NEXT:    ; use s1
229; GFX9-NEXT:    ;;#ASMEND
230; GFX9-NEXT:    s_endpgm
231;
232; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
233; VI:       ; %bb.0:
234; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
235; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
236; VI-NEXT:    s_waitcnt lgkmcnt(0)
237; VI-NEXT:    v_mov_b32_e32 v0, s0
238; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
239; VI-NEXT:    v_mov_b32_e32 v1, s1
240; VI-NEXT:    s_lshr_b32 s1, s4, 16
241; VI-NEXT:    s_waitcnt lgkmcnt(0)
242; VI-NEXT:    s_and_b32 s0, s0, 0xffff0000
243; VI-NEXT:    s_or_b32 s0, s1, s0
244; VI-NEXT:    v_mov_b32_e32 v2, s0
245; VI-NEXT:    flat_store_dword v[0:1], v2
246; VI-NEXT:    ;;#ASMSTART
247; VI-NEXT:    ; use s1
248; VI-NEXT:    ;;#ASMEND
249; VI-NEXT:    s_endpgm
250;
251; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
252; CI:       ; %bb.0:
253; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
254; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
255; CI-NEXT:    s_waitcnt lgkmcnt(0)
256; CI-NEXT:    v_mov_b32_e32 v0, s0
257; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
258; CI-NEXT:    v_mov_b32_e32 v1, s1
259; CI-NEXT:    s_lshr_b32 s1, s4, 16
260; CI-NEXT:    s_waitcnt lgkmcnt(0)
261; CI-NEXT:    s_and_b32 s0, s0, 0xffff0000
262; CI-NEXT:    s_or_b32 s0, s1, s0
263; CI-NEXT:    v_mov_b32_e32 v2, s0
264; CI-NEXT:    flat_store_dword v[0:1], v2
265; CI-NEXT:    ;;#ASMSTART
266; CI-NEXT:    ; use s1
267; CI-NEXT:    ;;#ASMEND
268; CI-NEXT:    s_endpgm
269  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
270  %elt.hi = lshr i32 %elt.arg, 16
271  %elt = trunc i32 %elt.hi to i16
272  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
273  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
274  %use1 = zext i16 %elt to i32
275  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
276  ret void
277}
278
; Inserts the high 16 bits of %elt.arg at index 0 while BOTH halves have an
; extra use: the inserted value and the vector's original element 1 are each
; zero-extended to i32 and passed to separate "; use $0" inline asm calls.
; Per the checks below, each run emits two s_lshr_b32 by 16 (one on the
; argument, one on the loaded dword) whose results are the asm operands. The
; stored dword is built with s_pack_ll_b32_b16 on gfx900, and/or on fiji, and
; v_alignbit_b32 on hawaii.
279define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
280; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
281; GFX9:       ; %bb.0:
282; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
283; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
284; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
285; GFX9-NEXT:    v_mov_b32_e32 v0, s0
286; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
287; GFX9-NEXT:    v_mov_b32_e32 v1, s1
288; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
289; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
290; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
291; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s1, s0
292; GFX9-NEXT:    v_mov_b32_e32 v2, s2
293; GFX9-NEXT:    global_store_dword v[0:1], v2, off
294; GFX9-NEXT:    ;;#ASMSTART
295; GFX9-NEXT:    ; use s1
296; GFX9-NEXT:    ;;#ASMEND
297; GFX9-NEXT:    ;;#ASMSTART
298; GFX9-NEXT:    ; use s0
299; GFX9-NEXT:    ;;#ASMEND
300; GFX9-NEXT:    s_endpgm
301;
302; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
303; VI:       ; %bb.0:
304; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
305; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
306; VI-NEXT:    s_waitcnt lgkmcnt(0)
307; VI-NEXT:    v_mov_b32_e32 v0, s0
308; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
309; VI-NEXT:    v_mov_b32_e32 v1, s1
310; VI-NEXT:    s_lshr_b32 s1, s4, 16
311; VI-NEXT:    s_waitcnt lgkmcnt(0)
312; VI-NEXT:    s_lshr_b32 s2, s0, 16
313; VI-NEXT:    s_and_b32 s0, s0, 0xffff0000
314; VI-NEXT:    s_or_b32 s0, s1, s0
315; VI-NEXT:    v_mov_b32_e32 v2, s0
316; VI-NEXT:    flat_store_dword v[0:1], v2
317; VI-NEXT:    ;;#ASMSTART
318; VI-NEXT:    ; use s1
319; VI-NEXT:    ;;#ASMEND
320; VI-NEXT:    ;;#ASMSTART
321; VI-NEXT:    ; use s2
322; VI-NEXT:    ;;#ASMEND
323; VI-NEXT:    s_endpgm
324;
325; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
326; CI:       ; %bb.0:
327; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
328; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
329; CI-NEXT:    s_waitcnt lgkmcnt(0)
330; CI-NEXT:    v_mov_b32_e32 v0, s0
331; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
332; CI-NEXT:    v_mov_b32_e32 v2, s4
333; CI-NEXT:    v_mov_b32_e32 v1, s1
334; CI-NEXT:    s_lshr_b32 s1, s4, 16
335; CI-NEXT:    s_waitcnt lgkmcnt(0)
336; CI-NEXT:    s_lshr_b32 s0, s0, 16
337; CI-NEXT:    v_alignbit_b32 v2, s0, v2, 16
338; CI-NEXT:    flat_store_dword v[0:1], v2
339; CI-NEXT:    ;;#ASMSTART
340; CI-NEXT:    ; use s1
341; CI-NEXT:    ;;#ASMEND
342; CI-NEXT:    ;;#ASMSTART
343; CI-NEXT:    ; use s0
344; CI-NEXT:    ;;#ASMEND
345; CI-NEXT:    s_endpgm
346  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
347  %elt.hi = lshr i32 %elt.arg, 16
348  %elt = trunc i32 %elt.hi to i16
349  %vec.hi = extractelement <2 x i16> %vec, i32 1
350  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
351  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
352  %use1 = zext i16 %elt to i32
353  %vec.hi.use1 = zext i16 %vec.hi to i32
354
355  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
356  call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
357  ret void
358}
359
; Loads a <2 x i16> through the addrspace(4) pointer %vec.ptr, replaces
; element 1 with the constant 999 (0x3e7) and stores the result to %out.
; Per the checks below, the gfx900 run uses s_pack_ll_b32_b16 with 0x3e7 as
; the second source, while the fiji and hawaii runs mask the loaded dword
; with 0xffff and OR in 0x3e70000.
360define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
361; GFX9-LABEL: s_insertelement_v2i16_1:
362; GFX9:       ; %bb.0:
363; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
364; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
365; GFX9-NEXT:    v_mov_b32_e32 v0, s0
366; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
367; GFX9-NEXT:    v_mov_b32_e32 v1, s1
368; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
369; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, 0x3e7
370; GFX9-NEXT:    v_mov_b32_e32 v2, s0
371; GFX9-NEXT:    global_store_dword v[0:1], v2, off
372; GFX9-NEXT:    s_endpgm
373;
374; CIVI-LABEL: s_insertelement_v2i16_1:
375; CIVI:       ; %bb.0:
376; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
377; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
378; CIVI-NEXT:    v_mov_b32_e32 v0, s0
379; CIVI-NEXT:    s_load_dword s0, s[2:3], 0x0
380; CIVI-NEXT:    v_mov_b32_e32 v1, s1
381; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
382; CIVI-NEXT:    s_and_b32 s0, s0, 0xffff
383; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e70000
384; CIVI-NEXT:    v_mov_b32_e32 v2, s0
385; CIVI-NEXT:    flat_store_dword v[0:1], v2
386; CIVI-NEXT:    s_endpgm
387  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
388  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
389  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
390  ret void
391}
392
; Replaces element 1 of the loaded <2 x i16> with the i16 kernel argument
; %elt and stores the result to %out. An unnamed [8 x i32] argument sits
; between the pointer arguments and %elt.
; Per the checks below, the gfx900 run uses s_pack_ll_b32_b16 on two SGPRs,
; while the fiji and hawaii runs shift one value left by 16, mask the loaded
; dword with 0xffff, and OR the two.
393define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
394; GFX9-LABEL: s_insertelement_v2i16_1_reg:
395; GFX9:       ; %bb.0:
396; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
397; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
398; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX9-NEXT:    v_mov_b32_e32 v0, s0
400; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
401; GFX9-NEXT:    v_mov_b32_e32 v1, s1
402; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
403; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
404; GFX9-NEXT:    v_mov_b32_e32 v2, s0
405; GFX9-NEXT:    global_store_dword v[0:1], v2, off
406; GFX9-NEXT:    s_endpgm
407;
408; VI-LABEL: s_insertelement_v2i16_1_reg:
409; VI:       ; %bb.0:
410; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
411; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
412; VI-NEXT:    s_waitcnt lgkmcnt(0)
413; VI-NEXT:    v_mov_b32_e32 v0, s0
414; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
415; VI-NEXT:    v_mov_b32_e32 v1, s1
416; VI-NEXT:    s_lshl_b32 s1, s4, 16
417; VI-NEXT:    s_waitcnt lgkmcnt(0)
418; VI-NEXT:    s_and_b32 s0, s0, 0xffff
419; VI-NEXT:    s_or_b32 s0, s0, s1
420; VI-NEXT:    v_mov_b32_e32 v2, s0
421; VI-NEXT:    flat_store_dword v[0:1], v2
422; VI-NEXT:    s_endpgm
423;
424; CI-LABEL: s_insertelement_v2i16_1_reg:
425; CI:       ; %bb.0:
426; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
427; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
428; CI-NEXT:    s_waitcnt lgkmcnt(0)
429; CI-NEXT:    v_mov_b32_e32 v0, s0
430; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
431; CI-NEXT:    v_mov_b32_e32 v1, s1
432; CI-NEXT:    s_lshl_b32 s1, s4, 16
433; CI-NEXT:    s_waitcnt lgkmcnt(0)
434; CI-NEXT:    s_and_b32 s0, s0, 0xffff
435; CI-NEXT:    s_or_b32 s0, s0, s1
436; CI-NEXT:    v_mov_b32_e32 v2, s0
437; CI-NEXT:    flat_store_dword v[0:1], v2
438; CI-NEXT:    s_endpgm
439  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
440  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
441  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
442  ret void
443}
444
; <2 x half> variant: replaces element 0 of the loaded vector with the
; constant half 5.0, which appears as the immediate 0x4500 in the checks.
; Per the checks below, the gfx900 run shifts the loaded dword right by 16
; and then uses s_pack_ll_b32_b16 with 0x4500, while the fiji and hawaii runs
; mask with 0xffff0000 and OR in 0x4500.
445define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
446; GFX9-LABEL: s_insertelement_v2f16_0:
447; GFX9:       ; %bb.0:
448; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
449; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
450; GFX9-NEXT:    v_mov_b32_e32 v0, s0
451; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
452; GFX9-NEXT:    v_mov_b32_e32 v1, s1
453; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
454; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
455; GFX9-NEXT:    s_pack_ll_b32_b16 s0, 0x4500, s0
456; GFX9-NEXT:    v_mov_b32_e32 v2, s0
457; GFX9-NEXT:    global_store_dword v[0:1], v2, off
458; GFX9-NEXT:    s_endpgm
459;
460; CIVI-LABEL: s_insertelement_v2f16_0:
461; CIVI:       ; %bb.0:
462; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
463; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
464; CIVI-NEXT:    v_mov_b32_e32 v0, s0
465; CIVI-NEXT:    s_load_dword s0, s[2:3], 0x0
466; CIVI-NEXT:    v_mov_b32_e32 v1, s1
467; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
468; CIVI-NEXT:    s_and_b32 s0, s0, 0xffff0000
469; CIVI-NEXT:    s_or_b32 s0, s0, 0x4500
470; CIVI-NEXT:    v_mov_b32_e32 v2, s0
471; CIVI-NEXT:    flat_store_dword v[0:1], v2
472; CIVI-NEXT:    s_endpgm
473  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
474  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
475  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
476  ret void
477}
478
; <2 x half> variant: replaces element 1 of the loaded vector with the
; constant half 5.0 (immediate 0x4500 in the checks).
; Per the checks below, the gfx900 run uses s_pack_ll_b32_b16 with 0x4500 as
; the second source, while the fiji and hawaii runs mask the loaded dword
; with 0xffff and OR in 0x45000000.
479define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
480; GFX9-LABEL: s_insertelement_v2f16_1:
481; GFX9:       ; %bb.0:
482; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
483; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
484; GFX9-NEXT:    v_mov_b32_e32 v0, s0
485; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
486; GFX9-NEXT:    v_mov_b32_e32 v1, s1
487; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
488; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, 0x4500
489; GFX9-NEXT:    v_mov_b32_e32 v2, s0
490; GFX9-NEXT:    global_store_dword v[0:1], v2, off
491; GFX9-NEXT:    s_endpgm
492;
493; CIVI-LABEL: s_insertelement_v2f16_1:
494; CIVI:       ; %bb.0:
495; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
496; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
497; CIVI-NEXT:    v_mov_b32_e32 v0, s0
498; CIVI-NEXT:    s_load_dword s0, s[2:3], 0x0
499; CIVI-NEXT:    v_mov_b32_e32 v1, s1
500; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
501; CIVI-NEXT:    s_and_b32 s0, s0, 0xffff
502; CIVI-NEXT:    s_or_b32 s0, s0, 0x45000000
503; CIVI-NEXT:    v_mov_b32_e32 v2, s0
504; CIVI-NEXT:    flat_store_dword v[0:1], v2
505; CIVI-NEXT:    s_endpgm
506  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
507  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
508  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
509  ret void
510}
511
; Per-workitem variant: both %in and %out (addrspace(1)) are indexed by the
; sign-extended result of llvm.amdgcn.workitem.id.x, so the load and store
; addresses are computed in VGPRs. Element 0 of the loaded <2 x i16> is
; replaced with the constant 999 (0x3e7).
; Per the checks below, the gfx900 run uses v_bfi_b32 with a 0xffff mask and
; 0x3e7 held in an SGPR, while the fiji and hawaii runs use v_and_b32 with
; 0xffff0000 followed by v_or_b32 with 0x3e7.
512define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
513; GFX9-LABEL: v_insertelement_v2i16_0:
514; GFX9:       ; %bb.0:
515; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
516; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
517; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
518; GFX9-NEXT:    v_mov_b32_e32 v1, s3
519; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
520; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
521; GFX9-NEXT:    global_load_dword v0, v[0:1], off
522; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
523; GFX9-NEXT:    v_mov_b32_e32 v3, s1
524; GFX9-NEXT:    s_movk_i32 s0, 0x3e7
525; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
526; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
527; GFX9-NEXT:    s_waitcnt vmcnt(0)
528; GFX9-NEXT:    v_bfi_b32 v0, v1, s0, v0
529; GFX9-NEXT:    global_store_dword v[2:3], v0, off
530; GFX9-NEXT:    s_endpgm
531;
532; VI-LABEL: v_insertelement_v2i16_0:
533; VI:       ; %bb.0:
534; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
535; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
536; VI-NEXT:    s_waitcnt lgkmcnt(0)
537; VI-NEXT:    v_mov_b32_e32 v1, s3
538; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
539; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
540; VI-NEXT:    flat_load_dword v0, v[0:1]
541; VI-NEXT:    v_mov_b32_e32 v3, s1
542; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
543; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
544; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
545; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
546; VI-NEXT:    v_or_b32_e32 v0, 0x3e7, v0
547; VI-NEXT:    flat_store_dword v[2:3], v0
548; VI-NEXT:    s_endpgm
549;
550; CI-LABEL: v_insertelement_v2i16_0:
551; CI:       ; %bb.0:
552; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
553; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
554; CI-NEXT:    s_waitcnt lgkmcnt(0)
555; CI-NEXT:    v_mov_b32_e32 v1, s3
556; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
557; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
558; CI-NEXT:    flat_load_dword v0, v[0:1]
559; CI-NEXT:    v_mov_b32_e32 v3, s1
560; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
561; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
562; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
563; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
564; CI-NEXT:    v_or_b32_e32 v0, 0x3e7, v0
565; CI-NEXT:    flat_store_dword v[2:3], v0
566; CI-NEXT:    s_endpgm
567  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
568  %tid.ext = sext i32 %tid to i64
569  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
570  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
571  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
572  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
573  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
574  ret void
575}
576
; Per-workitem variant (addresses indexed by llvm.amdgcn.workitem.id.x) that
; replaces element 0 of the loaded vector with the high 16 bits of the i32
; kernel argument %elt.arg (lshr by 16, then trunc to i16).
; Per the checks below, the gfx900 run emits v_lshrrev_b32 by 16 and combines
; it with the loaded dword through v_and_or_b32 using a 0xffff0000 mask; the
; fiji and hawaii runs emit s_lshr_b32, v_and_b32 with 0xffff0000, and
; v_or_b32.
577define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
578; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
579; GFX9:       ; %bb.0:
580; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
581; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
582; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
583; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff0000
584; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
585; GFX9-NEXT:    v_mov_b32_e32 v1, s3
586; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
587; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
588; GFX9-NEXT:    global_load_dword v0, v[0:1], off
589; GFX9-NEXT:    v_lshrrev_b32_e64 v1, 16, s4
590; GFX9-NEXT:    v_mov_b32_e32 v3, s1
591; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
592; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
593; GFX9-NEXT:    s_waitcnt vmcnt(0)
594; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v1
595; GFX9-NEXT:    global_store_dword v[2:3], v0, off
596; GFX9-NEXT:    s_endpgm
597;
598; VI-LABEL: v_insertelement_v2i16_0_reghi:
599; VI:       ; %bb.0:
600; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
601; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
602; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
603; VI-NEXT:    s_waitcnt lgkmcnt(0)
604; VI-NEXT:    v_mov_b32_e32 v1, s3
605; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
606; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
607; VI-NEXT:    flat_load_dword v0, v[0:1]
608; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
609; VI-NEXT:    v_mov_b32_e32 v3, s1
610; VI-NEXT:    s_lshr_b32 s0, s4, 16
611; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
612; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
613; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
614; VI-NEXT:    v_or_b32_e32 v0, s0, v0
615; VI-NEXT:    flat_store_dword v[2:3], v0
616; VI-NEXT:    s_endpgm
617;
618; CI-LABEL: v_insertelement_v2i16_0_reghi:
619; CI:       ; %bb.0:
620; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
621; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
622; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
623; CI-NEXT:    s_waitcnt lgkmcnt(0)
624; CI-NEXT:    v_mov_b32_e32 v1, s3
625; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
626; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
627; CI-NEXT:    flat_load_dword v0, v[0:1]
628; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
629; CI-NEXT:    v_mov_b32_e32 v3, s1
630; CI-NEXT:    s_lshr_b32 s0, s4, 16
631; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
632; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
633; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
634; CI-NEXT:    v_or_b32_e32 v0, s0, v0
635; CI-NEXT:    flat_store_dword v[2:3], v0
636; CI-NEXT:    s_endpgm
637  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
638  %tid.ext = sext i32 %tid to i64
639  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
640  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
641  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
642  %elt.hi = lshr i32 %elt.arg, 16
643  %elt = trunc i32 %elt.hi to i16
644  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
645  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
646  ret void
647}
648
; Per-workitem variant (addresses indexed by llvm.amdgcn.workitem.id.x) that
; replaces element 0 of the loaded vector with the small constant 53.
; Per the checks below, 53 appears directly as an instruction operand in
; every run: in v_bfi_b32 (with a 0xffff mask) on gfx900, and in v_or_b32
; after a v_and_b32 with 0xffff0000 on fiji and hawaii. Unlike the 999 case,
; no separate move of the constant into a register is expected.
649define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
650; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm:
651; GFX9:       ; %bb.0:
652; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
653; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
654; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
655; GFX9-NEXT:    v_mov_b32_e32 v1, s3
656; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
657; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
658; GFX9-NEXT:    global_load_dword v0, v[0:1], off
659; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
660; GFX9-NEXT:    v_mov_b32_e32 v3, s1
661; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
662; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
663; GFX9-NEXT:    s_waitcnt vmcnt(0)
664; GFX9-NEXT:    v_bfi_b32 v0, v1, 53, v0
665; GFX9-NEXT:    global_store_dword v[2:3], v0, off
666; GFX9-NEXT:    s_endpgm
667;
668; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
669; VI:       ; %bb.0:
670; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
671; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
672; VI-NEXT:    s_waitcnt lgkmcnt(0)
673; VI-NEXT:    v_mov_b32_e32 v1, s3
674; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
675; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
676; VI-NEXT:    flat_load_dword v0, v[0:1]
677; VI-NEXT:    v_mov_b32_e32 v3, s1
678; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
679; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
680; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
681; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
682; VI-NEXT:    v_or_b32_e32 v0, 53, v0
683; VI-NEXT:    flat_store_dword v[2:3], v0
684; VI-NEXT:    s_endpgm
685;
686; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
687; CI:       ; %bb.0:
688; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
689; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
690; CI-NEXT:    s_waitcnt lgkmcnt(0)
691; CI-NEXT:    v_mov_b32_e32 v1, s3
692; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
693; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
694; CI-NEXT:    flat_load_dword v0, v[0:1]
695; CI-NEXT:    v_mov_b32_e32 v3, s1
696; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
697; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
698; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
699; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
700; CI-NEXT:    v_or_b32_e32 v0, 53, v0
701; CI-NEXT:    flat_store_dword v[2:3], v0
702; CI-NEXT:    s_endpgm
703  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
704  %tid.ext = sext i32 %tid to i64
705  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
706  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
707  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
708  %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
709  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
710  ret void
711}
712
713; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
; Per-workitem variant (addresses indexed by llvm.amdgcn.workitem.id.x) that
; replaces element 1 of the loaded vector with the constant 999 (0x3e7).
; Per the checks below, the gfx900 run masks the loaded dword with 0xffff and
; then uses v_lshl_or_b32 with 0x3e7 (held in an SGPR) shifted left by 16,
; i.e. the constant shift is not pre-folded into a single immediate. The fiji
; run uses v_or_b32_sdwa with 0x3e70000 and a WORD_0 select on the loaded
; value; the hawaii run uses v_and_b32 with 0xffff and v_or_b32 with
; 0x3e70000.
714define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
715; GFX9-LABEL: v_insertelement_v2i16_1:
716; GFX9:       ; %bb.0:
717; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
718; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
719; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
720; GFX9-NEXT:    v_mov_b32_e32 v1, s3
721; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
722; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
723; GFX9-NEXT:    global_load_dword v0, v[0:1], off
724; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
725; GFX9-NEXT:    v_mov_b32_e32 v3, s1
726; GFX9-NEXT:    s_movk_i32 s0, 0x3e7
727; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
728; GFX9-NEXT:    s_waitcnt vmcnt(0)
729; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
730; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
731; GFX9-NEXT:    global_store_dword v[2:3], v0, off
732; GFX9-NEXT:    s_endpgm
733;
734; VI-LABEL: v_insertelement_v2i16_1:
735; VI:       ; %bb.0:
736; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
737; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
738; VI-NEXT:    s_waitcnt lgkmcnt(0)
739; VI-NEXT:    v_mov_b32_e32 v1, s3
740; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
741; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
742; VI-NEXT:    flat_load_dword v0, v[0:1]
743; VI-NEXT:    v_mov_b32_e32 v1, 0x3e70000
744; VI-NEXT:    v_mov_b32_e32 v3, s1
745; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
746; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
747; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
748; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
749; VI-NEXT:    flat_store_dword v[2:3], v0
750; VI-NEXT:    s_endpgm
751;
752; CI-LABEL: v_insertelement_v2i16_1:
753; CI:       ; %bb.0:
754; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
755; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
756; CI-NEXT:    s_waitcnt lgkmcnt(0)
757; CI-NEXT:    v_mov_b32_e32 v1, s3
758; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
759; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
760; CI-NEXT:    flat_load_dword v0, v[0:1]
761; CI-NEXT:    v_mov_b32_e32 v3, s1
762; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
763; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
764; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
765; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
766; CI-NEXT:    v_or_b32_e32 v0, 0x3e70000, v0
767; CI-NEXT:    flat_store_dword v[2:3], v0
768; CI-NEXT:    s_endpgm
769  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
770  %tid.ext = sext i32 %tid to i64
771  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
772  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
773  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
774  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
775  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
776  ret void
777}
778
; Same shape as the element-1 i16 insert with 999, but the inserted constant
; is -15. Each workitem loads a <2 x i16> from %in, replaces element 1 with
; -15, and stores to %out. The GFX9 checks expect -15 to appear directly as an
; operand of v_lshl_or_b32 (no separate register materialization), while the
; VI and CI checks expect the pre-shifted literal 0xfff10000.
779define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
780; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm:
781; GFX9:       ; %bb.0:
782; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
783; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
784; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
785; GFX9-NEXT:    v_mov_b32_e32 v1, s3
786; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
787; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
788; GFX9-NEXT:    global_load_dword v0, v[0:1], off
789; GFX9-NEXT:    v_mov_b32_e32 v3, s1
790; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
791; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
792; GFX9-NEXT:    s_waitcnt vmcnt(0)
793; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
794; GFX9-NEXT:    v_lshl_or_b32 v0, -15, 16, v0
795; GFX9-NEXT:    global_store_dword v[2:3], v0, off
796; GFX9-NEXT:    s_endpgm
797;
798; VI-LABEL: v_insertelement_v2i16_1_inlineimm:
799; VI:       ; %bb.0:
800; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
801; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
802; VI-NEXT:    s_waitcnt lgkmcnt(0)
803; VI-NEXT:    v_mov_b32_e32 v1, s3
804; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
805; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
806; VI-NEXT:    flat_load_dword v0, v[0:1]
807; VI-NEXT:    v_mov_b32_e32 v1, 0xfff10000
808; VI-NEXT:    v_mov_b32_e32 v3, s1
809; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
810; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
811; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
812; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
813; VI-NEXT:    flat_store_dword v[2:3], v0
814; VI-NEXT:    s_endpgm
815;
816; CI-LABEL: v_insertelement_v2i16_1_inlineimm:
817; CI:       ; %bb.0:
818; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
819; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
820; CI-NEXT:    s_waitcnt lgkmcnt(0)
821; CI-NEXT:    v_mov_b32_e32 v1, s3
822; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
823; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
824; CI-NEXT:    flat_load_dword v0, v[0:1]
825; CI-NEXT:    v_mov_b32_e32 v3, s1
826; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
827; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
828; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
829; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
830; CI-NEXT:    v_or_b32_e32 v0, 0xfff10000, v0
831; CI-NEXT:    flat_store_dword v[2:3], v0
832; CI-NEXT:    s_endpgm
833  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
834  %tid.ext = sext i32 %tid to i64
835  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
836  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
837  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
838  %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1
839  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
840  ret void
841}
842
; Constant insert into the low element of a <2 x half> loaded per workitem.
; Each workitem (indexed by llvm.amdgcn.workitem.id.x) loads a <2 x half>
; from %in, replaces element 0 with half 5.0 (bit pattern 0x4500 in the
; checks), and stores the result to %out. The checks expect the high 16 bits
; of the loaded dword to be preserved: GFX9 shifts them down and recombines
; with v_lshl_or_b32, while VI and CI mask with 0xffff0000 and or in 0x4500.
843define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
844; GFX9-LABEL: v_insertelement_v2f16_0:
845; GFX9:       ; %bb.0:
846; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
847; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
848; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
849; GFX9-NEXT:    v_mov_b32_e32 v1, s3
850; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
851; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
852; GFX9-NEXT:    global_load_dword v0, v[0:1], off
853; GFX9-NEXT:    v_mov_b32_e32 v3, s1
854; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
855; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4500
856; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
857; GFX9-NEXT:    s_waitcnt vmcnt(0)
858; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
859; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
860; GFX9-NEXT:    global_store_dword v[2:3], v0, off
861; GFX9-NEXT:    s_endpgm
862;
863; VI-LABEL: v_insertelement_v2f16_0:
864; VI:       ; %bb.0:
865; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
866; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
867; VI-NEXT:    s_waitcnt lgkmcnt(0)
868; VI-NEXT:    v_mov_b32_e32 v1, s3
869; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
870; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
871; VI-NEXT:    flat_load_dword v0, v[0:1]
872; VI-NEXT:    v_mov_b32_e32 v3, s1
873; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
874; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
875; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
876; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
877; VI-NEXT:    v_or_b32_e32 v0, 0x4500, v0
878; VI-NEXT:    flat_store_dword v[2:3], v0
879; VI-NEXT:    s_endpgm
880;
881; CI-LABEL: v_insertelement_v2f16_0:
882; CI:       ; %bb.0:
883; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
884; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
885; CI-NEXT:    s_waitcnt lgkmcnt(0)
886; CI-NEXT:    v_mov_b32_e32 v1, s3
887; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
888; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
889; CI-NEXT:    flat_load_dword v0, v[0:1]
890; CI-NEXT:    v_mov_b32_e32 v3, s1
891; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
892; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
893; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
894; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
895; CI-NEXT:    v_or_b32_e32 v0, 0x4500, v0
896; CI-NEXT:    flat_store_dword v[2:3], v0
897; CI-NEXT:    s_endpgm
898  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
899  %tid.ext = sext i32 %tid to i64
900  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
901  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
902  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
903  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
904  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
905  ret void
906}
907
; Same shape as the element-0 half insert with 5.0, but the inserted constant
; is the half bit pattern 0xH0035 (printed as 53 in the checks). Each workitem
; loads a <2 x half> from %in, replaces element 0, and stores to %out. The
; checks expect 53 to be used directly as an instruction operand on every
; target rather than being moved into a register first.
908define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
909; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm:
910; GFX9:       ; %bb.0:
911; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
912; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
913; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
914; GFX9-NEXT:    v_mov_b32_e32 v1, s3
915; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
916; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
917; GFX9-NEXT:    global_load_dword v0, v[0:1], off
918; GFX9-NEXT:    v_mov_b32_e32 v3, s1
919; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
920; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
921; GFX9-NEXT:    s_waitcnt vmcnt(0)
922; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
923; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, 53
924; GFX9-NEXT:    global_store_dword v[2:3], v0, off
925; GFX9-NEXT:    s_endpgm
926;
927; VI-LABEL: v_insertelement_v2f16_0_inlineimm:
928; VI:       ; %bb.0:
929; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
930; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
931; VI-NEXT:    s_waitcnt lgkmcnt(0)
932; VI-NEXT:    v_mov_b32_e32 v1, s3
933; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
934; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
935; VI-NEXT:    flat_load_dword v0, v[0:1]
936; VI-NEXT:    v_mov_b32_e32 v3, s1
937; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
938; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
939; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
940; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
941; VI-NEXT:    v_or_b32_e32 v0, 53, v0
942; VI-NEXT:    flat_store_dword v[2:3], v0
943; VI-NEXT:    s_endpgm
944;
945; CI-LABEL: v_insertelement_v2f16_0_inlineimm:
946; CI:       ; %bb.0:
947; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
948; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
949; CI-NEXT:    s_waitcnt lgkmcnt(0)
950; CI-NEXT:    v_mov_b32_e32 v1, s3
951; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
952; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
953; CI-NEXT:    flat_load_dword v0, v[0:1]
954; CI-NEXT:    v_mov_b32_e32 v3, s1
955; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
956; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
957; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
958; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
959; CI-NEXT:    v_or_b32_e32 v0, 53, v0
960; CI-NEXT:    flat_store_dword v[2:3], v0
961; CI-NEXT:    s_endpgm
962  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
963  %tid.ext = sext i32 %tid to i64
964  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
965  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
966  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
967  %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0
968  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
969  ret void
970}
971
; Constant insert into the high element of a <2 x half> loaded per workitem.
; Each workitem (indexed by llvm.amdgcn.workitem.id.x) loads a <2 x half>
; from %in, replaces element 1 with half 5.0 (bit pattern 0x4500 in the
; checks), and stores the result to %out. The checks expect the low 16 bits
; of the loaded dword to be preserved and 0x4500 to land in the high 16 bits
; (v_lshl_or_b32 on GFX9; the pre-shifted literal 0x45000000 on VI and CI).
972define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
973; GFX9-LABEL: v_insertelement_v2f16_1:
974; GFX9:       ; %bb.0:
975; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
976; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
977; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
978; GFX9-NEXT:    v_mov_b32_e32 v1, s3
979; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
980; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
981; GFX9-NEXT:    global_load_dword v0, v[0:1], off
982; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
983; GFX9-NEXT:    v_mov_b32_e32 v3, s1
984; GFX9-NEXT:    s_movk_i32 s0, 0x4500
985; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
986; GFX9-NEXT:    s_waitcnt vmcnt(0)
987; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
988; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
989; GFX9-NEXT:    global_store_dword v[2:3], v0, off
990; GFX9-NEXT:    s_endpgm
991;
992; VI-LABEL: v_insertelement_v2f16_1:
993; VI:       ; %bb.0:
994; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
995; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
996; VI-NEXT:    s_waitcnt lgkmcnt(0)
997; VI-NEXT:    v_mov_b32_e32 v1, s3
998; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
999; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1000; VI-NEXT:    flat_load_dword v0, v[0:1]
1001; VI-NEXT:    v_mov_b32_e32 v1, 0x45000000
1002; VI-NEXT:    v_mov_b32_e32 v3, s1
1003; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1004; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1005; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1006; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1007; VI-NEXT:    flat_store_dword v[2:3], v0
1008; VI-NEXT:    s_endpgm
1009;
1010; CI-LABEL: v_insertelement_v2f16_1:
1011; CI:       ; %bb.0:
1012; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1013; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1014; CI-NEXT:    s_waitcnt lgkmcnt(0)
1015; CI-NEXT:    v_mov_b32_e32 v1, s3
1016; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1017; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1018; CI-NEXT:    flat_load_dword v0, v[0:1]
1019; CI-NEXT:    v_mov_b32_e32 v3, s1
1020; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1021; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1022; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1023; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1024; CI-NEXT:    v_or_b32_e32 v0, 0x45000000, v0
1025; CI-NEXT:    flat_store_dword v[2:3], v0
1026; CI-NEXT:    s_endpgm
1027  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1028  %tid.ext = sext i32 %tid to i64
1029  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1030  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1031  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
1032  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
1033  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
1034  ret void
1035}
1036
; Same shape as the element-1 half insert with 5.0, but the inserted constant
; is the half bit pattern 0xH0023 (printed as 35 in the GFX9 checks). Each
; workitem loads a <2 x half> from %in, replaces element 1, and stores to
; %out. The GFX9 checks expect 35 directly as an operand of v_lshl_or_b32;
; the VI and CI checks expect the pre-shifted literal 0x230000.
1037define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
1038; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm:
1039; GFX9:       ; %bb.0:
1040; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1041; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1042; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1043; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1044; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
1045; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1046; GFX9-NEXT:    global_load_dword v0, v[0:1], off
1047; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1048; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
1049; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1050; GFX9-NEXT:    s_waitcnt vmcnt(0)
1051; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1052; GFX9-NEXT:    v_lshl_or_b32 v0, 35, 16, v0
1053; GFX9-NEXT:    global_store_dword v[2:3], v0, off
1054; GFX9-NEXT:    s_endpgm
1055;
1056; VI-LABEL: v_insertelement_v2f16_1_inlineimm:
1057; VI:       ; %bb.0:
1058; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1059; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1060; VI-NEXT:    s_waitcnt lgkmcnt(0)
1061; VI-NEXT:    v_mov_b32_e32 v1, s3
1062; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1063; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1064; VI-NEXT:    flat_load_dword v0, v[0:1]
1065; VI-NEXT:    v_mov_b32_e32 v1, 0x230000
1066; VI-NEXT:    v_mov_b32_e32 v3, s1
1067; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1068; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1069; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1070; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1071; VI-NEXT:    flat_store_dword v[2:3], v0
1072; VI-NEXT:    s_endpgm
1073;
1074; CI-LABEL: v_insertelement_v2f16_1_inlineimm:
1075; CI:       ; %bb.0:
1076; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1077; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1078; CI-NEXT:    s_waitcnt lgkmcnt(0)
1079; CI-NEXT:    v_mov_b32_e32 v1, s3
1080; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1081; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1082; CI-NEXT:    flat_load_dword v0, v[0:1]
1083; CI-NEXT:    v_mov_b32_e32 v3, s1
1084; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1085; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1086; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1087; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1088; CI-NEXT:    v_or_b32_e32 v0, 0x230000, v0
1089; CI-NEXT:    flat_store_dword v[2:3], v0
1090; CI-NEXT:    s_endpgm
1091  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1092  %tid.ext = sext i32 %tid to i64
1093  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1094  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1095  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
1096  %vecins = insertelement <2 x half> %vec, i16 0, i32 1
1097  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
1098  ret void
1099}
1100
1101; FIXME: Enable for others when argument load not split
; Insert of constant 999 (0x3e7) into a <2 x i16> at a run-time index, with
; both the vector and the index loaded through addrspace(4) pointers (scalar
; s_load_dword in the checks). The index load is volatile. The checks expect
; the index to be scaled by 16 (shift left by 4), used to shift the mask
; 0xffff into position, and then v_bfi_b32 to pick the splatted constant
; 0x3e703e7 under that mask and the original vector bits elsewhere.
1102define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 {
1103; GFX9-LABEL: s_insertelement_v2i16_dynamic:
1104; GFX9:       ; %bb.0:
1105; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1106; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
1107; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
1108; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1109; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1110; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1111; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
1112; GFX9-NEXT:    s_load_dword s1, s[2:3], 0x0
1113; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1114; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
1115; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
1116; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1117; GFX9-NEXT:    v_bfi_b32 v2, s0, v2, v3
1118; GFX9-NEXT:    global_store_dword v[0:1], v2, off
1119; GFX9-NEXT:    s_endpgm
1120;
1121; VI-LABEL: s_insertelement_v2i16_dynamic:
1122; VI:       ; %bb.0:
1123; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1124; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
1125; VI-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
1126; VI-NEXT:    s_waitcnt lgkmcnt(0)
1127; VI-NEXT:    v_mov_b32_e32 v0, s0
1128; VI-NEXT:    v_mov_b32_e32 v1, s1
1129; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
1130; VI-NEXT:    s_load_dword s1, s[2:3], 0x0
1131; VI-NEXT:    s_waitcnt lgkmcnt(0)
1132; VI-NEXT:    s_lshl_b32 s0, s0, 4
1133; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1134; VI-NEXT:    v_mov_b32_e32 v3, s1
1135; VI-NEXT:    v_bfi_b32 v2, s0, v2, v3
1136; VI-NEXT:    flat_store_dword v[0:1], v2
1137; VI-NEXT:    s_endpgm
1138;
1139; CI-LABEL: s_insertelement_v2i16_dynamic:
1140; CI:       ; %bb.0:
1141; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1142; CI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
1143; CI-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
1144; CI-NEXT:    s_waitcnt lgkmcnt(0)
1145; CI-NEXT:    v_mov_b32_e32 v0, s0
1146; CI-NEXT:    v_mov_b32_e32 v1, s1
1147; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
1148; CI-NEXT:    s_load_dword s1, s[2:3], 0x0
1149; CI-NEXT:    s_waitcnt lgkmcnt(0)
1150; CI-NEXT:    s_lshl_b32 s0, s0, 4
1151; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1152; CI-NEXT:    v_mov_b32_e32 v3, s1
1153; CI-NEXT:    v_bfi_b32 v2, s0, v2, v3
1154; CI-NEXT:    flat_store_dword v[0:1], v2
1155; CI-NEXT:    s_endpgm
1156  %idx = load volatile i32, i32 addrspace(4)* %idx.ptr
1157  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
1158  %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1159  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
1160  ret void
1161}
1162
; Insert of constant 999 (0x3e7) into a per-workitem <2 x i16> at a run-time
; index that arrives as the kernel argument %idx (loaded with s_load_dword
; in the checks). The vector is loaded from %in and stored to %out at the
; workitem's index. The checks expect the mask 0xffff << (%idx * 16) to be
; computed with scalar shifts and v_bfi_b32 to select the splatted constant
; 0x3e703e7 under that mask and the loaded vector bits elsewhere.
1163define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 {
1164; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1165; GFX9:       ; %bb.0:
1166; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1167; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
1168; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1169; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1170; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1171; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
1172; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1173; GFX9-NEXT:    global_load_dword v0, v[0:1], off
1174; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
1175; GFX9-NEXT:    s_lshl_b32 s0, s4, 4
1176; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1177; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
1178; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3e703e7
1179; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1180; GFX9-NEXT:    s_waitcnt vmcnt(0)
1181; GFX9-NEXT:    v_bfi_b32 v0, s0, v1, v0
1182; GFX9-NEXT:    global_store_dword v[2:3], v0, off
1183; GFX9-NEXT:    s_endpgm
1184;
1185; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1186; VI:       ; %bb.0:
1187; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1188; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1189; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1190; VI-NEXT:    s_waitcnt lgkmcnt(0)
1191; VI-NEXT:    v_mov_b32_e32 v1, s3
1192; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1193; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1194; VI-NEXT:    flat_load_dword v0, v[0:1]
1195; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1196; VI-NEXT:    s_lshl_b32 s0, s4, 4
1197; VI-NEXT:    v_mov_b32_e32 v3, s1
1198; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1199; VI-NEXT:    v_mov_b32_e32 v1, 0x3e703e7
1200; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1201; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1202; VI-NEXT:    v_bfi_b32 v0, s0, v1, v0
1203; VI-NEXT:    flat_store_dword v[2:3], v0
1204; VI-NEXT:    s_endpgm
1205;
1206; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1207; CI:       ; %bb.0:
1208; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1209; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1210; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1211; CI-NEXT:    s_waitcnt lgkmcnt(0)
1212; CI-NEXT:    v_mov_b32_e32 v1, s3
1213; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1214; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1215; CI-NEXT:    flat_load_dword v0, v[0:1]
1216; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1217; CI-NEXT:    s_lshl_b32 s0, s4, 4
1218; CI-NEXT:    v_mov_b32_e32 v3, s1
1219; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1220; CI-NEXT:    v_mov_b32_e32 v1, 0x3e703e7
1221; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1222; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1223; CI-NEXT:    v_bfi_b32 v0, s0, v1, v0
1224; CI-NEXT:    flat_store_dword v[2:3], v0
1225; CI-NEXT:    s_endpgm
1226  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1227  %tid.ext = sext i32 %tid to i64
1228  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1229  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1230  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
1231  %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1232  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
1233  ret void
1234}
1235
; Insert of the half constant 0xH1234 into a per-workitem <2 x half> at a
; run-time index that is itself loaded per workitem from %idx.ptr. Both the
; vector (%in) and the index (%idx.ptr) are addressed by
; llvm.amdgcn.workitem.id.x; the result is stored to %out at the same index.
; The checks expect the index to be shifted left by 4, the mask 0xffff to be
; shifted by that amount with a vector shift, and v_bfi_b32 to select the
; splatted constant 0x12341234 under the mask and the loaded vector elsewhere.
1236define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
1237; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1238; GFX9:       ; %bb.0:
1239; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1240; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
1241; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1242; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1243; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1244; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v4
1245; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1246; GFX9-NEXT:    v_mov_b32_e32 v3, s5
1247; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s4, v4
1248; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1249; GFX9-NEXT:    global_load_dword v0, v[0:1], off
1250; GFX9-NEXT:    global_load_dword v1, v[2:3], off
1251; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v4
1252; GFX9-NEXT:    s_mov_b32 s0, 0xffff
1253; GFX9-NEXT:    v_mov_b32_e32 v5, s1
1254; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
1255; GFX9-NEXT:    s_waitcnt vmcnt(0)
1256; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1257; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
1258; GFX9-NEXT:    s_mov_b32 s0, 0x12341234
1259; GFX9-NEXT:    v_bfi_b32 v0, v1, s0, v0
1260; GFX9-NEXT:    global_store_dword v[4:5], v0, off
1261; GFX9-NEXT:    s_endpgm
1262;
1263; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1264; VI:       ; %bb.0:
1265; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1266; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
1267; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1268; VI-NEXT:    s_waitcnt lgkmcnt(0)
1269; VI-NEXT:    v_mov_b32_e32 v1, s3
1270; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1271; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1272; VI-NEXT:    v_mov_b32_e32 v3, s5
1273; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
1274; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1275; VI-NEXT:    flat_load_dword v0, v[0:1]
1276; VI-NEXT:    flat_load_dword v1, v[2:3]
1277; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
1278; VI-NEXT:    s_mov_b32 s0, 0xffff
1279; VI-NEXT:    v_mov_b32_e32 v5, s1
1280; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1281; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1282; VI-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1283; VI-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
1284; VI-NEXT:    s_mov_b32 s0, 0x12341234
1285; VI-NEXT:    v_bfi_b32 v0, v1, s0, v0
1286; VI-NEXT:    flat_store_dword v[4:5], v0
1287; VI-NEXT:    s_endpgm
1288;
1289; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1290; CI:       ; %bb.0:
1291; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1292; CI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
1293; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1294; CI-NEXT:    s_waitcnt lgkmcnt(0)
1295; CI-NEXT:    v_mov_b32_e32 v1, s3
1296; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
1297; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1298; CI-NEXT:    v_mov_b32_e32 v3, s5
1299; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
1300; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1301; CI-NEXT:    flat_load_dword v2, v[2:3]
1302; CI-NEXT:    flat_load_dword v0, v[0:1]
1303; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
1304; CI-NEXT:    v_mov_b32_e32 v5, s1
1305; CI-NEXT:    s_mov_b32 s0, 0x12341234
1306; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1307; CI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
1308; CI-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
1309; CI-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
1310; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1311; CI-NEXT:    v_bfi_b32 v0, v1, s0, v0
1312; CI-NEXT:    flat_store_dword v[4:5], v0
1313; CI-NEXT:    s_endpgm
1314  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1315  %tid.ext = sext i32 %tid to i64
1316  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1317  %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
1318  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1319  %idx = load i32, i32 addrspace(1)* %idx.gep
1320  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
1321  %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx
1322  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
1323  ret void
1324}
1325
; Insert of a run-time half value into element 0 of a per-workitem
; <4 x half>. The value comes from the kernel argument %val, truncated to i16
; and bitcast to half; an unnamed [8 x i32] argument sits between the
; pointers and %val, and the checks load %val from kernarg offset 0x30 (0xc
; in the CI checks). The checks expect only the first dword of the loaded
; 64-bit vector to be modified: GFX9 uses v_bfi_b32 with mask 0xffff, while
; VI and CI mask %val to 16 bits and combine it with the dword's high half.
1326define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
1327; GFX9-LABEL: v_insertelement_v4f16_0:
1328; GFX9:       ; %bb.0:
1329; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1330; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
1331; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1332; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
1333; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1334; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1335; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
1336; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1337; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1338; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1339; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
1340; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1341; GFX9-NEXT:    s_waitcnt vmcnt(0)
1342; GFX9-NEXT:    v_bfi_b32 v0, v4, s4, v0
1343; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1344; GFX9-NEXT:    s_endpgm
1345;
1346; VI-LABEL: v_insertelement_v4f16_0:
1347; VI:       ; %bb.0:
1348; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1349; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
1350; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1351; VI-NEXT:    s_waitcnt lgkmcnt(0)
1352; VI-NEXT:    v_mov_b32_e32 v1, s3
1353; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1354; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1355; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1356; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1357; VI-NEXT:    v_mov_b32_e32 v3, s1
1358; VI-NEXT:    s_and_b32 s0, s4, 0xffff
1359; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1360; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1361; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
1362; VI-NEXT:    v_or_b32_e32 v0, s0, v0
1363; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1364; VI-NEXT:    s_endpgm
1365;
1366; CI-LABEL: v_insertelement_v4f16_0:
1367; CI:       ; %bb.0:
1368; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1369; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
1370; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1371; CI-NEXT:    s_waitcnt lgkmcnt(0)
1372; CI-NEXT:    v_mov_b32_e32 v1, s3
1373; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1374; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1375; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1376; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1377; CI-NEXT:    v_mov_b32_e32 v3, s1
1378; CI-NEXT:    s_and_b32 s0, s4, 0xffff
1379; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1380; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1381; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
1382; CI-NEXT:    v_or_b32_e32 v0, s0, v0
1383; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1384; CI-NEXT:    s_endpgm
1385  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1386  %tid.ext = sext i32 %tid to i64
1387  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1388  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1389  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
1390  %val.trunc = trunc i32 %val to i16
1391  %val.cvt = bitcast i16 %val.trunc to half
1392  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0
1393  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1394  ret void
1395}
1396
; Insert of a run-time half value into element 1 of a per-workitem
; <4 x half>. The value comes from the kernel argument %val, truncated to i16
; and bitcast to half; here %val directly follows the two pointers and the
; checks load it from kernarg offset 0x10 (0x4 in the CI checks). The checks
; expect only the first dword of the loaded 64-bit vector to be modified:
; its low 16 bits are kept and %val is shifted left by 16 into the high half
; (v_lshl_or_b32 on GFX9, an SDWA v_or_b32 on VI, and/or on CI).
1397define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
1398; GFX9-LABEL: v_insertelement_v4f16_1:
1399; GFX9:       ; %bb.0:
1400; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1401; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
1402; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1403; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1404; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1405; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
1406; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1407; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1408; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1409; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
1410; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1411; GFX9-NEXT:    s_waitcnt vmcnt(0)
1412; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1413; GFX9-NEXT:    v_lshl_or_b32 v0, s4, 16, v0
1414; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1415; GFX9-NEXT:    s_endpgm
1416;
1417; VI-LABEL: v_insertelement_v4f16_1:
1418; VI:       ; %bb.0:
1419; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1420; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1421; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1422; VI-NEXT:    s_waitcnt lgkmcnt(0)
1423; VI-NEXT:    v_mov_b32_e32 v1, s3
1424; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1425; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1426; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1427; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1428; VI-NEXT:    s_lshl_b32 s0, s4, 16
1429; VI-NEXT:    v_mov_b32_e32 v3, s1
1430; VI-NEXT:    v_mov_b32_e32 v4, s0
1431; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1432; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1433; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1434; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1435; VI-NEXT:    s_endpgm
1436;
1437; CI-LABEL: v_insertelement_v4f16_1:
1438; CI:       ; %bb.0:
1439; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1440; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1441; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1442; CI-NEXT:    s_waitcnt lgkmcnt(0)
1443; CI-NEXT:    v_mov_b32_e32 v1, s3
1444; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1445; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1446; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1447; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1448; CI-NEXT:    v_mov_b32_e32 v3, s1
1449; CI-NEXT:    s_lshl_b32 s0, s4, 16
1450; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1451; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1452; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1453; CI-NEXT:    v_or_b32_e32 v0, s0, v0
1454; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1455; CI-NEXT:    s_endpgm
1456  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1457  %tid.ext = sext i32 %tid to i64
1458  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1459  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1460  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
1461  %val.trunc = trunc i32 %val to i16
1462  %val.cvt = bitcast i16 %val.trunc to half
1463  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1
1464  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1465  ret void
1466}
1467
; Static insert into element 2 of a <4 x half>.
; Each invocation indexes %in and %out with the sign-extended result of
; @llvm.amdgcn.workitem.id.x(), loads one <4 x half>, replaces element 2 with
; the low 16 bits of %val reinterpreted as half, and stores the vector back.
; The signature carries an unnamed [8 x i32] argument ahead of %val, and the
; scalar load feeding the inserted value uses offset 0x30 (0xc in the CI
; checks) rather than the 0x10 / 0x4 seen in the neighbouring tests.
; The generated checks only rewrite the low 16 bits of v1; v0 is stored as loaded.
1468define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
1469; GFX9-LABEL: v_insertelement_v4f16_2:
1470; GFX9:       ; %bb.0:
1471; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1472; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
1473; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1474; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
1475; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1476; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1477; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
1478; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1479; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1480; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1481; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
1482; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1483; GFX9-NEXT:    s_waitcnt vmcnt(0)
1484; GFX9-NEXT:    v_bfi_b32 v1, v4, s4, v1
1485; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1486; GFX9-NEXT:    s_endpgm
1487;
1488; VI-LABEL: v_insertelement_v4f16_2:
1489; VI:       ; %bb.0:
1490; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1491; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
1492; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1493; VI-NEXT:    s_waitcnt lgkmcnt(0)
1494; VI-NEXT:    v_mov_b32_e32 v1, s3
1495; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1496; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1497; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1498; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1499; VI-NEXT:    v_mov_b32_e32 v3, s1
1500; VI-NEXT:    s_and_b32 s0, s4, 0xffff
1501; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1502; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1503; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
1504; VI-NEXT:    v_or_b32_e32 v1, s0, v1
1505; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1506; VI-NEXT:    s_endpgm
1507;
1508; CI-LABEL: v_insertelement_v4f16_2:
1509; CI:       ; %bb.0:
1510; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1511; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
1512; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1513; CI-NEXT:    s_waitcnt lgkmcnt(0)
1514; CI-NEXT:    v_mov_b32_e32 v1, s3
1515; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1516; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1517; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1518; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1519; CI-NEXT:    v_mov_b32_e32 v3, s1
1520; CI-NEXT:    s_and_b32 s0, s4, 0xffff
1521; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1522; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1523; CI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
1524; CI-NEXT:    v_or_b32_e32 v1, s0, v1
1525; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1526; CI-NEXT:    s_endpgm
  ; Address the <4 x half> slot selected by this invocation's id in both buffers.
1527  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1528  %tid.ext = sext i32 %tid to i64
1529  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1530  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1531  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Keep only the low 16 bits of %val and reinterpret (not convert) them as half.
1532  %val.trunc = trunc i32 %val to i16
1533  %val.cvt = bitcast i16 %val.trunc to half
  ; Constant index 2 is the element under test.
1534  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2
1535  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1536  ret void
1537}
1538
; Static insert into element 3 (the last element) of a <4 x half>.
; Each invocation indexes %in and %out with the sign-extended result of
; @llvm.amdgcn.workitem.id.x(), loads one <4 x half>, replaces element 3 with
; the low 16 bits of %val reinterpreted as half, and stores the vector back.
; The generated checks keep the low 16 bits of v1 and place the new value,
; shifted left by 16, in its high half; v0 is stored as loaded.
1539define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
1540; GFX9-LABEL: v_insertelement_v4f16_3:
1541; GFX9:       ; %bb.0:
1542; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1543; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
1544; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1545; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1546; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1547; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
1548; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1549; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1550; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1551; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
1552; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1553; GFX9-NEXT:    s_waitcnt vmcnt(0)
1554; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1555; GFX9-NEXT:    v_lshl_or_b32 v1, s4, 16, v1
1556; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1557; GFX9-NEXT:    s_endpgm
1558;
1559; VI-LABEL: v_insertelement_v4f16_3:
1560; VI:       ; %bb.0:
1561; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1562; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1563; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1564; VI-NEXT:    s_waitcnt lgkmcnt(0)
1565; VI-NEXT:    v_mov_b32_e32 v1, s3
1566; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1567; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1568; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1569; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1570; VI-NEXT:    s_lshl_b32 s0, s4, 16
1571; VI-NEXT:    v_mov_b32_e32 v3, s1
1572; VI-NEXT:    v_mov_b32_e32 v4, s0
1573; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1574; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1575; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1576; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1577; VI-NEXT:    s_endpgm
1578;
1579; CI-LABEL: v_insertelement_v4f16_3:
1580; CI:       ; %bb.0:
1581; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1582; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1583; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1584; CI-NEXT:    s_waitcnt lgkmcnt(0)
1585; CI-NEXT:    v_mov_b32_e32 v1, s3
1586; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1587; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1588; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1589; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1590; CI-NEXT:    v_mov_b32_e32 v3, s1
1591; CI-NEXT:    s_lshl_b32 s0, s4, 16
1592; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1593; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1594; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1595; CI-NEXT:    v_or_b32_e32 v1, s0, v1
1596; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1597; CI-NEXT:    s_endpgm
  ; Address the <4 x half> slot selected by this invocation's id in both buffers.
1598  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1599  %tid.ext = sext i32 %tid to i64
1600  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1601  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1602  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Keep only the low 16 bits of %val and reinterpret (not convert) them as half.
1603  %val.trunc = trunc i32 %val to i16
1604  %val.cvt = bitcast i16 %val.trunc to half
  ; Constant index 3 is the element under test.
1605  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
1606  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1607  ret void
1608}
1609
; Static insert into element 2 of a <4 x i16> (integer counterpart of the
; <4 x half> element-2 test).
; Each invocation indexes %in and %out with the sign-extended result of
; @llvm.amdgcn.workitem.id.x(), loads one <4 x i16>, replaces element 2 with
; the low 16 bits of %val, and stores the vector back.
; The generated checks only rewrite the low 16 bits of v1; v0 is stored as loaded.
1610define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
1611; GFX9-LABEL: v_insertelement_v4i16_2:
1612; GFX9:       ; %bb.0:
1613; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1614; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
1615; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1616; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
1617; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1618; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1619; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
1620; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1621; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1622; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1623; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
1624; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1625; GFX9-NEXT:    s_waitcnt vmcnt(0)
1626; GFX9-NEXT:    v_bfi_b32 v1, v4, s4, v1
1627; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1628; GFX9-NEXT:    s_endpgm
1629;
1630; VI-LABEL: v_insertelement_v4i16_2:
1631; VI:       ; %bb.0:
1632; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1633; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1634; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1635; VI-NEXT:    s_waitcnt lgkmcnt(0)
1636; VI-NEXT:    v_mov_b32_e32 v1, s3
1637; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1638; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1639; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1640; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1641; VI-NEXT:    v_mov_b32_e32 v3, s1
1642; VI-NEXT:    s_and_b32 s0, s4, 0xffff
1643; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1644; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1645; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
1646; VI-NEXT:    v_or_b32_e32 v1, s0, v1
1647; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1648; VI-NEXT:    s_endpgm
1649;
1650; CI-LABEL: v_insertelement_v4i16_2:
1651; CI:       ; %bb.0:
1652; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1653; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1654; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1655; CI-NEXT:    s_waitcnt lgkmcnt(0)
1656; CI-NEXT:    v_mov_b32_e32 v1, s3
1657; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1658; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1659; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1660; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1661; CI-NEXT:    v_mov_b32_e32 v3, s1
1662; CI-NEXT:    s_and_b32 s0, s4, 0xffff
1663; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1664; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1665; CI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
1666; CI-NEXT:    v_or_b32_e32 v1, s0, v1
1667; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1668; CI-NEXT:    s_endpgm
  ; Address the <4 x i16> slot selected by this invocation's id in both buffers.
1669  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1670  %tid.ext = sext i32 %tid to i64
1671  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
1672  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
1673  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  ; Keep only the low 16 bits of %val.
1674  %val.trunc = trunc i32 %val to i16
  ; NOTE(review): this i16-to-i16 bitcast is a no-op; presumably kept so the
  ; body mirrors the half variants line for line -- confirm before removing.
1675  %val.cvt = bitcast i16 %val.trunc to i16
  ; Constant index 2 is the element under test.
1676  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
1677  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
1678  ret void
1679}
1680
1681; FIXME: Better code on CI?
; Dynamic insert into a <4 x i16>: the element index is not a constant but the
; result of a volatile i32 load, so it is only known at run time.
; Each invocation indexes %in and %out with the sign-extended result of
; @llvm.amdgcn.workitem.id.x(), loads one <4 x i16>, replaces the element chosen
; by the loaded index with the low 16 bits of %val, and stores the vector back.
; In the generated checks the index is held in v4: it is shifted left by 4
; (index * 16 bits), used to shift the 64-bit constant 0xffff into a mask in
; v[4:5], and two v_bfi_b32 instructions then merge a dword holding the 16-bit
; value in both halves into v[0:1] under that mask.
1682define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
1683; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1684; GFX9:       ; %bb.0:
1685; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1686; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
1687; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1688; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1689; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1690; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
1691; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1692; GFX9-NEXT:    global_load_dword v4, v[0:1], off
1693; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1694; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1695; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
1696; GFX9-NEXT:    s_mov_b32 s1, 0
1697; GFX9-NEXT:    s_mov_b32 s0, 0xffff
1698; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1699; GFX9-NEXT:    s_waitcnt vmcnt(1)
1700; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
1701; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v4, s[0:1]
1702; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s4
1703; GFX9-NEXT:    s_waitcnt vmcnt(0)
1704; GFX9-NEXT:    v_bfi_b32 v1, v5, s0, v1
1705; GFX9-NEXT:    v_bfi_b32 v0, v4, s0, v0
1706; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1707; GFX9-NEXT:    s_endpgm
1708;
1709; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1710; VI:       ; %bb.0:
1711; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1712; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1713; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1714; VI-NEXT:    s_waitcnt lgkmcnt(0)
1715; VI-NEXT:    v_mov_b32_e32 v1, s3
1716; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1717; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1718; VI-NEXT:    flat_load_dword v4, v[0:1]
1719; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1720; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1721; VI-NEXT:    s_mov_b32 s0, 0xffff
1722; VI-NEXT:    v_mov_b32_e32 v3, s1
1723; VI-NEXT:    s_and_b32 s2, s4, s0
1724; VI-NEXT:    s_mov_b32 s1, 0
1725; VI-NEXT:    s_lshl_b32 s3, s2, 16
1726; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1727; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
1728; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
1729; VI-NEXT:    v_lshlrev_b64 v[4:5], v4, s[0:1]
1730; VI-NEXT:    s_or_b32 s0, s2, s3
1731; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1732; VI-NEXT:    v_bfi_b32 v1, v5, s0, v1
1733; VI-NEXT:    v_bfi_b32 v0, v4, s0, v0
1734; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1735; VI-NEXT:    s_endpgm
1736;
1737; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1738; CI:       ; %bb.0:
1739; CI-NEXT:    flat_load_dword v4, v[0:1]
1740; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1741; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1742; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1743; CI-NEXT:    s_mov_b32 s6, 0xffff
1744; CI-NEXT:    s_mov_b32 s7, 0
1745; CI-NEXT:    s_waitcnt lgkmcnt(0)
1746; CI-NEXT:    v_mov_b32_e32 v1, s3
1747; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1748; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1749; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1750; CI-NEXT:    v_mov_b32_e32 v3, s1
1751; CI-NEXT:    s_lshl_b32 s1, s4, 16
1752; CI-NEXT:    s_and_b32 s3, s4, s6
1753; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1754; CI-NEXT:    s_or_b32 s0, s3, s1
1755; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1756; CI-NEXT:    s_waitcnt vmcnt(1)
1757; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
1758; CI-NEXT:    v_lshl_b64 v[4:5], s[6:7], v4
1759; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1760; CI-NEXT:    v_bfi_b32 v1, v5, s0, v1
1761; CI-NEXT:    v_bfi_b32 v0, v4, s0, v0
1762; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1763; CI-NEXT:    s_endpgm
  ; Address the <4 x i16> slot selected by this invocation's id in both buffers.
1764  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1765  %tid.ext = sext i32 %tid to i64
1766  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
1767  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  ; Source of the run-time index. The pointer operand is undef, so the address
  ; is arbitrary; the load is volatile, so it is still emitted (see the checks).
1768  %idx.val = load volatile i32, i32 addrspace(1)* undef
1769  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  ; Keep only the low 16 bits of %val.
1770  %val.trunc = trunc i32 %val to i16
  ; NOTE(review): this i16-to-i16 bitcast is a no-op; presumably kept so the
  ; body mirrors the half variants line for line -- confirm before removing.
1771  %val.cvt = bitcast i16 %val.trunc to i16
  ; Variable index: this is the operation under test.
1772  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
1773  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
1774  ret void
1775}
1776
; Dynamic insert into a <4 x half> where the element index is the kernel
; argument %idxval rather than a constant.
; Each invocation indexes %in and %out with the sign-extended result of
; @llvm.amdgcn.workitem.id.x(), loads one <4 x half>, replaces the element
; chosen by %idxval with the low 16 bits of %val reinterpreted as half, and
; stores the vector back.
; In the generated checks %val and %idxval arrive together through one
; s_load_dwordx2 into s[4:5]. The index (s5) is shifted left by 4 (index * 16
; bits) and used by s_lshl_b64 to shift the 64-bit constant 0xffff into a mask
; in s[0:1]; two v_bfi_b32 instructions then merge a dword holding the 16-bit
; value in both halves into v[0:1] under that mask.
1777define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {
1778; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1779; GFX9:       ; %bb.0:
1780; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1781; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
1782; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1783; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1784; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1785; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
1786; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1787; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1788; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s4, s4
1789; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1790; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
1791; GFX9-NEXT:    s_mov_b32 s1, 0
1792; GFX9-NEXT:    s_mov_b32 s0, 0xffff
1793; GFX9-NEXT:    s_lshl_b32 s3, s5, 4
1794; GFX9-NEXT:    v_mov_b32_e32 v4, s2
1795; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s3
1796; GFX9-NEXT:    v_mov_b32_e32 v5, s2
1797; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1798; GFX9-NEXT:    s_waitcnt vmcnt(0)
1799; GFX9-NEXT:    v_bfi_b32 v1, s1, v5, v1
1800; GFX9-NEXT:    v_bfi_b32 v0, s0, v4, v0
1801; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1802; GFX9-NEXT:    s_endpgm
1803;
1804; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1805; VI:       ; %bb.0:
1806; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1807; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
1808; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1809; VI-NEXT:    s_waitcnt lgkmcnt(0)
1810; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1811; VI-NEXT:    v_mov_b32_e32 v1, s3
1812; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1813; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1814; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1815; VI-NEXT:    s_mov_b32 s0, 0xffff
1816; VI-NEXT:    v_mov_b32_e32 v3, s1
1817; VI-NEXT:    s_mov_b32 s1, 0
1818; VI-NEXT:    s_lshl_b32 s2, s5, 4
1819; VI-NEXT:    s_and_b32 s3, s4, s0
1820; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1821; VI-NEXT:    s_lshl_b32 s2, s3, 16
1822; VI-NEXT:    s_or_b32 s2, s3, s2
1823; VI-NEXT:    v_mov_b32_e32 v4, s2
1824; VI-NEXT:    v_mov_b32_e32 v5, s2
1825; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1826; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1827; VI-NEXT:    v_bfi_b32 v1, s1, v4, v1
1828; VI-NEXT:    v_bfi_b32 v0, s0, v5, v0
1829; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1830; VI-NEXT:    s_endpgm
1831;
1832; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1833; CI:       ; %bb.0:
1834; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1835; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
1836; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1837; CI-NEXT:    s_waitcnt lgkmcnt(0)
1838; CI-NEXT:    v_mov_b32_e32 v1, s3
1839; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1840; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1841; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1842; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1843; CI-NEXT:    s_mov_b32 s0, 0xffff
1844; CI-NEXT:    s_and_b32 s2, s4, s0
1845; CI-NEXT:    s_lshl_b32 s4, s4, 16
1846; CI-NEXT:    v_mov_b32_e32 v3, s1
1847; CI-NEXT:    s_or_b32 s2, s2, s4
1848; CI-NEXT:    s_mov_b32 s1, 0
1849; CI-NEXT:    s_lshl_b32 s3, s5, 4
1850; CI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s3
1851; CI-NEXT:    v_mov_b32_e32 v4, s2
1852; CI-NEXT:    v_mov_b32_e32 v5, s2
1853; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1854; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1855; CI-NEXT:    v_bfi_b32 v1, s1, v4, v1
1856; CI-NEXT:    v_bfi_b32 v0, s0, v5, v0
1857; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1858; CI-NEXT:    s_endpgm
  ; Address the <4 x half> slot selected by this invocation's id in both buffers.
1859  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1860  %tid.ext = sext i32 %tid to i64
1861  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1862  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1863  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Keep only the low 16 bits of %val and reinterpret (not convert) them as half.
1864  %val.trunc = trunc i32 %val to i16
1865  %val.cvt = bitcast i16 %val.trunc to half
  ; Variable index taken from the %idxval argument: this is the operation under test.
1866  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
1867  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1868  ret void
1869}
1870
; Intrinsic the kernels call to obtain %tid, the index used to pick each
; invocation's vector in %in and %out.
1871declare i32 @llvm.amdgcn.workitem.id.x() #1
1872
; Attribute group #0 is attached to the kernel definitions.
1873attributes #0 = { nounwind }
; Attribute group #1 is attached to the intrinsic declaration and its call sites.
1874attributes #1 = { nounwind readnone }
1875