1; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
2
; Insert 1.0 into a <4 x float> kernel argument at an index that is only
; known at run time, then store the whole vector.
; Expected code: one s_cmp_lg_u32 / s_cselect_b64 / v_cndmask_b32 triple per
; lane (indices 3, 2, 1, 0), with neither v_movrel nor buffer_ instructions.
; The store must use the register range that starts at the lane-0 select
; result and ends at the lane-3 select result.
; GCN-LABEL: {{^}}float4_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 3
; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 1
; GCN-DAG: s_cselect_b64 [[CC3:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 0
; GCN-DAG: s_cselect_b64 [[CC4:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
; GCN:     flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) {
bb:
  %updated = insertelement <4 x float> %vec, float 1.0, i32 %sel
  store <4 x float> %updated, <4 x float> addrspace(1)* %out
  ret void
}
25
; Insert 1.0 at a run-time index into an undef <4 x float> and store it.
; Because the source vector is undef, the checks require that no compare or
; select is emitted at all (no v_cmp_, v_cndmask_, v_movrel or buffer_):
; the output is expected to be one v_mov of 1.0 plus three register copies
; of that value.
; GCN-LABEL: {{^}}float4_inselt_undef:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-NOT: v_cmp_
; GCN-NOT: v_cndmask_
; GCN:     v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
define amdgpu_kernel void @float4_inselt_undef(<4 x float> addrspace(1)* %out, i32 %sel) {
bb:
  %updated = insertelement <4 x float> undef, float 1.0, i32 %sel
  store <4 x float> %updated, <4 x float> addrspace(1)* %out
  ret void
}
41
; Integer variant: insert the constant 1 into a <4 x i32> kernel argument at
; a run-time index and store the result.
; Expected code: per lane, an s_cmp_lg_u32 against the lane number followed
; by an s_cselect_b32 that picks either the old element or 1. The four
; scalar results are then copied into vector registers that form the store
; operand. v_movrel and buffer_ instructions must not appear.
; GCN-LABEL: {{^}}int4_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 3
; GCN-DAG: s_cselect_b32 s[[ELT_3:[0-9]+]], s{{[0-9]+}}, 1
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b32 s[[ELT_2:[0-9]+]], s{{[0-9]+}}, 1
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 1
; GCN-DAG: s_cselect_b32 s[[ELT_1:[0-9]+]], s{{[0-9]+}}, 1
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 0
; GCN-DAG: s_cselect_b32 s[[ELT_0:[0-9]+]], s{{[0-9]+}}, 1
; GCN-DAG: v_mov_b32_e32 v[[VELT_0:[0-9]+]], s[[ELT_0]]
; GCN-DAG: v_mov_b32_e32 v[[VELT_1:[0-9]+]], s[[ELT_1]]
; GCN-DAG: v_mov_b32_e32 v[[VELT_2:[0-9]+]], s[[ELT_2]]
; GCN-DAG: v_mov_b32_e32 v[[VELT_3:[0-9]+]], s[[ELT_3]]
; GCN:     flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[VELT_0]]:[[VELT_3]]]
define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) {
bb:
  %updated = insertelement <4 x i32> %vec, i32 1, i32 %sel
  store <4 x i32> %updated, <4 x i32> addrspace(1)* %out
  ret void
}
64
; Two-element case: insert 1.0 into a <2 x float> kernel argument at a
; run-time index and store the pair.
; Expected code: a compare/select/v_cndmask triple for index 1 and another
; for index 0, feeding a single flat_store_dwordx2. No v_movrel or buffer_
; instructions are allowed.
; GCN-LABEL: {{^}}float2_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 0
; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC2]]
; GCN:     flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) {
bb:
  %updated = insertelement <2 x float> %vec, float 1.0, i32 %sel
  store <2 x float> %updated, <2 x float> addrspace(1)* %out
  ret void
}
81
; Eight-element case: insert 1.0 into a <8 x float> kernel argument at a
; run-time index and store the vector.
; Expected code: eight compare/select/v_cndmask triples, one per lane
; (indices 0 through 7). The result is written with two flat_store_dwordx4
; instructions: one spanning the registers selected for lanes 0..3 and one
; spanning those for lanes 4..7. v_movrel and buffer_ must not be used.
; GCN-LABEL: {{^}}float8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 3
; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 1
; GCN-DAG: s_cselect_b64 [[CC3:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 0
; GCN-DAG: s_cselect_b64 [[CC4:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
; GCN-DAG: s_cselect_b64 [[CC5:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC5]]
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
; GCN-DAG: s_cselect_b64 [[CC6:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC6]]
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
; GCN-DAG: s_cselect_b64 [[CC7:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC7]]
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
; GCN-DAG: s_cselect_b64 [[CC8:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC8]]
; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST0]]:[[ELT_LAST0]]]
; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST1]]:[[ELT_LAST1]]]
define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x float> %vec, i32 %sel) {
bb:
  %updated = insertelement <8 x float> %vec, float 1.0, i32 %sel
  store <8 x float> %updated, <8 x float> addrspace(1)* %out
  ret void
}
117
; Sixteen-element case: insert 1.0 into a <16 x float> kernel argument at a
; run-time index. Unlike the smaller vectors in this file, the check here
; only requires that a v_movreld_b32 (indirect register write) is emitted.
; GCN-LABEL: {{^}}float16_inselt:
; GCN: v_movreld_b32
define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, <16 x float> %vec, i32 %sel) {
bb:
  %updated = insertelement <16 x float> %vec, float 1.0, i32 %sel
  store <16 x float> %updated, <16 x float> addrspace(1)* %out
  ret void
}
126
; Thirty-two-element case: insert 1.0 into a <32 x float> kernel argument at
; a run-time index. The only requirement is that the insert is done with a
; v_movreld_b32 indirect register write.
; GCN-LABEL: {{^}}float32_inselt:
; GCN: v_movreld_b32
define amdgpu_kernel void @float32_inselt(<32 x float> addrspace(1)* %out, <32 x float> %vec, i32 %sel) {
bb:
  %updated = insertelement <32 x float> %vec, float 1.0, i32 %sel
  store <32 x float> %updated, <32 x float> addrspace(1)* %out
  ret void
}
135
; Insert half 1.0 into a <4 x half> kernel argument at a run-time index.
; Expected code is pure scalar bit manipulation on a 64-bit value: the index
; is scaled by 16 (shift left by 4), a 64-bit value is shifted by that
; amount, the constant 0x3c003c00 is placed in both halves of a register
; pair, and the result is combined with s_andn2_b64 / s_and_b64 / s_or_b64.
; v_cndmask_b32, v_movrel and buffer_ instructions must not appear.
; GCN-LABEL: {{^}}half4_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN:     s_mov_b32 s[[KLO:[0-9]+]], 0x3c003c00
; GCN:     s_mov_b32 s[[KHI:[0-9]+]], s[[KLO]]
; GCN:     s_andn2_b64
; GCN:     s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{\[}}[[KLO]]:[[KHI]]]
; GCN:     s_or_b64
define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
bb:
  %updated = insertelement <4 x half> %vec, half 1.0, i32 %sel
  store <4 x half> %updated, <4 x half> addrspace(1)* %out
  ret void
}
153
; Insert half 1.0 into a <2 x half> kernel argument at a run-time index.
; Expected code is 32-bit scalar bit manipulation: the index is scaled by 16
; (shift left by 4), 0xffff is shifted by that amount, and the result is
; combined using s_andn2_b32, an s_and_b32 with 0x3c003c00, and s_or_b32.
; v_cndmask_b32, v_movrel and buffer_ instructions must not appear.
; GCN-LABEL: {{^}}half2_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN:     s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
; GCN:     s_andn2_b32
; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c003c00
; GCN:     s_or_b32
define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) {
bb:
  %updated = insertelement <2 x half> %vec, half 1.0, i32 %sel
  store <2 x half> %updated, <2 x half> addrspace(1)* %out
  ret void
}
169
; Insert half 1.0 into a <8 x half> kernel argument at a run-time index.
; Expected code: a scalar compare against each of the eight lane numbers
; (0..7), eight v_cndmask_b32 selects, and four v_or_b32_sdwa instructions.
; v_movrel and buffer_ instructions must not appear.
; GCN-LABEL: {{^}}half8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 0
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 1
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 2
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 3
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 4
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 5
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 6
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 7
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, <8 x half> %vec, i32 %sel) {
bb:
  %updated = insertelement <8 x half> %vec, half 1.0, i32 %sel
  store <8 x half> %updated, <8 x half> addrspace(1)* %out
  ret void
}
199
; Insert i16 1 into a <2 x i16> kernel argument at a run-time index.
; Expected code mirrors the <2 x half> test in this file but with the
; constant 0x10001: index scaled by 16, 0xffff shifted by that amount, then
; s_andn2_b32 / s_and_b32 / s_or_b32. v_cndmask_b32, v_movrel and buffer_
; instructions must not appear.
; GCN-LABEL: {{^}}short2_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN:     s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
; GCN:     s_andn2_b32
; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x10001
; GCN:     s_or_b32
define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
bb:
  %updated = insertelement <2 x i16> %vec, i16 1, i32 %sel
  store <2 x i16> %updated, <2 x i16> addrspace(1)* %out
  ret void
}
215
; Insert i16 1 into a <4 x i16> kernel argument at a run-time index.
; Expected code mirrors the <4 x half> test in this file but with the
; constant 0x10001 replicated into a register pair: index scaled by 16, a
; 64-bit shift by that amount, then s_andn2_b64 / s_and_b64 / s_or_b64.
; v_cndmask_b32, v_movrel and buffer_ instructions must not appear.
; GCN-LABEL: {{^}}short4_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN:     s_mov_b32 s[[KLO:[0-9]+]], 0x10001
; GCN:     s_mov_b32 s[[KHI:[0-9]+]], s[[KLO]]
; GCN:     s_andn2_b64
; GCN:     s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{\[}}[[KLO]]:[[KHI]]]
; GCN:     s_or_b64
define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
bb:
  %updated = insertelement <4 x i16> %vec, i16 1, i32 %sel
  store <4 x i16> %updated, <4 x i16> addrspace(1)* %out
  ret void
}
233
; Insert i8 1 into a <8 x i8> kernel argument at a run-time index.
; Expected code is scalar bit manipulation on a 64-bit value: the index is
; scaled by 8 (shift left by 3), a 64-bit value is shifted by that amount,
; the constant 0x1010101 is and-ed into two 32-bit registers, and the result
; is combined with s_andn2_b64 / s_or_b64. v_movrel and buffer_ instructions
; must not appear.
;
; Both s_and_b32 checks use register patterns rather than fixed register
; numbers, so the test does not depend on the register allocator's choices.
; GCN-LABEL: {{^}}byte8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN:     s_mov_b32 [[K:s[0-9]+]], 0x1010101
; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
; GCN:     s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
; GCN:     s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
  store <8 x i8> %v, <8 x i8> addrspace(1)* %out
  ret void
}
250
; Insert i8 1 into a <16 x i8> kernel argument at a run-time index.
; Expected code: scalar compares against lane numbers (only the first, 0,
; and the last, 15, are checked explicitly), sixteen v_cndmask_b32 selects,
; and eight v_or_b32_sdwa instructions. v_movrel and buffer_ instructions
; must not appear.
; GCN-LABEL: {{^}}byte16_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 0
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 15
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, <16 x i8> %vec, i32 %sel) {
bb:
  %updated = insertelement <16 x i8> %vec, i8 1, i32 %sel
  store <16 x i8> %updated, <16 x i8> addrspace(1)* %out
  ret void
}
286
; Insert double 1.0 into a <2 x double> kernel argument at a run-time index.
; Expected code: the vector is loaded with one s_load_dwordx4, then each
; 64-bit element is chosen with s_cmp_lg_u32 + s_cselect_b64 between the
; loaded register pair and 1.0 (index 1 uses the pair ending at the last
; loaded register, index 0 the pair starting at the first). The four result
; dwords are copied to vector registers in order (element 0 low/high, then
; element 1 low/high) and written with one flat_store_dwordx4.
; v_movrel and buffer_ instructions must not appear after the load.
; GCN-LABEL: {{^}}double2_inselt:
; GCN: s_load_dwordx4 s{{\[}}[[FIRST:[0-9]+]]:[[LAST:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}]
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_cmp_lg_u32 [[IDX:s[0-9]+]], 1
; GCN: s_cselect_b64 s{{\[}}[[P0_LO:[0-9]+]]:[[P0_HI:[0-9]+]]{{\]}}, s{{\[}}{{[0-9]+}}:[[LAST]]{{\]}}, 1.0
; GCN: s_cmp_lg_u32 [[IDX]], 0
; GCN: s_cselect_b64 s{{\[}}[[P1_LO:[0-9]+]]:[[P1_HI:[0-9]+]]{{\]}}, s{{\[}}[[FIRST]]:{{[0-9]+}}{{\]}}, 1.0
; GCN: v_mov_b32_e32 v[[V_FIRST:[0-9]+]], s[[P1_LO]]
; GCN: v_mov_b32_e32 v[[V_SECOND:[0-9]+]], s[[P1_HI]]
; GCN: v_mov_b32_e32 v[[V_THIRD:[0-9]+]], s[[P0_LO]]
; GCN: v_mov_b32_e32 v[[V_LAST:[0-9]+]], s[[P0_HI]]
; GCN: flat_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_FIRST]]:[[V_LAST]]{{\]}}
define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
bb:
  %updated = insertelement <2 x double> %vec, double 1.0, i32 %sel
  store <2 x double> %updated, <2 x double> addrspace(1)* %out
  ret void
}
306
; Insert double 1.0 into a <5 x double> kernel argument at a run-time index.
; Expected code: five s_cselect_b64 instructions (one per element), and no
; v_movrel or buffer_ instructions.
; GCN-LABEL: {{^}}double5_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-COUNT-5: s_cselect_b64
define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
bb:
  %updated = insertelement <5 x double> %vec, double 1.0, i32 %sel
  store <5 x double> %updated, <5 x double> addrspace(1)* %out
  ret void
}
317
; Insert double 1.0 into a <8 x double> kernel argument at a run-time index.
; Expected code: m0 is loaded once from a scalar register, then two
; v_movreld_b32 writes target consecutive vector registers (BASE and
; BASE+1). The first write stores 0, which is the low dword of the IEEE
; double 1.0. m0 must not be rewritten between the two writes, and no
; v_cndmask, buffer_ or s_or_b32 instructions may appear.
; GCN-LABEL: {{^}}double8_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
; GCN-NOT: s_mov_b32 m0
; GCN:     v_movreld_b32_e32 v[[#BASE+1]],
define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
bb:
  %updated = insertelement <8 x double> %vec, double 1.0, i32 %sel
  store <8 x double> %updated, <8 x double> addrspace(1)* %out
  ret void
}
332
; Insert double 1.0 into a <7 x double> kernel argument at a run-time index.
; Expected code: m0 is loaded once from a scalar register, then two
; v_movreld_b32 writes target consecutive vector registers (BASE and
; BASE+1). m0 must not be rewritten between the two writes, and no
; v_cndmask, buffer_ or s_or_b32 instructions may appear.
;
; BASE has to be captured inside this function's own check block: the test
; is run with -enable-var-scope, which discards variables from earlier
; functions at every label, so a bare use of BASE would be undefined here.
; GCN-LABEL: {{^}}double7_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
; GCN-NOT: s_mov_b32 m0
; GCN:     v_movreld_b32_e32 v[[#BASE+1]],
define amdgpu_kernel void @double7_inselt(<7 x double> addrspace(1)* %out, <7 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel
  store <7 x double> %v, <7 x double> addrspace(1)* %out
  ret void
}
347
; Insert double 1.0 into a <16 x double> kernel argument at a run-time index.
; Expected code: m0 is loaded once from a scalar register, then two
; v_movreld_b32 writes target consecutive vector registers (BASE and
; BASE+1). m0 must not be rewritten between the two writes, and no
; v_cndmask, buffer_ or s_or_b32 instructions may appear.
; GCN-LABEL: {{^}}double16_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
; GCN-NOT: s_mov_b32 m0
; GCN:     v_movreld_b32_e32 v[[#BASE+1]],
define amdgpu_kernel void @double16_inselt(<16 x double> addrspace(1)* %out, <16 x double> %vec, i32 %sel) {
bb:
  %updated = insertelement <16 x double> %vec, double 1.0, i32 %sel
  store <16 x double> %updated, <16 x double> addrspace(1)* %out
  ret void
}
362
; Insert double 1.0 into a <15 x double> kernel argument at a run-time index.
; Expected code: m0 is loaded once from a scalar register, then two
; v_movreld_b32 writes target consecutive vector registers (BASE and
; BASE+1). m0 must not be rewritten between the two writes, and no
; v_cndmask, buffer_ or s_or_b32 instructions may appear.
; GCN-LABEL: {{^}}double15_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
; GCN-NOT: s_mov_b32 m0
; GCN:     v_movreld_b32_e32 v[[#BASE+1]],
define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15 x double> %vec, i32 %sel) {
bb:
  %updated = insertelement <15 x double> %vec, double 1.0, i32 %sel
  store <15 x double> %updated, <15 x double> addrspace(1)* %out
  ret void
}
377
; Insert i1 true into a <4 x i1> kernel argument at a run-time index.
; This is the one test in the file that expects buffer_ instructions: a
; buffer_store_byte followed by four buffer_load_ubyte.
; NOTE(review): presumably the vector is spilled to scratch and indexed in
; memory - confirm against the backend lowering.
; GCN-LABEL: {{^}}bit4_inselt:
; GCN: buffer_store_byte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, <4 x i1> %vec, i32 %sel) {
bb:
  %updated = insertelement <4 x i1> %vec, i1 true, i32 %sel
  store <4 x i1> %updated, <4 x i1> addrspace(1)* %out
  ret void
}
390
; Insert i1 true into a <128 x i1> kernel argument at a run-time index.
; Expected code: per-element compare/select/v_cndmask sequences with no
; buffer_ instructions. Only the first element (compare against 0) and the
; last element (compare against 0x7f, emitted as s_cmpk_lg_i32) are checked
; explicitly.
; GCN-LABEL: {{^}}bit128_inselt:
; GCN-NOT: buffer_
; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC1]]

; GCN-DAG: s_cmpk_lg_i32 {{s[0-9]+}}, 0x7f
; GCN-DAG: s_cselect_b64 [[CCL:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CCL]]
define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) {
bb:
  %updated = insertelement <128 x i1> %vec, i1 true, i32 %sel
  store <128 x i1> %updated, <128 x i1> addrspace(1)* %out
  ret void
}
406
; Pixel-shader (amdgpu_ps) variant: the <32 x float> vector and the index
; are plain function arguments and the updated vector is returned instead
; of being stored.
; Expected code: thirty-two v_cmp_ne_u32 compares and thirty-two
; v_cndmask_b32 selects (either encoding) that choose 1.0, with no buffer_
; instructions.
; GCN-LABEL: {{^}}float32_inselt_vec:
; GCN-NOT: buffer_
; GCN-COUNT-32: v_cmp_ne_u32
; GCN-COUNT-32: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0,
define amdgpu_ps <32 x float> @float32_inselt_vec(<32 x float> %vec, i32 %sel) {
bb:
  %updated = insertelement <32 x float> %vec, float 1.0, i32 %sel
  ret <32 x float> %updated
}
416
; Non-kernel function variant: the <8 x double> vector and the index are
; plain function arguments and the updated vector is returned.
; Expected code: for each of the eight elements, one v_cmp_eq_u32 followed
; by two v_cndmask_b32 selects (one per 32-bit half of the double), with no
; buffer_ instructions.
; GCN-LABEL: {{^}}double8_inselt_vec:
; GCN-NOT: buffer_
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) {
bb:
  %updated = insertelement <8 x double> %vec, double 1.0, i32 %sel
  ret <8 x double> %updated
}
440