1; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
2
; Dynamic insert of the constant 1.0 into a <4 x float> kernel argument.
; The lane is chosen at run time by %sel and the result is stored to %out.
; Expected lowering: one v_cmp_ne_u32 / v_cndmask_b32 pair per lane
; (indices 3, 2, 1, 0, all comparing the same scalar index register), then a
; single flat_store_dwordx4 whose data range runs from the register written
; for lane 0 to the register written for lane 3. v_movrel and buffer_
; instructions must not show up ahead of that sequence.
3; GCN-LABEL: {{^}}float4_inselt:
4; GCN-NOT: v_movrel
5; GCN-NOT: buffer_
6; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
7; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
8; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
9; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
10; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
11; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
12; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
13; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
14; GCN:     flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
15define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) {
16entry:
17  %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel
18  store <4 x float> %v, <4 x float> addrspace(1)* %out
19  ret void
20}
21
; Dynamic insert of 1.0 into an undef <4 x float>; only %out and %sel are
; kernel arguments. Because the source vector is undef, no per-lane select is
; expected: the checks look for 1.0 being materialized once in a VGPR and then
; copied with three further v_mov_b32 instructions. v_movrel, buffer_, v_cmp_
; and v_cndmask_ must not appear before the first of those moves.
22; GCN-LABEL: {{^}}float4_inselt_undef:
23; GCN-NOT: v_movrel
24; GCN-NOT: buffer_
25; GCN-NOT: v_cmp_
26; GCN-NOT: v_cndmask_
27; GCN:     v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
28; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
29; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
30; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
31define amdgpu_kernel void @float4_inselt_undef(<4 x float> addrspace(1)* %out, i32 %sel) {
32entry:
33  %v = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel
34  store <4 x float> %v, <4 x float> addrspace(1)* %out
35  ret void
36}
37
; Integer counterpart of the <4 x float> case: inserts the constant i32 1 into
; a <4 x i32> kernel argument at the run-time index %sel and stores the result.
; Expected lowering: four v_cmp_ne_u32 / v_cndmask_b32 pairs (indices 3..0,
; same scalar index register, inline constant 1) and one flat_store_dwordx4
; spanning the registers written for lane 0 through lane 3. v_movrel and
; buffer_ instructions must not precede that sequence.
38; GCN-LABEL: {{^}}int4_inselt:
39; GCN-NOT: v_movrel
40; GCN-NOT: buffer_
41; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
42; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1, v{{[0-9]+}}, [[CC1]]
43; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
44; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC2]]
45; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
46; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC3]]
47; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
48; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1, v{{[0-9]+}}, [[CC4]]
49; GCN:     flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
50define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) {
51entry:
52  %v = insertelement <4 x i32> %vec, i32 1, i32 %sel
53  store <4 x i32> %v, <4 x i32> addrspace(1)* %out
54  ret void
55}
56
; Two-lane variant: inserts 1.0 into a <2 x float> kernel argument at the
; run-time index %sel and stores the result to %out.
; Expected lowering: a v_cmp_ne_u32 / v_cndmask_b32 pair for index 1 and for
; index 0 (same scalar index register), followed by one flat_store_dwordx2
; whose data range is exactly the two selected registers. v_movrel and
; buffer_ instructions must not precede that sequence.
57; GCN-LABEL: {{^}}float2_inselt:
58; GCN-NOT: v_movrel
59; GCN-NOT: buffer_
60; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
61; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
62; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
63; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC2]]
64; GCN:     flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
65define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) {
66entry:
67  %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel
68  store <2 x float> %v, <2 x float> addrspace(1)* %out
69  ret void
70}
71
; Eight-lane variant: inserts 1.0 into an <8 x float> kernel argument at the
; run-time index %sel and stores the result to %out.
; Expected lowering: eight v_cmp_ne_u32 / v_cndmask_b32 pairs, one per lane.
; The result is written by two flat_store_dwordx4 instructions, one covering
; lanes 0..3 and one covering lanes 4..7 (matched in either order).
; The scalar index register is captured once, by the compare against 3, and
; reused by every other compare, so both halves of the vector are required to
; test the same index. v_movrel and buffer_ instructions must not precede the
; matched sequence.
72; GCN-LABEL: {{^}}float8_inselt:
73; GCN-NOT: v_movrel
74; GCN-NOT: buffer_
75; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
76; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
77; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
78; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
79; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
80; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
81; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
82; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
83; GCN-DAG: v_cmp_ne_u32_e64 [[CC5:[^,]+]], [[IDX]], 7
84; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC5]]
85; GCN-DAG: v_cmp_ne_u32_e64 [[CC6:[^,]+]], [[IDX]], 6
86; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC6]]
87; GCN-DAG: v_cmp_ne_u32_e64 [[CC7:[^,]+]], [[IDX]], 5
88; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC7]]
89; GCN-DAG: v_cmp_ne_u32_e64 [[CC8:[^,]+]], [[IDX]], 4
90; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC8]]
91; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST0]]:[[ELT_LAST0]]]
92; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST1]]:[[ELT_LAST1]]]
93define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x float> %vec, i32 %sel) {
94entry:
95  %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel
96  store <8 x float> %v, <8 x float> addrspace(1)* %out
97  ret void
98}
99
; Sixteen-lane variant: inserts 1.0 into a <16 x float> kernel argument at the
; run-time index %sel and stores the result to %out.
; Unlike the smaller float vectors in this file, the only expectation here is
; that a v_movreld_b32 instruction is emitted.
100; GCN-LABEL: {{^}}float16_inselt:
101; GCN: v_movreld_b32
102define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, <16 x float> %vec, i32 %sel) {
103entry:
104  %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel
105  store <16 x float> %v, <16 x float> addrspace(1)* %out
106  ret void
107}
108
; Inserts half 1.0 into a <4 x half> kernel argument at the run-time index
; %sel and stores the result to %out.
; Expected lowering is a bit-field insert rather than per-lane selects:
;   - the index is shifted left by 4 (s_lshl_b32 ..., 4),
;   - a 64-bit scalar shift uses that shifted index as its shift amount,
;   - the constant 0x3c003c00 (0x3c00, the half encoding of 1.0, in both
;     16-bit halves) is materialized in an SGPR and copied to a VGPR,
;   - two v_bfi_b32 instructions take that VGPR as their second operand.
; v_cndmask_b32, v_movrel and buffer_ must not appear before this sequence.
109; GCN-LABEL: {{^}}half4_inselt:
110; GCN-NOT: v_cndmask_b32
111; GCN-NOT: v_movrel
112; GCN-NOT: buffer_
113; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
114; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
115; GCN:     s_mov_b32 [[K:s[0-9]+]], 0x3c003c00
116; GCN:     v_mov_b32_e32 [[V:v[0-9]+]], [[K]]
117; GCN:     v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
118; GCN:     v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
119define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
120entry:
121  %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
122  store <4 x half> %v, <4 x half> addrspace(1)* %out
123  ret void
124}
125
; Inserts half 1.0 into a <2 x half> kernel argument at the run-time index
; %sel and stores the result to %out.
; Expected lowering: the index is shifted left by 4, 0xffff is shifted left by
; that amount, and the resulting SGPR is the first operand of a single
; v_bfi_b32. v_cndmask_b32, v_movrel and buffer_ must not appear before this
; sequence.
126; GCN-LABEL: {{^}}half2_inselt:
127; GCN-NOT: v_cndmask_b32
128; GCN-NOT: v_movrel
129; GCN-NOT: buffer_
130; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
131; GCN:     s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
132; GCN:     v_bfi_b32 v{{[0-9]+}}, [[V]], v{{[0-9]+}}, v{{[0-9]+}}
133define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) {
134entry:
135  %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
136  store <2 x half> %v, <2 x half> addrspace(1)* %out
137  ret void
138}
139
; Inserts half 1.0 into an <8 x half> kernel argument at the run-time index
; %sel and stores the result to %out.
; Expected lowering: one v_cmp_ne_u32 against each index 0..7, eight
; v_cndmask_b32 selects, and four v_or_b32_sdwa instructions. All of these
; are order-independent matches; operands of the selects and ORs are not
; constrained. v_movrel and buffer_ must not appear before the matched
; instructions.
140; GCN-LABEL: {{^}}half8_inselt:
141; GCN-NOT: v_movrel
142; GCN-NOT: buffer_
143; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
144; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 1
145; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 2
146; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 3
147; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 4
148; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 5
149; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 6
150; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 7
151; GCN-DAG: v_cndmask_b32_e32
152; GCN-DAG: v_cndmask_b32_e32
153; GCN-DAG: v_cndmask_b32_e32
154; GCN-DAG: v_cndmask_b32_e32
155; GCN-DAG: v_cndmask_b32_e32
156; GCN-DAG: v_cndmask_b32_e32
157; GCN-DAG: v_cndmask_b32_e32
158; GCN-DAG: v_cndmask_b32_e32
159; GCN-DAG: v_or_b32_sdwa
160; GCN-DAG: v_or_b32_sdwa
161; GCN-DAG: v_or_b32_sdwa
162; GCN-DAG: v_or_b32_sdwa
163define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, <8 x half> %vec, i32 %sel) {
164entry:
165  %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel
166  store <8 x half> %v, <8 x half> addrspace(1)* %out
167  ret void
168}
169
; Inserts i16 1 into a <2 x i16> kernel argument at the run-time index %sel
; and stores the result to %out.
; Expected lowering: 0x10001 (the value 1 in both 16-bit halves) is placed in
; a VGPR, the index is shifted left by 4, 0xffff is shifted left by that
; amount, and one v_bfi_b32 uses the shifted mask as first operand and the
; 0x10001 register as second operand. v_cndmask_b32, v_movrel and buffer_
; must not appear before this sequence.
170; GCN-LABEL: {{^}}short2_inselt:
171; GCN-NOT: v_cndmask_b32
172; GCN-NOT: v_movrel
173; GCN-NOT: buffer_
174; GCN:     v_mov_b32_e32 [[K:v[0-9]+]], 0x10001
175; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
176; GCN:     s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
177; GCN:     v_bfi_b32 v{{[0-9]+}}, [[V]], [[K]], v{{[0-9]+}}
178define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
179entry:
180  %v = insertelement <2 x i16> %vec, i16 1, i32 %sel
181  store <2 x i16> %v, <2 x i16> addrspace(1)* %out
182  ret void
183}
184
; Inserts i16 1 into a <4 x i16> kernel argument at the run-time index %sel
; and stores the result to %out.
; Expected lowering mirrors the <4 x half> case with an integer constant:
; the index is shifted left by 4 and used as the amount of a 64-bit scalar
; shift, 0x10001 is materialized in an SGPR and copied to a VGPR, and two
; v_bfi_b32 instructions take that VGPR as their second operand.
; v_cndmask_b32, v_movrel and buffer_ must not appear before this sequence.
185; GCN-LABEL: {{^}}short4_inselt:
186; GCN-NOT: v_cndmask_b32
187; GCN-NOT: v_movrel
188; GCN-NOT: buffer_
189; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
190; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
191; GCN:     s_mov_b32 [[K:s[0-9]+]], 0x10001
192; GCN:     v_mov_b32_e32 [[V:v[0-9]+]], [[K]]
193; GCN:     v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
194; GCN:     v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
195define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
196entry:
197  %v = insertelement <4 x i16> %vec, i16 1, i32 %sel
198  store <4 x i16> %v, <4 x i16> addrspace(1)* %out
199  ret void
200}
201
; Inserts i8 1 into an <8 x i8> kernel argument at the run-time index %sel
; and stores the result to %out.
; Expected lowering is done with scalar bit operations:
;   - the index is shifted left by 3 (s_lshl_b32 ..., 3) and used as the
;     amount of a 64-bit scalar shift,
;   - 0x1010101 (the value 1 in every byte) is materialized in an SGPR,
;   - two s_and_b32 instructions use that constant as their last operand,
;   - an s_andn2_b64 and an s_or_b64 follow.
; Register numbers are matched with patterns throughout so the test does not
; depend on a particular register assignment. v_movrel and buffer_ must not
; appear before this sequence.
202; GCN-LABEL: {{^}}byte8_inselt:
203; GCN-NOT: v_movrel
204; GCN-NOT: buffer_
205; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
206; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
207; GCN:     s_mov_b32 [[K:s[0-9]+]], 0x1010101
208; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
209; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
210; GCN:     s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
211; GCN:     s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
212define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
213entry:
214  %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
215  store <8 x i8> %v, <8 x i8> addrspace(1)* %out
216  ret void
217}
218
; Inserts i8 1 into a <16 x i8> kernel argument at the run-time index %sel
; and stores the result to %out.
; Expected lowering: per-lane selects. Only the compares against the first
; (0) and last (15) index are spelled out; in addition sixteen v_cndmask_b32
; and eight v_or_b32_sdwa instructions must be present. All matches are
; order-independent and leave operands unconstrained. v_movrel and buffer_
; must not appear before the matched instructions.
219; GCN-LABEL: {{^}}byte16_inselt:
220; GCN-NOT: v_movrel
221; GCN-NOT: buffer_
222; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
223; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 15
224; GCN-DAG: v_cndmask_b32_e32
225; GCN-DAG: v_cndmask_b32_e32
226; GCN-DAG: v_cndmask_b32_e32
227; GCN-DAG: v_cndmask_b32_e32
228; GCN-DAG: v_cndmask_b32_e32
229; GCN-DAG: v_cndmask_b32_e32
230; GCN-DAG: v_cndmask_b32_e32
231; GCN-DAG: v_cndmask_b32_e32
232; GCN-DAG: v_cndmask_b32_e32
233; GCN-DAG: v_cndmask_b32_e32
234; GCN-DAG: v_cndmask_b32_e32
235; GCN-DAG: v_cndmask_b32_e32
236; GCN-DAG: v_cndmask_b32_e32
237; GCN-DAG: v_cndmask_b32_e32
238; GCN-DAG: v_cndmask_b32_e32
239; GCN-DAG: v_cndmask_b32_e32
240; GCN-DAG: v_or_b32_sdwa
241; GCN-DAG: v_or_b32_sdwa
242; GCN-DAG: v_or_b32_sdwa
243; GCN-DAG: v_or_b32_sdwa
244; GCN-DAG: v_or_b32_sdwa
245; GCN-DAG: v_or_b32_sdwa
246; GCN-DAG: v_or_b32_sdwa
247; GCN-DAG: v_or_b32_sdwa
248define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, <16 x i8> %vec, i32 %sel) {
249entry:
250  %v = insertelement <16 x i8> %vec, i8 1, i32 %sel
251  store <16 x i8> %v, <16 x i8> addrspace(1)* %out
252  ret void
253}
254
; Inserts double 1.0 into a <2 x double> kernel argument at the run-time
; index %sel and stores the result to %out.
; Expected lowering: for each index (1 and 0) a v_cmp_eq_u32 on the same
; scalar index register, whose result drives two selects: a
; v_cndmask_b32_e32 between two VGPRs and a v_cndmask_b32_e64 with an inline
; 0 operand. Note the compare here is "eq", not "ne" as in the 32-bit cases.
; v_movrel and buffer_ must not appear before the matched instructions.
255; GCN-LABEL: {{^}}double2_inselt:
256; GCN-NOT: v_movrel
257; GCN-NOT: buffer_
258; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
259; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
260; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
261; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
262; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC2]]
263; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
264define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
265entry:
266  %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
267  store <2 x double> %v, <2 x double> addrspace(1)* %out
268  ret void
269}
270
; Inserts double 1.0 into an <8 x double> kernel argument at the run-time
; index %sel and stores the result to %out.
; For this size the expectation is buffer memory traffic instead of per-lane
; selects: two buffer_store_dword instructions followed, in order, by sixteen
; buffer_load_dword instructions. v_cndmask must not appear before the first
; of those stores.
271; GCN-LABEL: {{^}}double8_inselt:
272; GCN-NOT: v_cndmask
273; GCN: buffer_store_dword
274; GCN: buffer_store_dword
275; GCN: buffer_load_dword
276; GCN: buffer_load_dword
277; GCN: buffer_load_dword
278; GCN: buffer_load_dword
279; GCN: buffer_load_dword
280; GCN: buffer_load_dword
281; GCN: buffer_load_dword
282; GCN: buffer_load_dword
283; GCN: buffer_load_dword
284; GCN: buffer_load_dword
285; GCN: buffer_load_dword
286; GCN: buffer_load_dword
287; GCN: buffer_load_dword
288; GCN: buffer_load_dword
289; GCN: buffer_load_dword
290; GCN: buffer_load_dword
291define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
292entry:
293  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
294  store <8 x double> %v, <8 x double> addrspace(1)* %out
295  ret void
296}
297
; Inserts i1 1 (true) into a <4 x i1> kernel argument at the run-time index
; %sel and stores the result to %out.
; The checks expect buffer memory traffic: one buffer_store_byte followed, in
; order, by four buffer_load_ubyte instructions. No instructions are
; forbidden in this test.
298; GCN-LABEL: {{^}}bit4_inselt:
299; GCN: buffer_store_byte
300; GCN: buffer_load_ubyte
301; GCN: buffer_load_ubyte
302; GCN: buffer_load_ubyte
303; GCN: buffer_load_ubyte
304define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, <4 x i1> %vec, i32 %sel) {
305entry:
306  %v = insertelement <4 x i1> %vec, i1 1, i32 %sel
307  store <4 x i1> %v, <4 x i1> addrspace(1)* %out
308  ret void
309}
310
; Inserts i1 1 (true) into a <128 x i1> kernel argument at the run-time index
; %sel and stores the result to %out.
; Only the first and last lanes are spot-checked:
;   - index 0: v_cmp_ne_u32_e64 against the inline constant 0, feeding a
;     v_cndmask_b32 that selects the constant 1;
;   - index 127: 0x7f is first moved into a VGPR, then compared with the e32
;     form of v_cmp_ne_u32, again feeding a v_cndmask_b32 selecting 1.
; buffer_ instructions must not appear before the matched instructions.
311; GCN-LABEL: {{^}}bit128_inselt:
312; GCN-NOT: buffer_
313; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 0
314; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC1]]
315; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f
316; GCN-DAG: v_cmp_ne_u32_e32 [[CCL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]]
317; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CCL]]
318define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) {
319entry:
320  %v = insertelement <128 x i1> %vec, i1 1, i32 %sel
321  store <128 x i1> %v, <128 x i1> addrspace(1)* %out
322  ret void
323}
324