1; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
2; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
3; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
4
5; Test expansion of selects that have a scalar (i1) condition and vector operands.
6; Evergreen is not enabled since it seems to have problems with doubles.
7
8; GCN-LABEL: {{^}}v_select_v2i8:
9; SI: v_cndmask_b32
10; SI-NOT: cndmask
11
12; GFX9: v_cndmask_b32
13; GFX9-NOT: cndmask
14
15; This is worse when i16 is legal and packed is not because
16; SelectionDAGBuilder for some reason changes the select type.
17; VI: v_cndmask_b32
18; VI: v_cndmask_b32
; Kernel under test for <2 x i8>. Both operands are loaded through %a.ptr and
; %b.ptr, one of them is chosen as a whole by a single i1 condition, and the
; chosen vector is written to %out. The checks above expect one v_cndmask_b32
; on the default and gfx900 runs and two on the tonga run.
19define amdgpu_kernel void @v_select_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %a.ptr, <2 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
20  %a = load <2 x i8>, <2 x i8> addrspace(1)* %a.ptr, align 2
21  %b = load <2 x i8>, <2 x i8> addrspace(1)* %b.ptr, align 2
  ; Scalar condition (%c == 0) shared by every element of the select.
22  %cmp = icmp eq i32 %c, 0
  ; %a is taken when the condition is true, %b otherwise.
23  %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b
24  store <2 x i8> %select, <2 x i8> addrspace(1)* %out, align 2
25  ret void
26}
27
28; GCN-LABEL: {{^}}v_select_v4i8:
29; GCN: v_cndmask_b32_e32
30; GCN-NOT: cndmask
; Kernel under test for <4 x i8>. Loads both operands from addrspace(1)
; pointers, selects between them on (%c == 0), and stores the result to %out.
; The checks above expect exactly one v_cndmask_b32_e32 for the whole vector.
31define amdgpu_kernel void @v_select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %a.ptr, <4 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
32  %a = load <4 x i8>, <4 x i8> addrspace(1)* %a.ptr
33  %b = load <4 x i8>, <4 x i8> addrspace(1)* %b.ptr
  ; One i1 condition controls the entire vector select.
34  %cmp = icmp eq i32 %c, 0
35  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
36  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
37  ret void
38}
39
40; GCN-LABEL: {{^}}v_select_v8i8:
41; GCN: v_cndmask_b32_e32
42; GCN: v_cndmask_b32_e32
43; GCN-NOT: cndmask
; Kernel under test for <8 x i8>. Loads both operands from addrspace(1)
; pointers, selects between them on (%c == 0), and stores the result to %out
; with align 4. The checks above expect exactly two v_cndmask_b32_e32.
44define amdgpu_kernel void @v_select_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %a.ptr, <8 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
45  %a = load <8 x i8>, <8 x i8> addrspace(1)* %a.ptr
46  %b = load <8 x i8>, <8 x i8> addrspace(1)* %b.ptr
  ; One i1 condition controls the entire vector select.
47  %cmp = icmp eq i32 %c, 0
48  %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b
49  store <8 x i8> %select, <8 x i8> addrspace(1)* %out, align 4
50  ret void
51}
52
53; GCN-LABEL: {{^}}v_select_v16i8:
54; GCN: v_cndmask_b32_e32
55; GCN: v_cndmask_b32_e32
56; GCN: v_cndmask_b32_e32
57; GCN: v_cndmask_b32_e32
58; GCN-NOT: cndmask
; Kernel under test for <16 x i8>. Loads both operands from addrspace(1)
; pointers, selects between them on (%c == 0), and stores the result to %out
; with align 4. The checks above expect exactly four v_cndmask_b32_e32.
59define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %a.ptr, <16 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
60  %a = load <16 x i8>, <16 x i8> addrspace(1)* %a.ptr
61  %b = load <16 x i8>, <16 x i8> addrspace(1)* %b.ptr
  ; One i1 condition controls the entire vector select.
62  %cmp = icmp eq i32 %c, 0
63  %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b
64  store <16 x i8> %select, <16 x i8> addrspace(1)* %out, align 4
65  ret void
66}
67
68; GCN-LABEL: {{^}}select_v4i8:
69; GCN: v_cndmask_b32_e32
70; GCN-NOT: cndmask
; Kernel under test for <4 x i8> where both operands arrive by value as kernel
; arguments instead of being loaded. Unlike the neighbouring tests the
; condition input %c is an i8. The checks above expect exactly one
; v_cndmask_b32_e32.
71define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
  ; Compare on the i8 argument, not on an i32.
72  %cmp = icmp eq i8 %c, 0
73  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
74  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
75  ret void
76}
77
78; GCN-LABEL: {{^}}select_v2i16:
79; GCN: v_cndmask_b32_e32
80; GCN-NOT: v_cndmask_b32
; Kernel under test for <2 x i16> where both operands arrive by value as
; kernel arguments. Selects %a when %c is zero, %b otherwise, and stores the
; result to %out. The checks above expect exactly one v_cndmask_b32_e32.
81define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
  ; One i1 condition controls the entire vector select.
82  %cmp = icmp eq i32 %c, 0
83  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
84  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
85  ret void
86}
87
88; GCN-LABEL: {{^}}v_select_v2i16:
89; GCN: v_cndmask_b32_e32
90; GCN-NOT: cndmask
; Kernel under test for <2 x i16> with operands loaded from addrspace(1)
; pointers. Selects %a when %c is zero, %b otherwise, and stores the result
; to %out. The checks above expect exactly one v_cndmask_b32_e32.
91define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
92  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr
93  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.ptr
  ; One i1 condition controls the entire vector select.
94  %cmp = icmp eq i32 %c, 0
95  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
96  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
97  ret void
98}
99
100; GCN-LABEL: {{^}}v_select_v3i16:
101; SI: v_cndmask_b32_e32
102; SI: cndmask
103; SI-NOT: cndmask
104
105; GFX9: v_cndmask_b32_e32
106; GFX9: cndmask
107; GFX9-NOT: cndmask
108
109; VI: v_cndmask_b32
110; VI: v_cndmask_b32
111; VI: v_cndmask_b32
; Kernel under test for the three-element vector <3 x i16>, loaded from
; addrspace(1) pointers. Selects %a when %c is zero, %b otherwise, and stores
; the result to %out. The checks above expect two cndmask instructions on the
; default and gfx900 runs and three v_cndmask_b32 on the tonga run.
112define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
113  %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr
114  %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr
  ; One i1 condition controls the entire vector select.
115  %cmp = icmp eq i32 %c, 0
116  %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b
117  store <3 x i16> %select, <3 x i16> addrspace(1)* %out, align 4
118  ret void
119}
120
121; GCN-LABEL: {{^}}v_select_v4i16:
122; GCN: v_cndmask_b32_e32
123; GCN: v_cndmask_b32_e32
124; GCN-NOT: cndmask
; Kernel under test for <4 x i16> with operands loaded from addrspace(1)
; pointers. Selects %a when %c is zero, %b otherwise, and stores the result
; to %out. The checks above expect exactly two v_cndmask_b32_e32.
125define amdgpu_kernel void @v_select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %a.ptr, <4 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
126  %a = load <4 x i16>, <4 x i16> addrspace(1)* %a.ptr
127  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b.ptr
  ; One i1 condition controls the entire vector select.
128  %cmp = icmp eq i32 %c, 0
129  %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
130  store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
131  ret void
132}
133
134; GCN-LABEL: {{^}}v_select_v8i16:
135; GCN: v_cndmask_b32_e32
136; GCN: v_cndmask_b32_e32
137; GCN: v_cndmask_b32_e32
138; GCN: v_cndmask_b32_e32
139; GCN-NOT: cndmask
; Kernel under test for <8 x i16> with operands loaded from addrspace(1)
; pointers. Selects %a when %c is zero, %b otherwise, and stores the result
; to %out. The checks above expect exactly four v_cndmask_b32_e32.
140define amdgpu_kernel void @v_select_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %a.ptr, <8 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
141  %a = load <8 x i16>, <8 x i16> addrspace(1)* %a.ptr
142  %b = load <8 x i16>, <8 x i16> addrspace(1)* %b.ptr
  ; One i1 condition controls the entire vector select.
143  %cmp = icmp eq i32 %c, 0
144  %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b
145  store <8 x i16> %select, <8 x i16> addrspace(1)* %out, align 4
146  ret void
147}
148
149; FIXME: Expansion with bitwise operations may be better if doing a
150; vector select with SGPR inputs.
151
152; GCN-LABEL: {{^}}s_select_v2i32:
153; GCN: v_cndmask_b32_e32
154; GCN: v_cndmask_b32_e32
155; GCN: buffer_store_dwordx2
; Kernel under test for <2 x i32> where both operands arrive by value as
; kernel arguments. Selects %a when %c is zero, %b otherwise. The checks above
; expect two v_cndmask_b32_e32 followed by a single buffer_store_dwordx2.
156define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
  ; One i1 condition controls the entire vector select.
157  %cmp = icmp eq i32 %c, 0
158  %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
159  store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
160  ret void
161}
162
163; GCN-LABEL: {{^}}s_select_v4i32:
164; GCN: v_cndmask_b32_e32
165; GCN: v_cndmask_b32_e32
166; GCN: v_cndmask_b32_e32
167; GCN: v_cndmask_b32_e32
168; GCN: buffer_store_dwordx4
; Kernel under test for <4 x i32> where both operands arrive by value as
; kernel arguments. Selects %a when %c is zero, %b otherwise. The checks above
; expect four v_cndmask_b32_e32 followed by a single buffer_store_dwordx4.
169define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
  ; One i1 condition controls the entire vector select.
170  %cmp = icmp eq i32 %c, 0
171  %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
172  store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
173  ret void
174}
175
176; GCN-LABEL: {{^}}v_select_v4i32:
177; GCN: buffer_load_dwordx4
178; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
179; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
180; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
181; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
182; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
183; GCN: buffer_store_dwordx4
; Kernel under test for <4 x i32> with a loaded operand and an all-zero
; alternative. The loaded value is kept when %cond is unsigned-less-than 32,
; otherwise zeroinitializer is stored. The checks above expect a
; v_cmp_lt_u32 against 32 and four v_cndmask_b32_e32 that each take 0 as an
; operand.
184define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
185bb:
  ; Unsigned range test, not an equality test like most kernels in this file.
186  %tmp2 = icmp ult i32 %cond, 32
187  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in
  ; False operand is the constant zero vector.
188  %tmp3 = select i1 %tmp2, <4 x i32> %val, <4 x i32> zeroinitializer
189  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 16
190  ret void
191}
192
193; GCN-LABEL: {{^}}select_v8i32:
194; GCN: v_cndmask_b32_e32
195; GCN: v_cndmask_b32_e32
196; GCN: v_cndmask_b32_e32
197; GCN: v_cndmask_b32_e32
198; GCN: v_cndmask_b32_e32
199; GCN: v_cndmask_b32_e32
200; GCN: v_cndmask_b32_e32
201; GCN: v_cndmask_b32_e32
; Kernel under test for <8 x i32> where both operands arrive by value as
; kernel arguments. Selects %a when %c is zero, %b otherwise, and stores the
; result to %out with align 16. The checks above expect eight
; v_cndmask_b32_e32.
202define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 {
  ; One i1 condition controls the entire vector select.
203  %cmp = icmp eq i32 %c, 0
204  %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
205  store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
206  ret void
207}
208
209; GCN-LABEL: {{^}}s_select_v2f32:
210; GCN-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
211; GCN-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
212
213; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
214; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
215; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
216; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
217
218; GCN: v_cndmask_b32_e32
219; GCN: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
220; GCN: v_cndmask_b32_e32
221; GCN: buffer_store_dwordx2
; Kernel under test for <2 x float> where both operands arrive by value as
; kernel arguments. Selects %a when %c is zero, %b otherwise. The checks above
; match the two s_load_dwordx2 argument loads, the copies of their halves
; into VGPRs, a v_cmp_eq_u32 against 0, two v_cndmask_b32_e32, and a single
; buffer_store_dwordx2.
222define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
  ; One i1 condition controls the entire vector select.
223  %cmp = icmp eq i32 %c, 0
224  %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
  ; NOTE(review) this store uses align 16 while the <2 x i32> counterpart in
  ; this file uses align 8 - presumably intentional, confirm before changing.
225  store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
226  ret void
227}
228
229; GCN-LABEL: {{^}}s_select_v4f32:
230; GCN: s_load_dwordx4
231; GCN: s_load_dwordx4
232; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
233
234; GCN: v_cndmask_b32_e32
235; GCN: v_cndmask_b32_e32
236; GCN: v_cndmask_b32_e32
237; GCN: v_cndmask_b32_e32
238
239; GCN: buffer_store_dwordx4
; Kernel under test for <4 x float> where both operands arrive by value as
; kernel arguments. Selects %a when %c is zero, %b otherwise. The checks above
; expect two s_load_dwordx4, a v_cmp_eq_u32 against 0, four
; v_cndmask_b32_e32, and a single buffer_store_dwordx4.
240define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 {
  ; One i1 condition controls the entire vector select.
241  %cmp = icmp eq i32 %c, 0
242  %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
243  store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
244  ret void
245}
246
247; GCN-LABEL: {{^}}v_select_v4f32:
248; GCN: buffer_load_dwordx4
249; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
250; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
251; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
252; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
253; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
254; GCN: buffer_store_dwordx4
; Kernel under test for <4 x float> with a loaded operand and an all-zero
; alternative. The loaded value is kept when %cond is unsigned-less-than 32,
; otherwise zeroinitializer is stored. The checks above expect a
; v_cmp_lt_u32 against 32 and four v_cndmask_b32_e32 that each take 0 as an
; operand.
255define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
256bb:
  ; Unsigned range test, not an equality test like most kernels in this file.
257  %tmp2 = icmp ult i32 %cond, 32
258  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  ; False operand is the constant zero vector.
259  %tmp3 = select i1 %tmp2, <4 x float> %val, <4 x float> zeroinitializer
260  store <4 x float> %tmp3, <4 x float> addrspace(1)* %out, align 16
261  ret void
262}
263
264; GCN-LABEL: {{^}}select_v8f32:
265; GCN: v_cndmask_b32_e32
266; GCN: v_cndmask_b32_e32
267; GCN: v_cndmask_b32_e32
268; GCN: v_cndmask_b32_e32
269; GCN: v_cndmask_b32_e32
270; GCN: v_cndmask_b32_e32
271; GCN: v_cndmask_b32_e32
272; GCN: v_cndmask_b32_e32
; Kernel under test for <8 x float> where both operands arrive by value as
; kernel arguments. Selects %a when %c is zero, %b otherwise, and stores the
; result to %out with align 16. The checks above expect eight
; v_cndmask_b32_e32.
273define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 {
  ; One i1 condition controls the entire vector select.
274  %cmp = icmp eq i32 %c, 0
275  %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
276  store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
277  ret void
278}
279
280; GCN-LABEL: {{^}}select_v2f64:
281; GCN: v_cndmask_b32_e32
282; GCN: v_cndmask_b32_e32
283; GCN: v_cndmask_b32_e32
284; GCN: v_cndmask_b32_e32
; Kernel under test for <2 x double> where both operands arrive by value as
; kernel arguments. Selects %a when %c is zero, %b otherwise, and stores the
; result to %out. The checks above expect four v_cndmask_b32_e32, which is
; two 32-bit selects per 64-bit element.
285define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
  ; One i1 condition controls the entire vector select.
286  %cmp = icmp eq i32 %c, 0
287  %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
288  store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
289  ret void
290}
291
292; GCN-LABEL: {{^}}select_v4f64:
293; GCN: v_cndmask_b32_e32
294; GCN: v_cndmask_b32_e32
295; GCN: v_cndmask_b32_e32
296; GCN: v_cndmask_b32_e32
297; GCN: v_cndmask_b32_e32
298; GCN: v_cndmask_b32_e32
299; GCN: v_cndmask_b32_e32
300; GCN: v_cndmask_b32_e32
; Kernel under test for <4 x double> where both operands arrive by value as
; kernel arguments. Selects %a when %c is zero, %b otherwise, and stores the
; result to %out. The checks above expect eight v_cndmask_b32_e32, which is
; two 32-bit selects per 64-bit element.
301define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 {
  ; One i1 condition controls the entire vector select.
302  %cmp = icmp eq i32 %c, 0
303  %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
304  store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
305  ret void
306}
307
308; GCN-LABEL: {{^}}select_v8f64:
309; GCN: v_cndmask_b32_e32
310; GCN: v_cndmask_b32_e32
311; GCN: v_cndmask_b32_e32
312; GCN: v_cndmask_b32_e32
313; GCN: v_cndmask_b32_e32
314; GCN: v_cndmask_b32_e32
315; GCN: v_cndmask_b32_e32
316; GCN: v_cndmask_b32_e32
317; GCN: v_cndmask_b32_e32
318; GCN: v_cndmask_b32_e32
319; GCN: v_cndmask_b32_e32
320; GCN: v_cndmask_b32_e32
321; GCN: v_cndmask_b32_e32
322; GCN: v_cndmask_b32_e32
323; GCN: v_cndmask_b32_e32
324; GCN: v_cndmask_b32_e32
; Kernel under test for <8 x double> where both operands arrive by value as
; kernel arguments. Selects %a when %c is zero, %b otherwise, and stores the
; result to %out. The checks above expect sixteen v_cndmask_b32_e32, which is
; two 32-bit selects per 64-bit element.
325define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
  ; One i1 condition controls the entire vector select.
326  %cmp = icmp eq i32 %c, 0
327  %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
328  store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
329  ret void
330}
331
332; GCN-LABEL: {{^}}v_select_v2f16:
333; GCN: v_cndmask_b32_e32
334; GCN-NOT: cndmask
; Kernel under test for <2 x half> with operands loaded from addrspace(1)
; pointers. Selects %a when %c is zero, %b otherwise, and stores the result
; to %out. The checks above expect exactly one v_cndmask_b32_e32.
335define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
336  %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr
337  %b = load <2 x half>, <2 x half> addrspace(1)* %b.ptr
  ; One i1 condition controls the entire vector select.
338  %cmp = icmp eq i32 %c, 0
339  %select = select i1 %cmp, <2 x half> %a, <2 x half> %b
340  store <2 x half> %select, <2 x half> addrspace(1)* %out, align 4
341  ret void
342}
343
344; GCN-LABEL: {{^}}v_select_v3f16:
345; GCN: v_cndmask_b32_e32
346; GCN: v_cndmask_b32_e32
347; GCN-NOT: cndmask
; Kernel under test for the three-element vector <3 x half>, loaded from
; addrspace(1) pointers. Selects %a when %c is zero, %b otherwise, and stores
; the result to %out. The checks above expect exactly two v_cndmask_b32_e32
; on every run configuration.
348define amdgpu_kernel void @v_select_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %a.ptr, <3 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
349  %a = load <3 x half>, <3 x half> addrspace(1)* %a.ptr
350  %b = load <3 x half>, <3 x half> addrspace(1)* %b.ptr
  ; One i1 condition controls the entire vector select.
351  %cmp = icmp eq i32 %c, 0
352  %select = select i1 %cmp, <3 x half> %a, <3 x half> %b
353  store <3 x half> %select, <3 x half> addrspace(1)* %out, align 4
354  ret void
355}
356
357; GCN-LABEL: {{^}}v_select_v4f16:
358; GCN: v_cndmask_b32_e32
359; GCN: v_cndmask_b32_e32
360; GCN-NOT: cndmask
; Kernel under test for <4 x half> with operands loaded from addrspace(1)
; pointers. Selects %a when %c is zero, %b otherwise, and stores the result
; to %out. The checks above expect exactly two v_cndmask_b32_e32.
361define amdgpu_kernel void @v_select_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %a.ptr, <4 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
362  %a = load <4 x half>, <4 x half> addrspace(1)* %a.ptr
363  %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
  ; One i1 condition controls the entire vector select.
364  %cmp = icmp eq i32 %c, 0
365  %select = select i1 %cmp, <4 x half> %a, <4 x half> %b
366  store <4 x half> %select, <4 x half> addrspace(1)* %out, align 4
367  ret void
368}
369
; Declaration of the workitem-id intrinsic, carrying attribute group #1.
; NOTE(review) no kernel visible in this file calls it - confirm whether it
; is still needed before removing.
370; Function Attrs: nounwind readnone
371declare i32 @llvm.amdgcn.workitem.id.x() #1
372
; Attribute groups. #0 is attached to every kernel definition in this file,
; #1 to the intrinsic declaration.
373attributes #0 = { nounwind }
374attributes #1 = { nounwind readnone }
375