; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
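; Check codegen for select on f16 values with an fcmp condition. SI has no
; native f16 arithmetic, so operands are extended to f32 with v_cvt_f32_f16,
; compared and selected as f32, and the result is truncated back with
; v_cvt_f16_f32. VI compares and selects the f16 values directly.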

; GCN-LABEL: {{^}}select_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI:  v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %d.val = load half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

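; In the _imm_ variants, 0xH3800 is half 0.5, an inline constant. When it is a
; compare operand it folds into the instruction, with the comparison commuted
; (lt -> gt) or inverted (lt -> nlt) as needed to keep the constant foldable;
; when it is a select operand, SI folds it into v_cndmask_b32 while VI first
; materializes it with v_mov_b32.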
; GCN-LABEL: {{^}}select_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]]
; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI:  v_cmp_lt_f16_e32 vcc, 0.5, v[[B_F16]]
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %d.val = load half, half addrspace(1)* %d
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cmp_gt_f32_e32 vcc, 0.5, v[[A_F32]]
; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]

; VI:  v_cmp_gt_f16_e32 vcc, 0.5, v[[A_F16]]
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %c.val = load half, half addrspace(1)* %c
  %d.val = load half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_f16_imm_c:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI:  v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[D_F32]], vcc
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]

; VI:  v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI:  v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}}
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_c(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %d) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %d.val = load half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_f16_imm_d:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI:  v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI:  v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI:  v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}}
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_d(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, half addrspace(1)* %r
  ret void
}

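; The <2 x half> variants handle each component separately: a compare and a
; v_cndmask_b32 are emitted per component, using the _e32 (vcc) and _e64
; (arbitrary SGPR pair) encodings as register allocation requires.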
; GCN-LABEL: {{^}}select_v2f16:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cmp_lt_f32_e64
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e64
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32

; VI: v_cmp_lt_f16_e64
; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e64
; VI: v_cndmask_b32_e32

; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_v2f16_imm_a:
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI-DAG:  v_cmp_gt_f32_e64
; SI-DAG:  v_cmp_lt_f32_e32 vcc, 0.5

; VI:  v_cmp_lt_f16_e32
; VI:  v_cmp_gt_f16_e64
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e64
; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_v2f16_imm_b:
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI-DAG:  v_cmp_lt_f32_e64
; SI-DAG:  v_cmp_gt_f32_e32 vcc, 0.5

; VI:  v_cmp_gt_f16_e32
; VI:  v_cmp_lt_f16_e64
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e64

; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_v2f16_imm_c:
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32

; SI: v_cmp_nlt_f32_e32
; SI: v_cmp_nlt_f32_e64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e32

; VI: v_cmp_nlt_f16_e32
; VI: v_cndmask_b32_e32

; VI: v_cmp_nlt_f16_e32
; VI: v_cndmask_b32_e32

; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_c(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_v2f16_imm_d:
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cmp_lt_f32_e64
; SI:  v_cmp_lt_f32_e32

; VI:  v_cmp_lt_f16_e32
; VI:  v_cmp_lt_f16_e64
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32
; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_d(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
