; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs  | FileCheck %s --check-prefix=SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI

; Select of two loaded f16 values keyed on an olt compare of two other loaded
; f16 values. SI has no f16 compare, so operands are promoted to f32
; (v_cvt_f32_f16) before v_cmp_lt_f32; VI compares in f16 directly.
define amdgpu_kernel void @select_f16(
; SI-LABEL: select_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s8
; SI-NEXT:    s_mov_b32 s21, s9
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s6
; VI-NEXT:    s_mov_b32 s17, s7
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s20, s8
; VI-NEXT:    s_mov_b32 s21, s9
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s22, s2
; VI-NEXT:    s_mov_b32 s23, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  ; volatile loads keep each buffer_load ordered (glc + waitcnt in the output)
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}
97
; Same select, but the compare LHS is the constant 0xH3800 (f16 0.5), which
; both targets fold as the inline immediate 0.5 into the compare.
define amdgpu_kernel void @select_f16_imm_a(
; SI-LABEL: select_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  ; 0xH3800 = f16 0.5
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}
177
; Compare RHS is the constant 0xH3800 (f16 0.5); the compare is commuted so
; the immediate lands in src0 (v_cmp_gt ... 0.5, v0).
define amdgpu_kernel void @select_f16_imm_b(
; SI-LABEL: select_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  ; 0xH3800 = f16 0.5
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}
257
; Constant 0xH3800 (f16 0.5) as the select's TRUE operand; codegen inverts the
; condition (nlt) and swaps the cndmask operands. SI folds 0.5 as an inline
; immediate, VI must materialize the raw f16 bits 0x3800 in a VGPR.
define amdgpu_kernel void @select_f16_imm_c(
; SI-LABEL: select_f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  ; 0xH3800 = f16 0.5
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}
338
; Constant 0xH3800 (f16 0.5) as the select's FALSE operand; the compare stays
; lt and the constant takes the src0 (false) slot of v_cndmask.
define amdgpu_kernel void @select_f16_imm_d(
; SI-LABEL: select_f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %fcmp = fcmp olt half %a.val, %b.val
  ; 0xH3800 = f16 0.5
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, half addrspace(1)* %r
  ret void
}
419
; Element-wise select on <2 x half>. Each dword load holds two f16 lanes; the
; high lane is extracted with lshr 16, the two scalar selects are done
; independently, and the result is repacked with lshl/or (sdwa on VI).
define amdgpu_kernel void @select_v2f16(
; SI-LABEL: select_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s8
; SI-NEXT:    s_mov_b32 s21, s9
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
; SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s6
; VI-NEXT:    s_mov_b32 s17, s7
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s20, s8
; VI-NEXT:    s_mov_b32 s21, s9
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s22, s2
; VI-NEXT:    s_mov_b32 s23, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
; VI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  ; non-volatile loads here, so the buffer_loads may issue back-to-back
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
532
; Vector compare LHS is the constant <0.5, 0.625>. Lane 0's 0.5 is an inline
; immediate; lane 1's 0.625 needs a register constant (f32 0x3f200000 on SI,
; raw f16 bits 0x3900 on VI).
define amdgpu_kernel void @select_v2f16_imm_a(
; SI-LABEL: select_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s2, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, s2, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  ; 0xH3800 = 0.5, 0xH3900 = 0.625
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
631
; Vector compare RHS is the constant <0.5, 0.625>; the compares are commuted
; to gt so each constant lands in src0 (inline 0.5 for lane 0, register
; constant for lane 1).
define amdgpu_kernel void @select_v2f16_imm_b(
; SI-LABEL: select_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s2, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, s2, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  ; 0xH3800 = 0.5, 0xH3900 = 0.625
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
730
; Constant <0.5, 0.625> as the select's TRUE operand; codegen inverts the
; compares (nlt) and swaps the cndmask operands, materializing the non-inline
; constants in VGPRs (f32 0x3f200000 on SI; 0x3800/0x3900 f16 bits on VI).
define amdgpu_kernel void @select_v2f16_imm_c(
; SI-LABEL: select_v2f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v5
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v4, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cndmask_b32_e32 v1, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  ; 0xH3800 = 0.5, 0xH3900 = 0.625
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
831
; <2 x half> select where the immediate vector <half 0xH3800 (0.5),
; half 0xH3900 (0.625)> is the FALSE operand and a loaded value is the TRUE
; operand, with an ordered less-than (olt) compare.
; Per the checks below:
;  - SI (tahiti): each half is split out with v_lshrrev_b32, promoted with
;    v_cvt_f32_f16, compared via v_cmp_lt_f32, and selected with v_cndmask;
;    0.5 is usable as an inline constant while 0.625 requires the f32
;    literal 0x3f200000 materialized in v3.
;  - VI (fiji): compares the halves directly with v_cmp_lt_f16 and keeps
;    both constants as 16-bit literals (0x3800 / 0x3900).
define amdgpu_kernel void @select_v2f16_imm_d(
; SI-LABEL: select_v2f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  ; Lane-wise ordered less-than; one i1 per half element.
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  ; Constant vector is the false operand: chosen when !(a < b) per lane.
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
932