1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs  | FileCheck %s --check-prefix=SI
3; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
4
; Baseline scalar case: r = (a < b) ? c : d on half values, with all four
; inputs read through volatile loads and the comparison being an ordered
; less-than (fcmp olt).
; The tahiti run expects every input to be widened with v_cvt_f32_f16, the
; compare and v_cndmask to be done on f32, and the result narrowed again with
; v_cvt_f16_f32. The fiji run expects a direct v_cmp_lt_f16 followed by
; v_cndmask with no conversions.
5define amdgpu_kernel void @select_f16(
6; SI-LABEL: select_f16:
7; SI:       ; %bb.0: ; %entry
8; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
9; SI-NEXT:    s_mov_b32 s3, 0xf000
10; SI-NEXT:    s_mov_b32 s2, -1
11; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
12; SI-NEXT:    s_mov_b32 s18, s2
13; SI-NEXT:    s_waitcnt lgkmcnt(0)
14; SI-NEXT:    s_mov_b32 s16, s6
15; SI-NEXT:    s_mov_b32 s17, s7
16; SI-NEXT:    s_mov_b32 s19, s3
17; SI-NEXT:    s_mov_b32 s20, s8
18; SI-NEXT:    s_mov_b32 s21, s9
19; SI-NEXT:    s_mov_b32 s22, s2
20; SI-NEXT:    s_mov_b32 s23, s3
21; SI-NEXT:    s_mov_b32 s8, s10
22; SI-NEXT:    s_mov_b32 s9, s11
23; SI-NEXT:    s_mov_b32 s10, s2
24; SI-NEXT:    s_mov_b32 s11, s3
25; SI-NEXT:    s_mov_b32 s14, s2
26; SI-NEXT:    s_mov_b32 s15, s3
27; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
28; SI-NEXT:    s_waitcnt vmcnt(0)
29; SI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0 glc
30; SI-NEXT:    s_waitcnt vmcnt(0)
31; SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
32; SI-NEXT:    s_waitcnt vmcnt(0)
33; SI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0 glc
34; SI-NEXT:    s_waitcnt vmcnt(0)
35; SI-NEXT:    s_mov_b32 s0, s4
36; SI-NEXT:    s_mov_b32 s1, s5
37; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
38; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
39; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
40; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
41; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
42; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
43; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
44; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
45; SI-NEXT:    s_endpgm
46;
47; VI-LABEL: select_f16:
48; VI:       ; %bb.0: ; %entry
49; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
50; VI-NEXT:    s_mov_b32 s3, 0xf000
51; VI-NEXT:    s_mov_b32 s2, -1
52; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
53; VI-NEXT:    s_mov_b32 s18, s2
54; VI-NEXT:    s_waitcnt lgkmcnt(0)
55; VI-NEXT:    s_mov_b32 s16, s6
56; VI-NEXT:    s_mov_b32 s17, s7
57; VI-NEXT:    s_mov_b32 s19, s3
58; VI-NEXT:    s_mov_b32 s20, s8
59; VI-NEXT:    s_mov_b32 s21, s9
60; VI-NEXT:    s_mov_b32 s22, s2
61; VI-NEXT:    s_mov_b32 s23, s3
62; VI-NEXT:    s_mov_b32 s8, s10
63; VI-NEXT:    s_mov_b32 s9, s11
64; VI-NEXT:    s_mov_b32 s10, s2
65; VI-NEXT:    s_mov_b32 s11, s3
66; VI-NEXT:    s_mov_b32 s14, s2
67; VI-NEXT:    s_mov_b32 s15, s3
68; VI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
69; VI-NEXT:    s_waitcnt vmcnt(0)
70; VI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0 glc
71; VI-NEXT:    s_waitcnt vmcnt(0)
72; VI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
73; VI-NEXT:    s_waitcnt vmcnt(0)
74; VI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0 glc
75; VI-NEXT:    s_waitcnt vmcnt(0)
76; VI-NEXT:    s_mov_b32 s0, s4
77; VI-NEXT:    s_mov_b32 s1, s5
78; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
79; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
80; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
81; VI-NEXT:    s_endpgm
82    half addrspace(1)* %r,
83    half addrspace(1)* %a,
84    half addrspace(1)* %b,
85    half addrspace(1)* %c,
86    half addrspace(1)* %d) {
87entry:
  ; Volatile loads: the expected output issues them one at a time, each
  ; followed by its own s_waitcnt vmcnt(0).
88  %a.val = load volatile half, half addrspace(1)* %a
89  %b.val = load volatile half, half addrspace(1)* %b
90  %c.val = load volatile half, half addrspace(1)* %c
91  %d.val = load volatile half, half addrspace(1)* %d
  ; Condition under test: ordered a < b picks c, otherwise d.
92  %fcmp = fcmp olt half %a.val, %b.val
93  %r.val = select i1 %fcmp, half %c.val, half %d.val
94  store half %r.val, half addrspace(1)* %r
95  ret void
96}
97
; Scalar half select where the left-hand compare operand is the constant
; 0xH3800 (half 0.5): r = (0.5 < b) ? c : d.
; Both runs expect the constant to appear as the inline operand 0.5 in the
; first source slot of v_cmp_lt (f32 on tahiti after widening, f16 on fiji),
; so no register is spent on materializing it.
98define amdgpu_kernel void @select_f16_imm_a(
99; SI-LABEL: select_f16_imm_a:
100; SI:       ; %bb.0: ; %entry
101; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
102; SI-NEXT:    s_mov_b32 s11, 0xf000
103; SI-NEXT:    s_mov_b32 s10, -1
104; SI-NEXT:    s_mov_b32 s14, s10
105; SI-NEXT:    s_mov_b32 s15, s11
106; SI-NEXT:    s_waitcnt lgkmcnt(0)
107; SI-NEXT:    s_mov_b32 s12, s2
108; SI-NEXT:    s_mov_b32 s13, s3
109; SI-NEXT:    s_mov_b32 s16, s4
110; SI-NEXT:    s_mov_b32 s17, s5
111; SI-NEXT:    s_mov_b32 s18, s10
112; SI-NEXT:    s_mov_b32 s19, s11
113; SI-NEXT:    s_mov_b32 s4, s6
114; SI-NEXT:    s_mov_b32 s5, s7
115; SI-NEXT:    s_mov_b32 s6, s10
116; SI-NEXT:    s_mov_b32 s7, s11
117; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
118; SI-NEXT:    s_waitcnt vmcnt(0)
119; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
120; SI-NEXT:    s_waitcnt vmcnt(0)
121; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
122; SI-NEXT:    s_waitcnt vmcnt(0)
123; SI-NEXT:    s_mov_b32 s8, s0
124; SI-NEXT:    s_mov_b32 s9, s1
125; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
126; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
127; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
128; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
129; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
130; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
131; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
132; SI-NEXT:    s_endpgm
133;
134; VI-LABEL: select_f16_imm_a:
135; VI:       ; %bb.0: ; %entry
136; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
137; VI-NEXT:    s_mov_b32 s11, 0xf000
138; VI-NEXT:    s_mov_b32 s10, -1
139; VI-NEXT:    s_mov_b32 s14, s10
140; VI-NEXT:    s_mov_b32 s15, s11
141; VI-NEXT:    s_waitcnt lgkmcnt(0)
142; VI-NEXT:    s_mov_b32 s12, s2
143; VI-NEXT:    s_mov_b32 s13, s3
144; VI-NEXT:    s_mov_b32 s16, s4
145; VI-NEXT:    s_mov_b32 s17, s5
146; VI-NEXT:    s_mov_b32 s18, s10
147; VI-NEXT:    s_mov_b32 s19, s11
148; VI-NEXT:    s_mov_b32 s4, s6
149; VI-NEXT:    s_mov_b32 s5, s7
150; VI-NEXT:    s_mov_b32 s6, s10
151; VI-NEXT:    s_mov_b32 s7, s11
152; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
153; VI-NEXT:    s_waitcnt vmcnt(0)
154; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
155; VI-NEXT:    s_waitcnt vmcnt(0)
156; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
157; VI-NEXT:    s_waitcnt vmcnt(0)
158; VI-NEXT:    s_mov_b32 s8, s0
159; VI-NEXT:    s_mov_b32 s9, s1
160; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
161; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
162; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
163; VI-NEXT:    s_endpgm
164    half addrspace(1)* %r,
165    half addrspace(1)* %b,
166    half addrspace(1)* %c,
167    half addrspace(1)* %d) {
168entry:
169  %b.val = load volatile half, half addrspace(1)* %b
170  %c.val = load volatile half, half addrspace(1)* %c
171  %d.val = load volatile half, half addrspace(1)* %d
  ; 0xH3800 is half 0.5; it sits on the left of the compare here.
172  %fcmp = fcmp olt half 0xH3800, %b.val
173  %r.val = select i1 %fcmp, half %c.val, half %d.val
174  store half %r.val, half addrspace(1)* %r
175  ret void
176}
177
; Scalar half select where the right-hand compare operand is the constant
; 0xH3800 (half 0.5): r = (a < 0.5) ? c : d.
; Both runs expect the compare to be commuted: the constant is emitted as the
; inline operand 0.5 in the first source slot and the predicate becomes
; v_cmp_gt (0.5 > a) instead of v_cmp_lt.
178define amdgpu_kernel void @select_f16_imm_b(
179; SI-LABEL: select_f16_imm_b:
180; SI:       ; %bb.0: ; %entry
181; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
182; SI-NEXT:    s_mov_b32 s11, 0xf000
183; SI-NEXT:    s_mov_b32 s10, -1
184; SI-NEXT:    s_mov_b32 s14, s10
185; SI-NEXT:    s_mov_b32 s15, s11
186; SI-NEXT:    s_waitcnt lgkmcnt(0)
187; SI-NEXT:    s_mov_b32 s12, s2
188; SI-NEXT:    s_mov_b32 s13, s3
189; SI-NEXT:    s_mov_b32 s16, s4
190; SI-NEXT:    s_mov_b32 s17, s5
191; SI-NEXT:    s_mov_b32 s18, s10
192; SI-NEXT:    s_mov_b32 s19, s11
193; SI-NEXT:    s_mov_b32 s4, s6
194; SI-NEXT:    s_mov_b32 s5, s7
195; SI-NEXT:    s_mov_b32 s6, s10
196; SI-NEXT:    s_mov_b32 s7, s11
197; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
198; SI-NEXT:    s_waitcnt vmcnt(0)
199; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
200; SI-NEXT:    s_waitcnt vmcnt(0)
201; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
202; SI-NEXT:    s_waitcnt vmcnt(0)
203; SI-NEXT:    s_mov_b32 s8, s0
204; SI-NEXT:    s_mov_b32 s9, s1
205; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
206; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
207; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
208; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
209; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
210; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
211; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
212; SI-NEXT:    s_endpgm
213;
214; VI-LABEL: select_f16_imm_b:
215; VI:       ; %bb.0: ; %entry
216; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
217; VI-NEXT:    s_mov_b32 s11, 0xf000
218; VI-NEXT:    s_mov_b32 s10, -1
219; VI-NEXT:    s_mov_b32 s14, s10
220; VI-NEXT:    s_mov_b32 s15, s11
221; VI-NEXT:    s_waitcnt lgkmcnt(0)
222; VI-NEXT:    s_mov_b32 s12, s2
223; VI-NEXT:    s_mov_b32 s13, s3
224; VI-NEXT:    s_mov_b32 s16, s4
225; VI-NEXT:    s_mov_b32 s17, s5
226; VI-NEXT:    s_mov_b32 s18, s10
227; VI-NEXT:    s_mov_b32 s19, s11
228; VI-NEXT:    s_mov_b32 s4, s6
229; VI-NEXT:    s_mov_b32 s5, s7
230; VI-NEXT:    s_mov_b32 s6, s10
231; VI-NEXT:    s_mov_b32 s7, s11
232; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
233; VI-NEXT:    s_waitcnt vmcnt(0)
234; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
235; VI-NEXT:    s_waitcnt vmcnt(0)
236; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
237; VI-NEXT:    s_waitcnt vmcnt(0)
238; VI-NEXT:    s_mov_b32 s8, s0
239; VI-NEXT:    s_mov_b32 s9, s1
240; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
241; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
242; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
243; VI-NEXT:    s_endpgm
244    half addrspace(1)* %r,
245    half addrspace(1)* %a,
246    half addrspace(1)* %c,
247    half addrspace(1)* %d) {
248entry:
249  %a.val = load volatile half, half addrspace(1)* %a
250  %c.val = load volatile half, half addrspace(1)* %c
251  %d.val = load volatile half, half addrspace(1)* %d
  ; 0xH3800 is half 0.5; it sits on the right of the compare here.
252  %fcmp = fcmp olt half %a.val, 0xH3800
253  %r.val = select i1 %fcmp, half %c.val, half %d.val
254  store half %r.val, half addrspace(1)* %r
255  ret void
256}
257
; Scalar half select where the true value is the constant 0xH3800 (half 0.5):
; r = (a < b) ? 0.5 : d.
; Both runs expect the predicate to be inverted to v_cmp_nlt with the constant
; placed in the first source slot of v_cndmask. On the tahiti run it is the
; inline operand 0.5 (the select is done on f32); on the fiji run the raw half
; bit pattern 0x3800 is first moved into v3 with v_mov_b32.
258define amdgpu_kernel void @select_f16_imm_c(
259; SI-LABEL: select_f16_imm_c:
260; SI:       ; %bb.0: ; %entry
261; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
262; SI-NEXT:    s_mov_b32 s11, 0xf000
263; SI-NEXT:    s_mov_b32 s10, -1
264; SI-NEXT:    s_mov_b32 s14, s10
265; SI-NEXT:    s_mov_b32 s15, s11
266; SI-NEXT:    s_waitcnt lgkmcnt(0)
267; SI-NEXT:    s_mov_b32 s12, s2
268; SI-NEXT:    s_mov_b32 s13, s3
269; SI-NEXT:    s_mov_b32 s16, s4
270; SI-NEXT:    s_mov_b32 s17, s5
271; SI-NEXT:    s_mov_b32 s18, s10
272; SI-NEXT:    s_mov_b32 s19, s11
273; SI-NEXT:    s_mov_b32 s4, s6
274; SI-NEXT:    s_mov_b32 s5, s7
275; SI-NEXT:    s_mov_b32 s6, s10
276; SI-NEXT:    s_mov_b32 s7, s11
277; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
278; SI-NEXT:    s_waitcnt vmcnt(0)
279; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
280; SI-NEXT:    s_waitcnt vmcnt(0)
281; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
282; SI-NEXT:    s_waitcnt vmcnt(0)
283; SI-NEXT:    s_mov_b32 s8, s0
284; SI-NEXT:    s_mov_b32 s9, s1
285; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
286; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
287; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
288; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
289; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
290; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
291; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
292; SI-NEXT:    s_endpgm
293;
294; VI-LABEL: select_f16_imm_c:
295; VI:       ; %bb.0: ; %entry
296; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
297; VI-NEXT:    s_mov_b32 s11, 0xf000
298; VI-NEXT:    s_mov_b32 s10, -1
299; VI-NEXT:    s_mov_b32 s14, s10
300; VI-NEXT:    s_mov_b32 s15, s11
301; VI-NEXT:    s_waitcnt lgkmcnt(0)
302; VI-NEXT:    s_mov_b32 s12, s2
303; VI-NEXT:    s_mov_b32 s13, s3
304; VI-NEXT:    s_mov_b32 s16, s4
305; VI-NEXT:    s_mov_b32 s17, s5
306; VI-NEXT:    s_mov_b32 s18, s10
307; VI-NEXT:    s_mov_b32 s19, s11
308; VI-NEXT:    s_mov_b32 s4, s6
309; VI-NEXT:    s_mov_b32 s5, s7
310; VI-NEXT:    s_mov_b32 s6, s10
311; VI-NEXT:    s_mov_b32 s7, s11
312; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
313; VI-NEXT:    s_waitcnt vmcnt(0)
314; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
315; VI-NEXT:    s_waitcnt vmcnt(0)
316; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
317; VI-NEXT:    s_waitcnt vmcnt(0)
318; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
319; VI-NEXT:    s_mov_b32 s8, s0
320; VI-NEXT:    s_mov_b32 s9, s1
321; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
322; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
323; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
324; VI-NEXT:    s_endpgm
325    half addrspace(1)* %r,
326    half addrspace(1)* %a,
327    half addrspace(1)* %b,
328    half addrspace(1)* %d) {
329entry:
330  %a.val = load volatile half, half addrspace(1)* %a
331  %b.val = load volatile half, half addrspace(1)* %b
332  %d.val = load volatile half, half addrspace(1)* %d
333  %fcmp = fcmp olt half %a.val, %b.val
  ; 0xH3800 is half 0.5; it is the value chosen when the compare is true.
334  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
335  store half %r.val, half addrspace(1)* %r
336  ret void
337}
338
; Scalar half select where the false value is the constant 0xH3800 (half 0.5):
; r = (a < b) ? c : 0.5.
; Unlike the constant-as-true-value case, both runs keep the v_cmp_lt
; predicate as written. The constant again occupies the first source slot of
; v_cndmask: inline 0.5 on the tahiti run, and the bit pattern 0x3800 moved
; into v3 on the fiji run.
339define amdgpu_kernel void @select_f16_imm_d(
340; SI-LABEL: select_f16_imm_d:
341; SI:       ; %bb.0: ; %entry
342; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
343; SI-NEXT:    s_mov_b32 s11, 0xf000
344; SI-NEXT:    s_mov_b32 s10, -1
345; SI-NEXT:    s_mov_b32 s14, s10
346; SI-NEXT:    s_mov_b32 s15, s11
347; SI-NEXT:    s_waitcnt lgkmcnt(0)
348; SI-NEXT:    s_mov_b32 s12, s2
349; SI-NEXT:    s_mov_b32 s13, s3
350; SI-NEXT:    s_mov_b32 s16, s4
351; SI-NEXT:    s_mov_b32 s17, s5
352; SI-NEXT:    s_mov_b32 s18, s10
353; SI-NEXT:    s_mov_b32 s19, s11
354; SI-NEXT:    s_mov_b32 s4, s6
355; SI-NEXT:    s_mov_b32 s5, s7
356; SI-NEXT:    s_mov_b32 s6, s10
357; SI-NEXT:    s_mov_b32 s7, s11
358; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
359; SI-NEXT:    s_waitcnt vmcnt(0)
360; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
361; SI-NEXT:    s_waitcnt vmcnt(0)
362; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
363; SI-NEXT:    s_waitcnt vmcnt(0)
364; SI-NEXT:    s_mov_b32 s8, s0
365; SI-NEXT:    s_mov_b32 s9, s1
366; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
367; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
368; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
369; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
370; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
371; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
372; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
373; SI-NEXT:    s_endpgm
374;
375; VI-LABEL: select_f16_imm_d:
376; VI:       ; %bb.0: ; %entry
377; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
378; VI-NEXT:    s_mov_b32 s11, 0xf000
379; VI-NEXT:    s_mov_b32 s10, -1
380; VI-NEXT:    s_mov_b32 s14, s10
381; VI-NEXT:    s_mov_b32 s15, s11
382; VI-NEXT:    s_waitcnt lgkmcnt(0)
383; VI-NEXT:    s_mov_b32 s12, s2
384; VI-NEXT:    s_mov_b32 s13, s3
385; VI-NEXT:    s_mov_b32 s16, s4
386; VI-NEXT:    s_mov_b32 s17, s5
387; VI-NEXT:    s_mov_b32 s18, s10
388; VI-NEXT:    s_mov_b32 s19, s11
389; VI-NEXT:    s_mov_b32 s4, s6
390; VI-NEXT:    s_mov_b32 s5, s7
391; VI-NEXT:    s_mov_b32 s6, s10
392; VI-NEXT:    s_mov_b32 s7, s11
393; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
394; VI-NEXT:    s_waitcnt vmcnt(0)
395; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
396; VI-NEXT:    s_waitcnt vmcnt(0)
397; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
398; VI-NEXT:    s_waitcnt vmcnt(0)
399; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
400; VI-NEXT:    s_mov_b32 s8, s0
401; VI-NEXT:    s_mov_b32 s9, s1
402; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
403; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
404; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
405; VI-NEXT:    s_endpgm
406    half addrspace(1)* %r,
407    half addrspace(1)* %a,
408    half addrspace(1)* %b,
409    half addrspace(1)* %c) {
410entry:
411  %a.val = load volatile half, half addrspace(1)* %a
412  %b.val = load volatile half, half addrspace(1)* %b
413  %c.val = load volatile half, half addrspace(1)* %c
414  %fcmp = fcmp olt half %a.val, %b.val
  ; 0xH3800 is half 0.5; it is the value chosen when the compare is false.
415  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
416  store half %r.val, half addrspace(1)* %r
417  ret void
418}
419
; Baseline vector case: element-wise r = (a < b) ? c : d on <2 x half>, using
; ordinary (non-volatile) loads.
; Each vector is expected to be fetched as one dword. Both runs then split the
; work per element: the high half is extracted with a 16-bit right shift, each
; element gets its own compare + v_cndmask, and the two results are repacked
; with a 16-bit left shift and an OR (plain v_or_b32 on the tahiti run, the
; SDWA form on the fiji run). The tahiti run additionally widens every element
; to f32 and narrows the selected values back to f16.
420define amdgpu_kernel void @select_v2f16(
421; SI-LABEL: select_v2f16:
422; SI:       ; %bb.0: ; %entry
423; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
424; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
425; SI-NEXT:    s_mov_b32 s3, 0xf000
426; SI-NEXT:    s_mov_b32 s2, -1
427; SI-NEXT:    s_mov_b32 s18, s2
428; SI-NEXT:    s_waitcnt lgkmcnt(0)
429; SI-NEXT:    s_mov_b32 s16, s6
430; SI-NEXT:    s_mov_b32 s17, s7
431; SI-NEXT:    s_mov_b32 s19, s3
432; SI-NEXT:    s_mov_b32 s20, s8
433; SI-NEXT:    s_mov_b32 s21, s9
434; SI-NEXT:    s_mov_b32 s22, s2
435; SI-NEXT:    s_mov_b32 s23, s3
436; SI-NEXT:    s_mov_b32 s14, s2
437; SI-NEXT:    s_mov_b32 s15, s3
438; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
439; SI-NEXT:    s_mov_b32 s8, s10
440; SI-NEXT:    s_mov_b32 s9, s11
441; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
442; SI-NEXT:    s_mov_b32 s10, s2
443; SI-NEXT:    s_mov_b32 s11, s3
444; SI-NEXT:    buffer_load_dword v2, off, s[20:23], 0
445; SI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
446; SI-NEXT:    s_mov_b32 s0, s4
447; SI-NEXT:    s_mov_b32 s1, s5
448; SI-NEXT:    s_waitcnt vmcnt(3)
449; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
450; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
451; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
452; SI-NEXT:    s_waitcnt vmcnt(2)
453; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
454; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
455; SI-NEXT:    s_waitcnt vmcnt(1)
456; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
457; SI-NEXT:    s_waitcnt vmcnt(0)
458; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
459; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
460; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
461; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
462; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
463; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
464; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
465; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
466; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v2
467; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
468; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
469; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
470; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
471; SI-NEXT:    v_or_b32_e32 v0, v1, v0
472; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
473; SI-NEXT:    s_endpgm
474;
475; VI-LABEL: select_v2f16:
476; VI:       ; %bb.0: ; %entry
477; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
478; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
479; VI-NEXT:    s_mov_b32 s3, 0xf000
480; VI-NEXT:    s_mov_b32 s2, -1
481; VI-NEXT:    s_mov_b32 s14, s2
482; VI-NEXT:    s_mov_b32 s15, s3
483; VI-NEXT:    s_waitcnt lgkmcnt(0)
484; VI-NEXT:    s_mov_b32 s16, s6
485; VI-NEXT:    s_mov_b32 s17, s7
486; VI-NEXT:    s_mov_b32 s18, s2
487; VI-NEXT:    s_mov_b32 s19, s3
488; VI-NEXT:    s_mov_b32 s20, s8
489; VI-NEXT:    s_mov_b32 s21, s9
490; VI-NEXT:    s_mov_b32 s22, s2
491; VI-NEXT:    s_mov_b32 s23, s3
492; VI-NEXT:    s_mov_b32 s8, s10
493; VI-NEXT:    s_mov_b32 s9, s11
494; VI-NEXT:    s_mov_b32 s10, s2
495; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
496; VI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
497; VI-NEXT:    buffer_load_dword v2, off, s[16:19], 0
498; VI-NEXT:    s_mov_b32 s11, s3
499; VI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
500; VI-NEXT:    s_mov_b32 s0, s4
501; VI-NEXT:    s_mov_b32 s1, s5
502; VI-NEXT:    s_waitcnt vmcnt(3)
503; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
504; VI-NEXT:    s_waitcnt vmcnt(2)
505; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
506; VI-NEXT:    s_waitcnt vmcnt(1)
507; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
508; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v2, v1
509; VI-NEXT:    s_waitcnt vmcnt(0)
510; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
511; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
512; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
513; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
514; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
515; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
516; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
517; VI-NEXT:    s_endpgm
518    <2 x half> addrspace(1)* %r,
519    <2 x half> addrspace(1)* %a,
520    <2 x half> addrspace(1)* %b,
521    <2 x half> addrspace(1)* %c,
522    <2 x half> addrspace(1)* %d) {
523entry:
  ; Non-volatile loads: the expected output issues several loads before the
  ; first s_waitcnt vmcnt and then waits on them with decreasing counts.
524  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
525  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
526  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
527  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  ; Per-element ordered a < b, producing a <2 x i1> mask for the select.
528  %fcmp = fcmp olt <2 x half> %a.val, %b.val
529  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
530  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
531  ret void
532}
533
; Vector select where the left-hand compare operand is the constant vector
; <0xH3800, 0xH3900>, i.e. <0.5, 0.625>: r[i] = (k[i] < b[i]) ? c[i] : d[i].
; The two elements are expected to be handled differently. Element 0 uses the
; inline operand 0.5 directly in v_cmp_lt. Element 1's constant is first
; placed in s2 (0x3f200000, the f32 encoding, on the tahiti run; the raw half
; pattern 0x3900 via s_movk_i32 on the fiji run) and the compare reads s2.
534define amdgpu_kernel void @select_v2f16_imm_a(
535; SI-LABEL: select_v2f16_imm_a:
536; SI:       ; %bb.0: ; %entry
537; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
538; SI-NEXT:    s_mov_b32 s11, 0xf000
539; SI-NEXT:    s_mov_b32 s10, -1
540; SI-NEXT:    s_mov_b32 s14, s10
541; SI-NEXT:    s_mov_b32 s15, s11
542; SI-NEXT:    s_waitcnt lgkmcnt(0)
543; SI-NEXT:    s_mov_b32 s12, s2
544; SI-NEXT:    s_mov_b32 s13, s3
545; SI-NEXT:    s_mov_b32 s16, s4
546; SI-NEXT:    s_mov_b32 s17, s5
547; SI-NEXT:    s_mov_b32 s18, s10
548; SI-NEXT:    s_mov_b32 s19, s11
549; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
550; SI-NEXT:    s_mov_b32 s4, s6
551; SI-NEXT:    s_mov_b32 s5, s7
552; SI-NEXT:    s_mov_b32 s6, s10
553; SI-NEXT:    s_mov_b32 s7, s11
554; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
555; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
556; SI-NEXT:    s_mov_b32 s2, 0x3f200000
557; SI-NEXT:    s_mov_b32 s8, s0
558; SI-NEXT:    s_mov_b32 s9, s1
559; SI-NEXT:    s_waitcnt vmcnt(2)
560; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
561; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
562; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
563; SI-NEXT:    s_waitcnt vmcnt(1)
564; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
565; SI-NEXT:    s_waitcnt vmcnt(0)
566; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
567; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
568; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
569; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
570; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
571; SI-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
572; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
573; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
574; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
575; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
576; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
577; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
578; SI-NEXT:    v_or_b32_e32 v0, v0, v1
579; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
580; SI-NEXT:    s_endpgm
581;
582; VI-LABEL: select_v2f16_imm_a:
583; VI:       ; %bb.0: ; %entry
584; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
585; VI-NEXT:    s_mov_b32 s11, 0xf000
586; VI-NEXT:    s_mov_b32 s10, -1
587; VI-NEXT:    s_mov_b32 s14, s10
588; VI-NEXT:    s_mov_b32 s15, s11
589; VI-NEXT:    s_waitcnt lgkmcnt(0)
590; VI-NEXT:    s_mov_b32 s12, s2
591; VI-NEXT:    s_mov_b32 s13, s3
592; VI-NEXT:    s_mov_b32 s16, s4
593; VI-NEXT:    s_mov_b32 s17, s5
594; VI-NEXT:    s_mov_b32 s18, s10
595; VI-NEXT:    s_mov_b32 s19, s11
596; VI-NEXT:    s_mov_b32 s4, s6
597; VI-NEXT:    s_mov_b32 s5, s7
598; VI-NEXT:    s_mov_b32 s6, s10
599; VI-NEXT:    s_mov_b32 s7, s11
600; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
601; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
602; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
603; VI-NEXT:    s_movk_i32 s2, 0x3900
604; VI-NEXT:    s_mov_b32 s8, s0
605; VI-NEXT:    s_mov_b32 s9, s1
606; VI-NEXT:    s_waitcnt vmcnt(2)
607; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
608; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
609; VI-NEXT:    s_waitcnt vmcnt(0)
610; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
611; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
612; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
613; VI-NEXT:    v_cmp_lt_f16_e32 vcc, s2, v3
614; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
615; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
616; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
617; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
618; VI-NEXT:    s_endpgm
619    <2 x half> addrspace(1)* %r,
620    <2 x half> addrspace(1)* %b,
621    <2 x half> addrspace(1)* %c,
622    <2 x half> addrspace(1)* %d) {
623entry:
624  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
625  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
626  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  ; Constant vector <0.5, 0.625> on the left of the compare.
627  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
628  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
629  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
630  ret void
631}
632
; Vector select where the right-hand compare operand is the constant vector
; <0xH3800, 0xH3900>, i.e. <0.5, 0.625>: r[i] = (a[i] < k[i]) ? c[i] : d[i].
; Both runs expect each per-element compare to be commuted to v_cmp_gt with
; the constant in the first source slot: inline 0.5 for element 0, and s2 for
; element 1 (s2 holds 0x3f200000 on the tahiti run and 0x3900 on the fiji
; run).
633define amdgpu_kernel void @select_v2f16_imm_b(
634; SI-LABEL: select_v2f16_imm_b:
635; SI:       ; %bb.0: ; %entry
636; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
637; SI-NEXT:    s_mov_b32 s11, 0xf000
638; SI-NEXT:    s_mov_b32 s10, -1
639; SI-NEXT:    s_mov_b32 s14, s10
640; SI-NEXT:    s_mov_b32 s15, s11
641; SI-NEXT:    s_waitcnt lgkmcnt(0)
642; SI-NEXT:    s_mov_b32 s12, s2
643; SI-NEXT:    s_mov_b32 s13, s3
644; SI-NEXT:    s_mov_b32 s16, s4
645; SI-NEXT:    s_mov_b32 s17, s5
646; SI-NEXT:    s_mov_b32 s18, s10
647; SI-NEXT:    s_mov_b32 s19, s11
648; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
649; SI-NEXT:    s_mov_b32 s4, s6
650; SI-NEXT:    s_mov_b32 s5, s7
651; SI-NEXT:    s_mov_b32 s6, s10
652; SI-NEXT:    s_mov_b32 s7, s11
653; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
654; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
655; SI-NEXT:    s_mov_b32 s2, 0x3f200000
656; SI-NEXT:    s_mov_b32 s8, s0
657; SI-NEXT:    s_mov_b32 s9, s1
658; SI-NEXT:    s_waitcnt vmcnt(2)
659; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
660; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
661; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
662; SI-NEXT:    s_waitcnt vmcnt(1)
663; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
664; SI-NEXT:    s_waitcnt vmcnt(0)
665; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
666; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
667; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
668; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
669; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
670; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
671; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
672; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
673; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
674; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
675; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
676; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
677; SI-NEXT:    v_or_b32_e32 v0, v0, v1
678; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
679; SI-NEXT:    s_endpgm
680;
681; VI-LABEL: select_v2f16_imm_b:
682; VI:       ; %bb.0: ; %entry
683; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
684; VI-NEXT:    s_mov_b32 s11, 0xf000
685; VI-NEXT:    s_mov_b32 s10, -1
686; VI-NEXT:    s_mov_b32 s14, s10
687; VI-NEXT:    s_mov_b32 s15, s11
688; VI-NEXT:    s_waitcnt lgkmcnt(0)
689; VI-NEXT:    s_mov_b32 s12, s2
690; VI-NEXT:    s_mov_b32 s13, s3
691; VI-NEXT:    s_mov_b32 s16, s4
692; VI-NEXT:    s_mov_b32 s17, s5
693; VI-NEXT:    s_mov_b32 s18, s10
694; VI-NEXT:    s_mov_b32 s19, s11
695; VI-NEXT:    s_mov_b32 s4, s6
696; VI-NEXT:    s_mov_b32 s5, s7
697; VI-NEXT:    s_mov_b32 s6, s10
698; VI-NEXT:    s_mov_b32 s7, s11
699; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
700; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
701; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
702; VI-NEXT:    s_movk_i32 s2, 0x3900
703; VI-NEXT:    s_mov_b32 s8, s0
704; VI-NEXT:    s_mov_b32 s9, s1
705; VI-NEXT:    s_waitcnt vmcnt(2)
706; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
707; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
708; VI-NEXT:    s_waitcnt vmcnt(0)
709; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
710; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
711; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
712; VI-NEXT:    v_cmp_gt_f16_e32 vcc, s2, v3
713; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
714; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
715; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
716; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
717; VI-NEXT:    s_endpgm
718    <2 x half> addrspace(1)* %r,
719    <2 x half> addrspace(1)* %a,
720    <2 x half> addrspace(1)* %c,
721    <2 x half> addrspace(1)* %d) {
722entry:
723  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
724  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
725  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  ; Constant vector <0.5, 0.625> on the right of the compare.
726  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
727  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
728  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
729  ret void
730}
731
; Vector select where the true value is the constant vector
; <0xH3800, 0xH3900>, i.e. <0.5, 0.625>: r[i] = (a[i] < b[i]) ? k[i] : d[i].
; Both runs expect the per-element predicate to be inverted to v_cmp_nlt with
; the constant in the first source slot of v_cndmask. On the tahiti run
; element 0 uses inline 0.5 and element 1 reads v3 (0x3f200000). On the fiji
; run both constants are moved into VGPRs as raw half patterns (v3 = 0x3800,
; v4 = 0x3900).
732define amdgpu_kernel void @select_v2f16_imm_c(
733; SI-LABEL: select_v2f16_imm_c:
734; SI:       ; %bb.0: ; %entry
735; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
736; SI-NEXT:    s_mov_b32 s11, 0xf000
737; SI-NEXT:    s_mov_b32 s10, -1
738; SI-NEXT:    s_mov_b32 s14, s10
739; SI-NEXT:    s_mov_b32 s15, s11
740; SI-NEXT:    s_waitcnt lgkmcnt(0)
741; SI-NEXT:    s_mov_b32 s12, s2
742; SI-NEXT:    s_mov_b32 s13, s3
743; SI-NEXT:    s_mov_b32 s16, s4
744; SI-NEXT:    s_mov_b32 s17, s5
745; SI-NEXT:    s_mov_b32 s18, s10
746; SI-NEXT:    s_mov_b32 s19, s11
747; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
748; SI-NEXT:    s_mov_b32 s4, s6
749; SI-NEXT:    s_mov_b32 s5, s7
750; SI-NEXT:    s_mov_b32 s6, s10
751; SI-NEXT:    s_mov_b32 s7, s11
752; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
753; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
754; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
755; SI-NEXT:    s_mov_b32 s8, s0
756; SI-NEXT:    s_mov_b32 s9, s1
757; SI-NEXT:    s_waitcnt vmcnt(2)
758; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
759; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
760; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
761; SI-NEXT:    s_waitcnt vmcnt(1)
762; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
763; SI-NEXT:    s_waitcnt vmcnt(0)
764; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
765; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
766; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
767; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
768; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
769; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v5
770; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
771; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v4, v1
772; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
773; SI-NEXT:    v_cndmask_b32_e32 v1, 0.5, v2, vcc
774; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
775; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
776; SI-NEXT:    v_or_b32_e32 v0, v1, v0
777; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
778; SI-NEXT:    s_endpgm
779;
780; VI-LABEL: select_v2f16_imm_c:
781; VI:       ; %bb.0: ; %entry
782; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
783; VI-NEXT:    s_mov_b32 s11, 0xf000
784; VI-NEXT:    s_mov_b32 s10, -1
785; VI-NEXT:    s_mov_b32 s18, s10
786; VI-NEXT:    s_mov_b32 s19, s11
787; VI-NEXT:    s_waitcnt lgkmcnt(0)
788; VI-NEXT:    s_mov_b32 s16, s4
789; VI-NEXT:    s_mov_b32 s17, s5
790; VI-NEXT:    s_mov_b32 s14, s10
791; VI-NEXT:    s_mov_b32 s12, s2
792; VI-NEXT:    s_mov_b32 s13, s3
793; VI-NEXT:    s_mov_b32 s15, s11
794; VI-NEXT:    s_mov_b32 s4, s6
795; VI-NEXT:    s_mov_b32 s5, s7
796; VI-NEXT:    s_mov_b32 s6, s10
797; VI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
798; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
799; VI-NEXT:    s_mov_b32 s7, s11
800; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
801; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
802; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
803; VI-NEXT:    s_mov_b32 s8, s0
804; VI-NEXT:    s_mov_b32 s9, s1
805; VI-NEXT:    s_waitcnt vmcnt(2)
806; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
807; VI-NEXT:    s_waitcnt vmcnt(1)
808; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
809; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v1, v0
810; VI-NEXT:    s_waitcnt vmcnt(0)
811; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
812; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
813; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v6, v5
814; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
815; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
816; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
817; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
818; VI-NEXT:    s_endpgm
819    <2 x half> addrspace(1)* %r,
820    <2 x half> addrspace(1)* %a,
821    <2 x half> addrspace(1)* %b,
822    <2 x half> addrspace(1)* %d) {
823entry:
824  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
825  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
826  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
827  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  ; Constant vector <0.5, 0.625> is the value chosen where the compare is true.
828  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
829  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
830  ret void
831}
832
; select_v2f16_imm_d: <2 x half> select whose false operand is an immediate.
;
; The kernel loads <2 x half> values through %a, %b and %c, compares %a.val
; against %b.val with `fcmp olt`, and stores through %r a per-element select
; of %c.val (compare true) or the constant <0xH3800, 0xH3900> (compare false).
;
; The SI checks below convert each half to f32 and compare with
; v_cmp_lt_f32; the VI checks compare the halves directly with v_cmp_lt_f16.
; The SI-/VI- lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script rather than editing by hand.
833define amdgpu_kernel void @select_v2f16_imm_d(
834; SI-LABEL: select_v2f16_imm_d:
835; SI:       ; %bb.0: ; %entry
836; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
837; SI-NEXT:    s_mov_b32 s11, 0xf000
838; SI-NEXT:    s_mov_b32 s10, -1
839; SI-NEXT:    s_mov_b32 s14, s10
840; SI-NEXT:    s_mov_b32 s15, s11
841; SI-NEXT:    s_waitcnt lgkmcnt(0)
842; SI-NEXT:    s_mov_b32 s12, s2
843; SI-NEXT:    s_mov_b32 s13, s3
844; SI-NEXT:    s_mov_b32 s16, s4
845; SI-NEXT:    s_mov_b32 s17, s5
846; SI-NEXT:    s_mov_b32 s18, s10
847; SI-NEXT:    s_mov_b32 s19, s11
848; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
849; SI-NEXT:    s_mov_b32 s4, s6
850; SI-NEXT:    s_mov_b32 s5, s7
851; SI-NEXT:    s_mov_b32 s6, s10
852; SI-NEXT:    s_mov_b32 s7, s11
853; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
854; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
855; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
856; SI-NEXT:    s_mov_b32 s8, s0
857; SI-NEXT:    s_mov_b32 s9, s1
858; SI-NEXT:    s_waitcnt vmcnt(2)
859; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
860; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
861; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
862; SI-NEXT:    s_waitcnt vmcnt(1)
863; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
864; SI-NEXT:    s_waitcnt vmcnt(0)
865; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
866; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
867; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
868; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
869; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
870; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
871; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
872; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
873; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
874; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
875; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
876; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
877; SI-NEXT:    v_or_b32_e32 v0, v0, v1
878; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
879; SI-NEXT:    s_endpgm
880;
881; VI-LABEL: select_v2f16_imm_d:
882; VI:       ; %bb.0: ; %entry
883; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
884; VI-NEXT:    s_mov_b32 s11, 0xf000
885; VI-NEXT:    s_mov_b32 s10, -1
886; VI-NEXT:    s_mov_b32 s18, s10
887; VI-NEXT:    s_mov_b32 s19, s11
888; VI-NEXT:    s_waitcnt lgkmcnt(0)
889; VI-NEXT:    s_mov_b32 s16, s4
890; VI-NEXT:    s_mov_b32 s17, s5
891; VI-NEXT:    s_mov_b32 s14, s10
892; VI-NEXT:    s_mov_b32 s12, s2
893; VI-NEXT:    s_mov_b32 s13, s3
894; VI-NEXT:    s_mov_b32 s15, s11
895; VI-NEXT:    s_mov_b32 s4, s6
896; VI-NEXT:    s_mov_b32 s5, s7
897; VI-NEXT:    s_mov_b32 s6, s10
898; VI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
899; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
900; VI-NEXT:    s_mov_b32 s7, s11
901; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
902; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
903; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
904; VI-NEXT:    s_mov_b32 s8, s0
905; VI-NEXT:    s_mov_b32 s9, s1
906; VI-NEXT:    s_waitcnt vmcnt(2)
907; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
908; VI-NEXT:    s_waitcnt vmcnt(1)
909; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
910; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v1, v0
911; VI-NEXT:    s_waitcnt vmcnt(0)
912; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
913; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
914; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
915; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
916; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
917; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
918; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
919; VI-NEXT:    s_endpgm
; Kernel arguments: %r is the store destination; %a and %b feed the compare;
; %c supplies the value selected when the compare is true.
920    <2 x half> addrspace(1)* %r,
921    <2 x half> addrspace(1)* %a,
922    <2 x half> addrspace(1)* %b,
923    <2 x half> addrspace(1)* %c) {
924entry:
  ; Load the three <2 x half> inputs from global memory (addrspace(1)).
925  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
926  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
927  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  ; Per-element ordered less-than; yields a <2 x i1> mask.
928  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  ; The immediate is the false operand here: 0xH3800 is 0.5 (emitted as the
  ; inline constant 0.5 in the SI checks) and 0xH3900 is 0.625 (materialized
  ; as f32 0x3f200000 in the SI checks, and as 0x3900 in the VI checks).
929  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
930  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
931  ret void
932}
933