; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs  | FileCheck %s -check-prefixes=GCN,SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI

define amdgpu_kernel void @select_f16(
; SI-LABEL: select_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s8
; SI-NEXT:    s_mov_b32 s21, s9
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
; SI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    s_mov_b32 s16, s8
; VI-NEXT:    s_mov_b32 s17, s9
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
; VI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_a(
; SI-LABEL: select_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_b(
; SI-LABEL: select_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_c(
; SI-LABEL: select_f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v3, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v2, 0x3800
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_d(
; SI-LABEL: select_f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v3, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v2, 0x3800
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16(
; SI-LABEL: select_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s8
; SI-NEXT:    s_mov_b32 s21, s9
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
; SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    s_mov_b32 s16, s8
; VI-NEXT:    s_mov_b32 s17, s9
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_a(
; SI-LABEL: select_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s0, 0x3900
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, s0, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_b(
; SI-LABEL: select_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s0, 0x3900
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, s0, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_c(
; SI-LABEL: select_v2f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT:    buffer_load_dword v3, off, s[16:19], 0
; SI-NEXT:    v_mov_b32_e32 v2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v5
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v4, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cndmask_b32_e32 v1, 0.5, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
; VI-NEXT:    buffer_load_dword v4, off, s[12:15], 0
; VI-NEXT:    v_mov_b32_e32 v2, 0x3800
; VI-NEXT:    v_mov_b32_e32 v3, 0x3900
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v4
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_d(
; SI-LABEL: select_v2f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT:    buffer_load_dword v3, off, s[16:19], 0
; SI-NEXT:    v_mov_b32_e32 v2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
; VI-NEXT:    buffer_load_dword v4, off, s[12:15], 0
; VI-NEXT:    v_mov_b32_e32 v2, 0x3800
; VI-NEXT:    v_mov_b32_e32 v3, 0x3900
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v4
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}