1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s
6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s
7
; Declarations of the llvm.minnum intrinsic overloads for half, <2 x half>,
; <3 x half> and <4 x half> operands.
8declare half @llvm.minnum.f16(half %a, half %b)
9declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
10declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b)
11declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
12
; Scalar f16 minnum in an amdgpu_kernel: both operands are read with volatile
; loads from addrspace(1) pointers and the result is stored through %r.
; The expected code for the VI, GFX9, GFX10 and GFX11 prefixes applies
; v_max_f16 x, x to each loaded value before the v_min_f16; the SI prefix
; expects a conversion to f32, a multiply by 1.0 and a v_min_f32 instead.
; NOTE(review): the _ieee suffix presumably refers to IEEE mode being enabled
; for this calling convention (attribute group #0 is defined outside this
; excerpt) - confirm.
13define amdgpu_kernel void @minnum_f16_ieee(
14; SI-LABEL: minnum_f16_ieee:
15; SI:       ; %bb.0: ; %entry
16; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
17; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
18; SI-NEXT:    s_mov_b32 s3, 0xf000
19; SI-NEXT:    s_mov_b32 s2, -1
20; SI-NEXT:    s_mov_b32 s14, s2
21; SI-NEXT:    s_waitcnt lgkmcnt(0)
22; SI-NEXT:    s_mov_b32 s12, s6
23; SI-NEXT:    s_mov_b32 s13, s7
24; SI-NEXT:    s_mov_b32 s15, s3
25; SI-NEXT:    s_mov_b32 s10, s2
26; SI-NEXT:    s_mov_b32 s11, s3
27; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
28; SI-NEXT:    s_waitcnt vmcnt(0)
29; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
30; SI-NEXT:    s_waitcnt vmcnt(0)
31; SI-NEXT:    s_mov_b32 s0, s4
32; SI-NEXT:    s_mov_b32 s1, s5
33; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
34; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
35; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
36; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
37; SI-NEXT:    v_min_f32_e32 v0, v0, v1
38; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
39; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
40; SI-NEXT:    s_endpgm
41;
42; VI-LABEL: minnum_f16_ieee:
43; VI:       ; %bb.0: ; %entry
44; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
45; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
46; VI-NEXT:    s_mov_b32 s3, 0xf000
47; VI-NEXT:    s_mov_b32 s2, -1
48; VI-NEXT:    s_mov_b32 s14, s2
49; VI-NEXT:    s_waitcnt lgkmcnt(0)
50; VI-NEXT:    s_mov_b32 s12, s6
51; VI-NEXT:    s_mov_b32 s13, s7
52; VI-NEXT:    s_mov_b32 s15, s3
53; VI-NEXT:    s_mov_b32 s10, s2
54; VI-NEXT:    s_mov_b32 s11, s3
55; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
56; VI-NEXT:    s_waitcnt vmcnt(0)
57; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
58; VI-NEXT:    s_waitcnt vmcnt(0)
59; VI-NEXT:    s_mov_b32 s0, s4
60; VI-NEXT:    s_mov_b32 s1, s5
61; VI-NEXT:    v_max_f16_e32 v0, v0, v0
62; VI-NEXT:    v_max_f16_e32 v1, v1, v1
63; VI-NEXT:    v_min_f16_e32 v0, v0, v1
64; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
65; VI-NEXT:    s_endpgm
66;
67; GFX9-LABEL: minnum_f16_ieee:
68; GFX9:       ; %bb.0: ; %entry
69; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
70; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
71; GFX9-NEXT:    s_mov_b32 s3, 0xf000
72; GFX9-NEXT:    s_mov_b32 s2, -1
73; GFX9-NEXT:    s_mov_b32 s14, s2
74; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
75; GFX9-NEXT:    s_mov_b32 s12, s6
76; GFX9-NEXT:    s_mov_b32 s13, s7
77; GFX9-NEXT:    s_mov_b32 s15, s3
78; GFX9-NEXT:    s_mov_b32 s10, s2
79; GFX9-NEXT:    s_mov_b32 s11, s3
80; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
81; GFX9-NEXT:    s_waitcnt vmcnt(0)
82; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
83; GFX9-NEXT:    s_waitcnt vmcnt(0)
84; GFX9-NEXT:    s_mov_b32 s0, s4
85; GFX9-NEXT:    s_mov_b32 s1, s5
86; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
87; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
88; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
89; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
90; GFX9-NEXT:    s_endpgm
91;
92; GFX10-LABEL: minnum_f16_ieee:
93; GFX10:       ; %bb.0: ; %entry
94; GFX10-NEXT:    s_clause 0x1
95; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
96; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
97; GFX10-NEXT:    s_mov_b32 s2, -1
98; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
99; GFX10-NEXT:    s_mov_b32 s14, s2
100; GFX10-NEXT:    s_mov_b32 s15, s3
101; GFX10-NEXT:    s_mov_b32 s10, s2
102; GFX10-NEXT:    s_mov_b32 s11, s3
103; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
104; GFX10-NEXT:    s_mov_b32 s12, s6
105; GFX10-NEXT:    s_mov_b32 s13, s7
106; GFX10-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
107; GFX10-NEXT:    s_waitcnt vmcnt(0)
108; GFX10-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
109; GFX10-NEXT:    s_waitcnt vmcnt(0)
110; GFX10-NEXT:    s_mov_b32 s0, s4
111; GFX10-NEXT:    s_mov_b32 s1, s5
112; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
113; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
114; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
115; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
116; GFX10-NEXT:    s_endpgm
117;
118; GFX11-LABEL: minnum_f16_ieee:
119; GFX11:       ; %bb.0: ; %entry
120; GFX11-NEXT:    s_clause 0x1
121; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
122; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
123; GFX11-NEXT:    s_mov_b32 s10, -1
124; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
125; GFX11-NEXT:    s_mov_b32 s14, s10
126; GFX11-NEXT:    s_mov_b32 s15, s11
127; GFX11-NEXT:    s_mov_b32 s2, s10
128; GFX11-NEXT:    s_mov_b32 s3, s11
129; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
130; GFX11-NEXT:    s_mov_b32 s12, s6
131; GFX11-NEXT:    s_mov_b32 s13, s7
132; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
133; GFX11-NEXT:    s_waitcnt vmcnt(0)
134; GFX11-NEXT:    buffer_load_u16 v1, off, s[0:3], 0 glc dlc
135; GFX11-NEXT:    s_waitcnt vmcnt(0)
136; GFX11-NEXT:    s_mov_b32 s8, s4
137; GFX11-NEXT:    s_mov_b32 s9, s5
138; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
139; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
140; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
141; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
142; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
143; GFX11-NEXT:    s_endpgm
144    half addrspace(1)* %r,
145    half addrspace(1)* %a,
146    half addrspace(1)* %b) #0 {
147entry:
148  %a.val = load volatile half, half addrspace(1)* %a
149  %b.val = load volatile half, half addrspace(1)* %b
150  %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
151  store half %r.val, half addrspace(1)* %r
152  ret void
153}
154
; Scalar f16 minnum in an amdgpu_ps function that takes both operands as
; arguments and returns the result directly.
; For the VI, GFX9 and GFX10PLUS prefixes the expected body is a single
; v_min_f16 with no preceding v_max_f16 on the inputs; the SI prefix expects
; f16<->f32 conversions around a single v_min_f32.
; NOTE(review): the _no_ieee suffix presumably refers to IEEE mode being off
; for this calling convention - confirm.
155define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 {
156; SI-LABEL: minnum_f16_no_ieee:
157; SI:       ; %bb.0:
158; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
159; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
160; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
161; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
162; SI-NEXT:    v_min_f32_e32 v0, v0, v1
163; SI-NEXT:    ; return to shader part epilog
164;
165; VI-LABEL: minnum_f16_no_ieee:
166; VI:       ; %bb.0:
167; VI-NEXT:    v_min_f16_e32 v0, v0, v1
168; VI-NEXT:    ; return to shader part epilog
169;
170; GFX9-LABEL: minnum_f16_no_ieee:
171; GFX9:       ; %bb.0:
172; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
173; GFX9-NEXT:    ; return to shader part epilog
174;
175; GFX10PLUS-LABEL: minnum_f16_no_ieee:
176; GFX10PLUS:       ; %bb.0:
177; GFX10PLUS-NEXT:    v_min_f16_e32 v0, v0, v1
178; GFX10PLUS-NEXT:    ; return to shader part epilog
179  %r.val = call half @llvm.minnum.f16(half %a, half %b)
180  ret half %r.val
181}
182
; Scalar f16 minnum whose first operand is the constant 3.0; the second
; operand is loaded (non-volatile) from %b and the result is stored to %r.
; The expected v_min_f16 takes the constant as the literal 0x4200 (the f16
; encoding of 3.0) in its first source operand; the SI prefix expects the
; f32 literal 0x40400000 on v_min_f32 instead.
183define amdgpu_kernel void @minnum_f16_imm_a(
184; SI-LABEL: minnum_f16_imm_a:
185; SI:       ; %bb.0: ; %entry
186; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
187; SI-NEXT:    s_mov_b32 s7, 0xf000
188; SI-NEXT:    s_mov_b32 s6, -1
189; SI-NEXT:    s_mov_b32 s10, s6
190; SI-NEXT:    s_mov_b32 s11, s7
191; SI-NEXT:    s_waitcnt lgkmcnt(0)
192; SI-NEXT:    s_mov_b32 s8, s2
193; SI-NEXT:    s_mov_b32 s9, s3
194; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
195; SI-NEXT:    s_mov_b32 s4, s0
196; SI-NEXT:    s_mov_b32 s5, s1
197; SI-NEXT:    s_waitcnt vmcnt(0)
198; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
199; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
200; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
201; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
202; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
203; SI-NEXT:    s_endpgm
204;
205; VI-LABEL: minnum_f16_imm_a:
206; VI:       ; %bb.0: ; %entry
207; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
208; VI-NEXT:    s_mov_b32 s7, 0xf000
209; VI-NEXT:    s_mov_b32 s6, -1
210; VI-NEXT:    s_mov_b32 s10, s6
211; VI-NEXT:    s_mov_b32 s11, s7
212; VI-NEXT:    s_waitcnt lgkmcnt(0)
213; VI-NEXT:    s_mov_b32 s8, s2
214; VI-NEXT:    s_mov_b32 s9, s3
215; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
216; VI-NEXT:    s_mov_b32 s4, s0
217; VI-NEXT:    s_mov_b32 s5, s1
218; VI-NEXT:    s_waitcnt vmcnt(0)
219; VI-NEXT:    v_max_f16_e32 v0, v0, v0
220; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
221; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
222; VI-NEXT:    s_endpgm
223;
224; GFX9-LABEL: minnum_f16_imm_a:
225; GFX9:       ; %bb.0: ; %entry
226; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
227; GFX9-NEXT:    s_mov_b32 s7, 0xf000
228; GFX9-NEXT:    s_mov_b32 s6, -1
229; GFX9-NEXT:    s_mov_b32 s10, s6
230; GFX9-NEXT:    s_mov_b32 s11, s7
231; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
232; GFX9-NEXT:    s_mov_b32 s8, s2
233; GFX9-NEXT:    s_mov_b32 s9, s3
234; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
235; GFX9-NEXT:    s_mov_b32 s4, s0
236; GFX9-NEXT:    s_mov_b32 s5, s1
237; GFX9-NEXT:    s_waitcnt vmcnt(0)
238; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
239; GFX9-NEXT:    v_min_f16_e32 v0, 0x4200, v0
240; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
241; GFX9-NEXT:    s_endpgm
242;
243; GFX10-LABEL: minnum_f16_imm_a:
244; GFX10:       ; %bb.0: ; %entry
245; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
246; GFX10-NEXT:    s_mov_b32 s6, -1
247; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
248; GFX10-NEXT:    s_mov_b32 s10, s6
249; GFX10-NEXT:    s_mov_b32 s11, s7
250; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
251; GFX10-NEXT:    s_mov_b32 s8, s2
252; GFX10-NEXT:    s_mov_b32 s9, s3
253; GFX10-NEXT:    s_mov_b32 s4, s0
254; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
255; GFX10-NEXT:    s_mov_b32 s5, s1
256; GFX10-NEXT:    s_waitcnt vmcnt(0)
257; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
258; GFX10-NEXT:    v_min_f16_e32 v0, 0x4200, v0
259; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
260; GFX10-NEXT:    s_endpgm
261;
262; GFX11-LABEL: minnum_f16_imm_a:
263; GFX11:       ; %bb.0: ; %entry
264; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
265; GFX11-NEXT:    s_mov_b32 s6, -1
266; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
267; GFX11-NEXT:    s_mov_b32 s10, s6
268; GFX11-NEXT:    s_mov_b32 s11, s7
269; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
270; GFX11-NEXT:    s_mov_b32 s8, s2
271; GFX11-NEXT:    s_mov_b32 s9, s3
272; GFX11-NEXT:    s_mov_b32 s4, s0
273; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
274; GFX11-NEXT:    s_mov_b32 s5, s1
275; GFX11-NEXT:    s_waitcnt vmcnt(0)
276; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
277; GFX11-NEXT:    v_min_f16_e32 v0, 0x4200, v0
278; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
279; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
280; GFX11-NEXT:    s_endpgm
281    half addrspace(1)* %r,
282    half addrspace(1)* %b) #0 {
283entry:
284  %b.val = load half, half addrspace(1)* %b
285  %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val)
286  store half %r.val, half addrspace(1)* %r
287  ret void
288}
289
; Scalar f16 minnum whose second operand is the constant 4.0; the first
; operand is loaded (non-volatile) from %a and the result is stored to %r.
; The expected v_min_f16 (v_min_f32 for the SI prefix) carries the constant
; as the operand 4.0 in its first source position, even though the IR passes
; the constant as the second argument.
290define amdgpu_kernel void @minnum_f16_imm_b(
291; SI-LABEL: minnum_f16_imm_b:
292; SI:       ; %bb.0: ; %entry
293; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
294; SI-NEXT:    s_mov_b32 s7, 0xf000
295; SI-NEXT:    s_mov_b32 s6, -1
296; SI-NEXT:    s_mov_b32 s10, s6
297; SI-NEXT:    s_mov_b32 s11, s7
298; SI-NEXT:    s_waitcnt lgkmcnt(0)
299; SI-NEXT:    s_mov_b32 s8, s2
300; SI-NEXT:    s_mov_b32 s9, s3
301; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
302; SI-NEXT:    s_mov_b32 s4, s0
303; SI-NEXT:    s_mov_b32 s5, s1
304; SI-NEXT:    s_waitcnt vmcnt(0)
305; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
306; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
307; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
308; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
309; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
310; SI-NEXT:    s_endpgm
311;
312; VI-LABEL: minnum_f16_imm_b:
313; VI:       ; %bb.0: ; %entry
314; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
315; VI-NEXT:    s_mov_b32 s7, 0xf000
316; VI-NEXT:    s_mov_b32 s6, -1
317; VI-NEXT:    s_mov_b32 s10, s6
318; VI-NEXT:    s_mov_b32 s11, s7
319; VI-NEXT:    s_waitcnt lgkmcnt(0)
320; VI-NEXT:    s_mov_b32 s8, s2
321; VI-NEXT:    s_mov_b32 s9, s3
322; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
323; VI-NEXT:    s_mov_b32 s4, s0
324; VI-NEXT:    s_mov_b32 s5, s1
325; VI-NEXT:    s_waitcnt vmcnt(0)
326; VI-NEXT:    v_max_f16_e32 v0, v0, v0
327; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
328; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
329; VI-NEXT:    s_endpgm
330;
331; GFX9-LABEL: minnum_f16_imm_b:
332; GFX9:       ; %bb.0: ; %entry
333; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
334; GFX9-NEXT:    s_mov_b32 s7, 0xf000
335; GFX9-NEXT:    s_mov_b32 s6, -1
336; GFX9-NEXT:    s_mov_b32 s10, s6
337; GFX9-NEXT:    s_mov_b32 s11, s7
338; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
339; GFX9-NEXT:    s_mov_b32 s8, s2
340; GFX9-NEXT:    s_mov_b32 s9, s3
341; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
342; GFX9-NEXT:    s_mov_b32 s4, s0
343; GFX9-NEXT:    s_mov_b32 s5, s1
344; GFX9-NEXT:    s_waitcnt vmcnt(0)
345; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
346; GFX9-NEXT:    v_min_f16_e32 v0, 4.0, v0
347; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
348; GFX9-NEXT:    s_endpgm
349;
350; GFX10-LABEL: minnum_f16_imm_b:
351; GFX10:       ; %bb.0: ; %entry
352; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
353; GFX10-NEXT:    s_mov_b32 s6, -1
354; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
355; GFX10-NEXT:    s_mov_b32 s10, s6
356; GFX10-NEXT:    s_mov_b32 s11, s7
357; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
358; GFX10-NEXT:    s_mov_b32 s8, s2
359; GFX10-NEXT:    s_mov_b32 s9, s3
360; GFX10-NEXT:    s_mov_b32 s4, s0
361; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
362; GFX10-NEXT:    s_mov_b32 s5, s1
363; GFX10-NEXT:    s_waitcnt vmcnt(0)
364; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
365; GFX10-NEXT:    v_min_f16_e32 v0, 4.0, v0
366; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
367; GFX10-NEXT:    s_endpgm
368;
369; GFX11-LABEL: minnum_f16_imm_b:
370; GFX11:       ; %bb.0: ; %entry
371; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
372; GFX11-NEXT:    s_mov_b32 s6, -1
373; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
374; GFX11-NEXT:    s_mov_b32 s10, s6
375; GFX11-NEXT:    s_mov_b32 s11, s7
376; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
377; GFX11-NEXT:    s_mov_b32 s8, s2
378; GFX11-NEXT:    s_mov_b32 s9, s3
379; GFX11-NEXT:    s_mov_b32 s4, s0
380; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
381; GFX11-NEXT:    s_mov_b32 s5, s1
382; GFX11-NEXT:    s_waitcnt vmcnt(0)
383; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
384; GFX11-NEXT:    v_min_f16_e32 v0, 4.0, v0
385; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
386; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
387; GFX11-NEXT:    s_endpgm
388    half addrspace(1)* %r,
389    half addrspace(1)* %a) #0 {
390entry:
391  %a.val = load half, half addrspace(1)* %a
392  %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0)
393  store half %r.val, half addrspace(1)* %r
394  ret void
395}
396
; <2 x half> minnum in an amdgpu_kernel: both vectors are loaded
; (non-volatile) from addrspace(1) pointers and the result is stored to %r.
; Expected lowering per check prefix:
;  - SI prefix   - each lane is converted to f32, multiplied by 1.0, combined
;                  with v_min_f32, converted back and packed with shift + or.
;  - VI prefix   - per-lane v_max_f16 x, x, then v_min_f16 for the low lane
;                  and v_min_f16_sdwa (dst_sel:WORD_1) for the high lane,
;                  merged with v_or_b32.
;  - GFX9, GFX10 and GFX11 prefixes - packed v_pk_max_f16 x, x on each input
;                  followed by a single v_pk_min_f16.
397define amdgpu_kernel void @minnum_v2f16_ieee(
398; SI-LABEL: minnum_v2f16_ieee:
399; SI:       ; %bb.0: ; %entry
400; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
401; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
402; SI-NEXT:    s_waitcnt lgkmcnt(0)
403; SI-NEXT:    s_load_dword s2, s[6:7], 0x0
404; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
405; SI-NEXT:    s_mov_b32 s7, 0xf000
406; SI-NEXT:    s_mov_b32 s6, -1
407; SI-NEXT:    s_waitcnt lgkmcnt(0)
408; SI-NEXT:    s_lshr_b32 s1, s2, 16
409; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
410; SI-NEXT:    s_lshr_b32 s0, s0, 16
411; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
412; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
413; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
414; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
415; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
416; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
417; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
418; SI-NEXT:    v_min_f32_e32 v2, v3, v2
419; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
420; SI-NEXT:    v_min_f32_e32 v0, v0, v1
421; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
422; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
423; SI-NEXT:    v_or_b32_e32 v0, v0, v1
424; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
425; SI-NEXT:    s_endpgm
426;
427; VI-LABEL: minnum_v2f16_ieee:
428; VI:       ; %bb.0: ; %entry
429; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
430; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
431; VI-NEXT:    s_mov_b32 s7, 0xf000
432; VI-NEXT:    s_mov_b32 s6, -1
433; VI-NEXT:    s_waitcnt lgkmcnt(0)
434; VI-NEXT:    s_load_dword s8, s[4:5], 0x0
435; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
436; VI-NEXT:    s_mov_b32 s4, s0
437; VI-NEXT:    s_mov_b32 s5, s1
438; VI-NEXT:    s_waitcnt lgkmcnt(0)
439; VI-NEXT:    v_max_f16_e64 v0, s8, s8
440; VI-NEXT:    v_max_f16_e64 v1, s2, s2
441; VI-NEXT:    s_lshr_b32 s0, s8, 16
442; VI-NEXT:    v_min_f16_e32 v0, v1, v0
443; VI-NEXT:    v_max_f16_e64 v1, s0, s0
444; VI-NEXT:    s_lshr_b32 s0, s2, 16
445; VI-NEXT:    v_max_f16_e64 v2, s0, s0
446; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
447; VI-NEXT:    v_or_b32_e32 v0, v0, v1
448; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
449; VI-NEXT:    s_endpgm
450;
451; GFX9-LABEL: minnum_v2f16_ieee:
452; GFX9:       ; %bb.0: ; %entry
453; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
454; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
455; GFX9-NEXT:    s_mov_b32 s3, 0xf000
456; GFX9-NEXT:    s_mov_b32 s2, -1
457; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
458; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
459; GFX9-NEXT:    s_load_dword s11, s[6:7], 0x0
460; GFX9-NEXT:    s_mov_b32 s0, s4
461; GFX9-NEXT:    s_mov_b32 s1, s5
462; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
463; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
464; GFX9-NEXT:    v_pk_max_f16 v1, s11, s11
465; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
466; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
467; GFX9-NEXT:    s_endpgm
468;
469; GFX10-LABEL: minnum_v2f16_ieee:
470; GFX10:       ; %bb.0: ; %entry
471; GFX10-NEXT:    s_clause 0x1
472; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
473; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
474; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
475; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
476; GFX10-NEXT:    s_load_dword s1, s[6:7], 0x0
477; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
478; GFX10-NEXT:    s_mov_b32 s6, -1
479; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
480; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
481; GFX10-NEXT:    v_pk_max_f16 v1, s1, s1
482; GFX10-NEXT:    v_pk_min_f16 v0, v1, v0
483; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
484; GFX10-NEXT:    s_endpgm
485;
486; GFX11-LABEL: minnum_v2f16_ieee:
487; GFX11:       ; %bb.0: ; %entry
488; GFX11-NEXT:    s_clause 0x1
489; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
490; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
491; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
492; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0
493; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
494; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
495; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
496; GFX11-NEXT:    v_pk_max_f16 v0, s4, s4
497; GFX11-NEXT:    v_pk_max_f16 v1, s2, s2
498; GFX11-NEXT:    s_mov_b32 s2, -1
499; GFX11-NEXT:    v_pk_min_f16 v0, v1, v0
500; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
501; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
502; GFX11-NEXT:    s_endpgm
503    <2 x half> addrspace(1)* %r,
504    <2 x half> addrspace(1)* %a,
505    <2 x half> addrspace(1)* %b) #0 {
506entry:
507  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
508  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
509  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
510  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
511  ret void
512}
513
; <2 x half> minnum in an amdgpu_ps function that takes both vectors as
; arguments and returns the result directly.
; The GFX9 and GFX10PLUS prefixes expect a single v_pk_min_f16 with no
; preceding v_pk_max_f16; the VI prefix expects v_min_f16_sdwa for the high
; lane plus v_min_f16 for the low lane, merged with v_or_b32; the SI prefix
; expects f16<->f32 conversions around two v_min_f32 instructions.
514define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 {
515; SI-LABEL: minnum_v2f16_no_ieee:
516; SI:       ; %bb.0:
517; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
518; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
519; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
520; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
521; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
522; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
523; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
524; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
525; SI-NEXT:    v_min_f32_e32 v0, v0, v2
526; SI-NEXT:    v_min_f32_e32 v1, v1, v3
527; SI-NEXT:    ; return to shader part epilog
528;
529; VI-LABEL: minnum_v2f16_no_ieee:
530; VI:       ; %bb.0:
531; VI-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
532; VI-NEXT:    v_min_f16_e32 v0, v0, v1
533; VI-NEXT:    v_or_b32_e32 v0, v0, v2
534; VI-NEXT:    ; return to shader part epilog
535;
536; GFX9-LABEL: minnum_v2f16_no_ieee:
537; GFX9:       ; %bb.0:
538; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
539; GFX9-NEXT:    ; return to shader part epilog
540;
541; GFX10PLUS-LABEL: minnum_v2f16_no_ieee:
542; GFX10PLUS:       ; %bb.0:
543; GFX10PLUS-NEXT:    v_pk_min_f16 v0, v0, v1
544; GFX10PLUS-NEXT:    ; return to shader part epilog
545  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
546  ret <2 x half> %r.val
547}
548
; <2 x half> minnum whose first operand is the constant vector <3.0, 4.0>;
; the second operand is loaded from %b and the result is stored to %r.
; Expected handling of the constant per check prefix:
;  - SI prefix   - per-lane v_min_f32 with 0x40400000 (f32 3.0) for the low
;                  lane and the operand 4.0 for the high lane.
;  - VI prefix   - 0x4200 literal on the low-lane v_min_f16; 0x4400 is moved
;                  into v2 for the high-lane v_min_f16_sdwa.
;  - GFX9 prefix - packed value 0x44004200 is first moved into s4.
;  - GFX10 and GFX11 prefixes - 0x44004200 is used directly as a literal
;                  operand of v_pk_min_f16.
549define amdgpu_kernel void @minnum_v2f16_imm_a(
550; SI-LABEL: minnum_v2f16_imm_a:
551; SI:       ; %bb.0: ; %entry
552; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
553; SI-NEXT:    s_waitcnt lgkmcnt(0)
554; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
555; SI-NEXT:    s_mov_b32 s3, 0xf000
556; SI-NEXT:    s_waitcnt lgkmcnt(0)
557; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
558; SI-NEXT:    s_lshr_b32 s2, s2, 16
559; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
560; SI-NEXT:    s_mov_b32 s2, -1
561; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
562; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
563; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
564; SI-NEXT:    v_min_f32_e32 v1, 4.0, v1
565; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
566; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
567; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
568; SI-NEXT:    v_or_b32_e32 v0, v0, v1
569; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
570; SI-NEXT:    s_endpgm
571;
572; VI-LABEL: minnum_v2f16_imm_a:
573; VI:       ; %bb.0: ; %entry
574; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
575; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
576; VI-NEXT:    s_waitcnt lgkmcnt(0)
577; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
578; VI-NEXT:    s_mov_b32 s3, 0xf000
579; VI-NEXT:    s_mov_b32 s2, -1
580; VI-NEXT:    s_waitcnt lgkmcnt(0)
581; VI-NEXT:    v_max_f16_e64 v0, s4, s4
582; VI-NEXT:    s_lshr_b32 s4, s4, 16
583; VI-NEXT:    v_max_f16_e64 v1, s4, s4
584; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
585; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
586; VI-NEXT:    v_or_b32_e32 v0, v0, v1
587; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
588; VI-NEXT:    s_endpgm
589;
590; GFX9-LABEL: minnum_v2f16_imm_a:
591; GFX9:       ; %bb.0: ; %entry
592; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
593; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
594; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
595; GFX9-NEXT:    s_mov_b32 s3, 0xf000
596; GFX9-NEXT:    s_mov_b32 s2, -1
597; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
598; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
599; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
600; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
601; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
602; GFX9-NEXT:    s_endpgm
603;
604; GFX10-LABEL: minnum_v2f16_imm_a:
605; GFX10:       ; %bb.0: ; %entry
606; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
607; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
608; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
609; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
610; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
611; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
612; GFX10-NEXT:    s_mov_b32 s2, -1
613; GFX10-NEXT:    v_pk_min_f16 v0, 0x44004200, v0
614; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
615; GFX10-NEXT:    s_endpgm
616;
617; GFX11-LABEL: minnum_v2f16_imm_a:
618; GFX11:       ; %bb.0: ; %entry
619; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
620; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
621; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
622; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
623; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
624; GFX11-NEXT:    v_pk_max_f16 v0, s2, s2
625; GFX11-NEXT:    s_mov_b32 s2, -1
626; GFX11-NEXT:    v_pk_min_f16 v0, 0x44004200, v0
627; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
628; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
629; GFX11-NEXT:    s_endpgm
630    <2 x half> addrspace(1)* %r,
631    <2 x half> addrspace(1)* %b) #0 {
632entry:
633  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
634  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
635  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
636  ret void
637}
638
; <2 x half> minnum whose second operand is the constant vector <4.0, 3.0>;
; the first operand is loaded from %a and the result is stored to %r.
; Mirror of minnum_v2f16_imm_a with the lane constants swapped: the expected
; packed value is 0x42004400 (moved into s4 for the GFX9 prefix, a direct
; literal for the GFX10 and GFX11 prefixes). The VI prefix expects the
; operand 4.0 on the low-lane v_min_f16 and 0x4200 moved into v2 for the
; high-lane v_min_f16_sdwa; the SI prefix expects 4.0 and 0x40400000 on the
; two v_min_f32 instructions.
639define amdgpu_kernel void @minnum_v2f16_imm_b(
640; SI-LABEL: minnum_v2f16_imm_b:
641; SI:       ; %bb.0: ; %entry
642; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
643; SI-NEXT:    s_waitcnt lgkmcnt(0)
644; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
645; SI-NEXT:    s_mov_b32 s3, 0xf000
646; SI-NEXT:    s_waitcnt lgkmcnt(0)
647; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
648; SI-NEXT:    s_lshr_b32 s2, s2, 16
649; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
650; SI-NEXT:    s_mov_b32 s2, -1
651; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
652; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
653; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
654; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
655; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
656; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
657; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
658; SI-NEXT:    v_or_b32_e32 v0, v0, v1
659; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
660; SI-NEXT:    s_endpgm
661;
662; VI-LABEL: minnum_v2f16_imm_b:
663; VI:       ; %bb.0: ; %entry
664; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
665; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
666; VI-NEXT:    s_waitcnt lgkmcnt(0)
667; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
668; VI-NEXT:    s_mov_b32 s3, 0xf000
669; VI-NEXT:    s_mov_b32 s2, -1
670; VI-NEXT:    s_waitcnt lgkmcnt(0)
671; VI-NEXT:    v_max_f16_e64 v0, s4, s4
672; VI-NEXT:    s_lshr_b32 s4, s4, 16
673; VI-NEXT:    v_max_f16_e64 v1, s4, s4
674; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
675; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
676; VI-NEXT:    v_or_b32_e32 v0, v0, v1
677; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
678; VI-NEXT:    s_endpgm
679;
680; GFX9-LABEL: minnum_v2f16_imm_b:
681; GFX9:       ; %bb.0: ; %entry
682; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
683; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
684; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
685; GFX9-NEXT:    s_mov_b32 s3, 0xf000
686; GFX9-NEXT:    s_mov_b32 s2, -1
687; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
688; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
689; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
690; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
691; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
692; GFX9-NEXT:    s_endpgm
693;
694; GFX10-LABEL: minnum_v2f16_imm_b:
695; GFX10:       ; %bb.0: ; %entry
696; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
697; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
698; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
699; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
700; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
701; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
702; GFX10-NEXT:    s_mov_b32 s2, -1
703; GFX10-NEXT:    v_pk_min_f16 v0, 0x42004400, v0
704; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
705; GFX10-NEXT:    s_endpgm
706;
707; GFX11-LABEL: minnum_v2f16_imm_b:
708; GFX11:       ; %bb.0: ; %entry
709; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
710; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
711; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
712; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
713; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
714; GFX11-NEXT:    v_pk_max_f16 v0, s2, s2
715; GFX11-NEXT:    s_mov_b32 s2, -1
716; GFX11-NEXT:    v_pk_min_f16 v0, 0x42004400, v0
717; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
718; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
719; GFX11-NEXT:    s_endpgm
720    <2 x half> addrspace(1)* %r,
721    <2 x half> addrspace(1)* %a) #0 {
722entry:
723  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
724  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
725  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
726  ret void
727}
728
729; FIXME: Scalarize with undef half
730define amdgpu_kernel void @minnum_v3f16(
731; SI-LABEL: minnum_v3f16:
732; SI:       ; %bb.0: ; %entry
733; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
734; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
735; SI-NEXT:    s_waitcnt lgkmcnt(0)
736; SI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
737; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
738; SI-NEXT:    s_mov_b32 s7, 0xf000
739; SI-NEXT:    s_mov_b32 s6, -1
740; SI-NEXT:    s_waitcnt lgkmcnt(0)
741; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
742; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
743; SI-NEXT:    s_lshr_b32 s2, s2, 16
744; SI-NEXT:    s_lshr_b32 s3, s0, 16
745; SI-NEXT:    v_cvt_f32_f16_e32 v2, s3
746; SI-NEXT:    v_cvt_f32_f16_e32 v3, s2
747; SI-NEXT:    v_cvt_f32_f16_e32 v5, s0
748; SI-NEXT:    v_cvt_f32_f16_e32 v4, s1
749; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
750; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
751; SI-NEXT:    v_min_f32_e32 v2, v3, v2
752; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
753; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
754; SI-NEXT:    v_min_f32_e32 v1, v1, v3
755; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
756; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
757; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
758; SI-NEXT:    v_min_f32_e32 v0, v0, v3
759; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
760; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
761; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
762; SI-NEXT:    v_or_b32_e32 v1, v1, v2
763; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
764; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
765; SI-NEXT:    s_endpgm
766;
767; VI-LABEL: minnum_v3f16:
768; VI:       ; %bb.0: ; %entry
769; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
770; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
771; VI-NEXT:    s_mov_b32 s7, 0xf000
772; VI-NEXT:    s_mov_b32 s6, -1
773; VI-NEXT:    s_waitcnt lgkmcnt(0)
774; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
775; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
776; VI-NEXT:    s_mov_b32 s4, s0
777; VI-NEXT:    s_mov_b32 s5, s1
778; VI-NEXT:    s_waitcnt lgkmcnt(0)
779; VI-NEXT:    v_max_f16_e64 v0, s8, s8
780; VI-NEXT:    v_max_f16_e64 v1, s2, s2
781; VI-NEXT:    s_lshr_b32 s0, s8, 16
782; VI-NEXT:    v_min_f16_e32 v0, v1, v0
783; VI-NEXT:    v_max_f16_e64 v1, s0, s0
784; VI-NEXT:    s_lshr_b32 s0, s2, 16
785; VI-NEXT:    v_max_f16_e64 v2, s0, s0
786; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
787; VI-NEXT:    v_or_b32_e32 v0, v0, v1
788; VI-NEXT:    v_max_f16_e64 v1, s9, s9
789; VI-NEXT:    v_max_f16_e64 v2, s3, s3
790; VI-NEXT:    v_min_f16_e32 v1, v2, v1
791; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
792; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
793; VI-NEXT:    s_endpgm
794;
795; GFX9-LABEL: minnum_v3f16:
796; GFX9:       ; %bb.0: ; %entry
797; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
798; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
799; GFX9-NEXT:    s_mov_b32 s3, 0xf000
800; GFX9-NEXT:    s_mov_b32 s2, -1
801; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
802; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
803; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[6:7], 0x0
804; GFX9-NEXT:    s_mov_b32 s0, s4
805; GFX9-NEXT:    s_mov_b32 s1, s5
806; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
807; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
808; GFX9-NEXT:    v_pk_max_f16 v1, s12, s12
809; GFX9-NEXT:    v_pk_max_f16 v2, s11, s11
810; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
811; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
812; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2
813; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
814; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
815; GFX9-NEXT:    s_endpgm
816;
817; GFX10-LABEL: minnum_v3f16:
818; GFX10:       ; %bb.0: ; %entry
819; GFX10-NEXT:    s_clause 0x1
820; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
821; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
822; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
824; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
825; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
826; GFX10-NEXT:    s_mov_b32 s6, -1
827; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
828; GFX10-NEXT:    v_pk_max_f16 v1, s1, s1
829; GFX10-NEXT:    v_pk_max_f16 v2, s9, s9
830; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
831; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
832; GFX10-NEXT:    v_pk_min_f16 v1, v2, v1
833; GFX10-NEXT:    v_pk_min_f16 v0, v3, v0
834; GFX10-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
835; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
836; GFX10-NEXT:    s_endpgm
837;
838; GFX11-LABEL: minnum_v3f16:
839; GFX11:       ; %bb.0: ; %entry
840; GFX11-NEXT:    s_clause 0x1
841; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
842; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
843; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
844; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
845; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
846; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
847; GFX11-NEXT:    v_pk_max_f16 v1, s5, s5
848; GFX11-NEXT:    v_pk_max_f16 v2, s3, s3
849; GFX11-NEXT:    v_pk_max_f16 v0, s4, s4
850; GFX11-NEXT:    v_pk_max_f16 v3, s2, s2
851; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
852; GFX11-NEXT:    s_mov_b32 s2, -1
853; GFX11-NEXT:    v_pk_min_f16 v1, v2, v1
854; GFX11-NEXT:    v_pk_min_f16 v0, v3, v0
855; GFX11-NEXT:    s_clause 0x1
856; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 offset:4
857; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
858; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
859; GFX11-NEXT:    s_endpgm
860    <3 x half> addrspace(1)* %r,
861    <3 x half> addrspace(1)* %a,
862    <3 x half> addrspace(1)* %b) #0 {
863entry:
864  %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
865  %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
866  %r.val = call <3 x half> @llvm.minnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
867  store <3 x half> %r.val, <3 x half> addrspace(1)* %r
868  ret void
869}
870
; minnum_v4f16: loads two <4 x half> vectors through %a and %b, applies
; @llvm.minnum.v4f16 to them and stores the <4 x half> result through %r
; (all three are addrspace(1) pointers).
;
; The per-target check lines inside the function are autogenerated (see the
; NOTE at the top of the file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; Lowering shape pinned by the checks:
;   SI          - every half is converted to f32 (v_cvt_f32_f16), combined
;                 with v_min_f32 and converted back (v_cvt_f16_f32); the
;                 halves are then repacked with v_lshlrev_b32 16 + v_or_b32.
;   VI          - v_min_f16 for the low halves, v_min_f16_sdwa writing WORD_1
;                 for the high halves, merged with v_or_b32.
;   GFX9/10/11  - one v_pk_min_f16 per 32-bit pair of halves.
; On every target each input is first passed through "v_mul_f32 1.0, x"
; (SI) or "v_max_f16 / v_pk_max_f16 x, x" (VI and newer) before the min;
; presumably this canonicalizes the inputs - TODO confirm against the
; AMDGPU minnum lowering.
871define amdgpu_kernel void @minnum_v4f16(
872; SI-LABEL: minnum_v4f16:
873; SI:       ; %bb.0: ; %entry
874; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
875; SI-NEXT:    s_mov_b32 s3, 0xf000
876; SI-NEXT:    s_mov_b32 s2, -1
877; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
878; SI-NEXT:    s_waitcnt lgkmcnt(0)
879; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
880; SI-NEXT:    s_mov_b32 s0, s4
881; SI-NEXT:    s_mov_b32 s1, s5
882; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
883; SI-NEXT:    s_waitcnt lgkmcnt(0)
884; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
885; SI-NEXT:    s_lshr_b32 s6, s6, 16
886; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
887; SI-NEXT:    s_lshr_b32 s6, s7, 16
888; SI-NEXT:    v_cvt_f32_f16_e32 v3, s6
889; SI-NEXT:    s_lshr_b32 s6, s5, 16
890; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
891; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
892; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
893; SI-NEXT:    s_lshr_b32 s4, s4, 16
894; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
895; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
896; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
897; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
898; SI-NEXT:    v_min_f32_e32 v3, v3, v5
899; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
900; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
901; SI-NEXT:    v_min_f32_e32 v1, v1, v5
902; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
903; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
904; SI-NEXT:    v_min_f32_e32 v2, v2, v5
905; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
906; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
907; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
908; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
909; SI-NEXT:    v_min_f32_e32 v0, v0, v4
910; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
911; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
912; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
913; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
914; SI-NEXT:    v_or_b32_e32 v1, v1, v3
915; SI-NEXT:    v_or_b32_e32 v0, v0, v2
916; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
917; SI-NEXT:    s_endpgm
918;
919; VI-LABEL: minnum_v4f16:
920; VI:       ; %bb.0: ; %entry
921; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
922; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
923; VI-NEXT:    s_mov_b32 s7, 0xf000
924; VI-NEXT:    s_mov_b32 s6, -1
925; VI-NEXT:    s_waitcnt lgkmcnt(0)
926; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
927; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
928; VI-NEXT:    s_mov_b32 s4, s0
929; VI-NEXT:    s_mov_b32 s5, s1
930; VI-NEXT:    s_waitcnt lgkmcnt(0)
931; VI-NEXT:    v_max_f16_e64 v0, s9, s9
932; VI-NEXT:    v_max_f16_e64 v1, s3, s3
933; VI-NEXT:    s_lshr_b32 s0, s9, 16
934; VI-NEXT:    v_min_f16_e32 v0, v1, v0
935; VI-NEXT:    v_max_f16_e64 v1, s0, s0
936; VI-NEXT:    s_lshr_b32 s0, s3, 16
937; VI-NEXT:    v_max_f16_e64 v2, s0, s0
938; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
939; VI-NEXT:    v_or_b32_e32 v1, v0, v1
940; VI-NEXT:    v_max_f16_e64 v0, s8, s8
941; VI-NEXT:    v_max_f16_e64 v2, s2, s2
942; VI-NEXT:    s_lshr_b32 s0, s8, 16
943; VI-NEXT:    v_min_f16_e32 v0, v2, v0
944; VI-NEXT:    v_max_f16_e64 v2, s0, s0
945; VI-NEXT:    s_lshr_b32 s0, s2, 16
946; VI-NEXT:    v_max_f16_e64 v3, s0, s0
947; VI-NEXT:    v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
948; VI-NEXT:    v_or_b32_e32 v0, v0, v2
949; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
950; VI-NEXT:    s_endpgm
951;
952; GFX9-LABEL: minnum_v4f16:
953; GFX9:       ; %bb.0: ; %entry
954; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
955; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
956; GFX9-NEXT:    s_mov_b32 s3, 0xf000
957; GFX9-NEXT:    s_mov_b32 s2, -1
958; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
959; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
960; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[6:7], 0x0
961; GFX9-NEXT:    s_mov_b32 s0, s4
962; GFX9-NEXT:    s_mov_b32 s1, s5
963; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
964; GFX9-NEXT:    v_pk_max_f16 v0, s11, s11
965; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
966; GFX9-NEXT:    v_pk_max_f16 v2, s10, s10
967; GFX9-NEXT:    v_pk_min_f16 v1, v1, v0
968; GFX9-NEXT:    v_pk_max_f16 v0, s12, s12
969; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
970; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
971; GFX9-NEXT:    s_endpgm
972;
973; GFX10-LABEL: minnum_v4f16:
974; GFX10:       ; %bb.0: ; %entry
975; GFX10-NEXT:    s_clause 0x1
976; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
977; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
978; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
979; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
980; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
981; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
982; GFX10-NEXT:    s_mov_b32 s6, -1
983; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
984; GFX10-NEXT:    v_pk_max_f16 v0, s1, s1
985; GFX10-NEXT:    v_pk_max_f16 v1, s9, s9
986; GFX10-NEXT:    v_pk_max_f16 v2, s0, s0
987; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
988; GFX10-NEXT:    v_pk_min_f16 v1, v1, v0
989; GFX10-NEXT:    v_pk_min_f16 v0, v3, v2
990; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
991; GFX10-NEXT:    s_endpgm
992;
993; GFX11-LABEL: minnum_v4f16:
994; GFX11:       ; %bb.0: ; %entry
995; GFX11-NEXT:    s_clause 0x1
996; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
997; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
998; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
999; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
1000; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
1001; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1002; GFX11-NEXT:    v_pk_max_f16 v0, s5, s5
1003; GFX11-NEXT:    v_pk_max_f16 v1, s3, s3
1004; GFX11-NEXT:    v_pk_max_f16 v2, s4, s4
1005; GFX11-NEXT:    v_pk_max_f16 v3, s2, s2
1006; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1007; GFX11-NEXT:    s_mov_b32 s2, -1
1008; GFX11-NEXT:    v_pk_min_f16 v1, v1, v0
1009; GFX11-NEXT:    v_pk_min_f16 v0, v3, v2
1010; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1011; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1012; GFX11-NEXT:    s_endpgm
1013    <4 x half> addrspace(1)* %r,
1014    <4 x half> addrspace(1)* %a,
1015    <4 x half> addrspace(1)* %b) #0 {
1016entry:
  ; Load both operands as whole <4 x half> vectors.
1017  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
1018  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  ; Operation under test: minnum over the four half lanes.
1019  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
1020  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
1021  ret void
1022}
1023
; fmin_v4f16_imm_a: @llvm.minnum.v4f16 whose first operand is the constant
; vector <8.0, 2.0, 3.0, 4.0> and whose second operand is a <4 x half> loaded
; through %b; the result is stored through %r (both addrspace(1) pointers).
;
; The per-target check lines inside the function are autogenerated (see the
; NOTE at the top of the file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; How the constant operand appears in the pinned output:
;   SI        - as f32 operands of v_min_f32: inline 4.0 and 2.0, literals
;               0x40400000 (3.0) and 0x41000000 (8.0).
;   VI        - low halves use 16-bit literals 0x4200 (3.0) and 0x4800 (8.0)
;               directly in v_min_f16; the high halves 0x4400 (4.0) and
;               0x4000 (2.0) are first moved into VGPRs for v_min_f16_sdwa.
;   GFX9      - packed pairs 0x44004200 and 0x40004800 are placed in SGPRs
;               and used as v_pk_min_f16 operands.
;   GFX10/11  - the same packed pairs are literal operands of v_pk_min_f16.
; Only the loaded operand goes through "v_mul_f32 1.0" / "v_max_f16 x, x" /
; "v_pk_max_f16 x, x" before the min; the constant side does not.
1024define amdgpu_kernel void @fmin_v4f16_imm_a(
1025; SI-LABEL: fmin_v4f16_imm_a:
1026; SI:       ; %bb.0: ; %entry
1027; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1028; SI-NEXT:    s_waitcnt lgkmcnt(0)
1029; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
1030; SI-NEXT:    s_mov_b32 s3, 0xf000
1031; SI-NEXT:    s_mov_b32 s2, -1
1032; SI-NEXT:    s_waitcnt lgkmcnt(0)
1033; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
1034; SI-NEXT:    s_lshr_b32 s5, s5, 16
1035; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
1036; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
1037; SI-NEXT:    s_lshr_b32 s4, s4, 16
1038; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
1039; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
1040; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
1041; SI-NEXT:    v_min_f32_e32 v2, 4.0, v2
1042; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
1043; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
1044; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1045; SI-NEXT:    v_min_f32_e32 v3, 2.0, v3
1046; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
1047; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1048; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1049; SI-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
1050; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1051; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1052; SI-NEXT:    v_or_b32_e32 v1, v1, v2
1053; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
1054; SI-NEXT:    v_or_b32_e32 v0, v0, v2
1055; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1056; SI-NEXT:    s_endpgm
1057;
1058; VI-LABEL: fmin_v4f16_imm_a:
1059; VI:       ; %bb.0: ; %entry
1060; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1061; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
1062; VI-NEXT:    s_mov_b32 s7, 0xf000
1063; VI-NEXT:    s_mov_b32 s6, -1
1064; VI-NEXT:    s_waitcnt lgkmcnt(0)
1065; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1066; VI-NEXT:    s_mov_b32 s4, s0
1067; VI-NEXT:    s_mov_b32 s5, s1
1068; VI-NEXT:    s_waitcnt lgkmcnt(0)
1069; VI-NEXT:    s_lshr_b32 s0, s3, 16
1070; VI-NEXT:    v_max_f16_e64 v1, s3, s3
1071; VI-NEXT:    v_max_f16_e64 v3, s0, s0
1072; VI-NEXT:    v_max_f16_e64 v2, s2, s2
1073; VI-NEXT:    v_min_f16_e32 v1, 0x4200, v1
1074; VI-NEXT:    v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1075; VI-NEXT:    s_lshr_b32 s0, s2, 16
1076; VI-NEXT:    v_or_b32_e32 v1, v1, v0
1077; VI-NEXT:    v_min_f16_e32 v0, 0x4800, v2
1078; VI-NEXT:    v_max_f16_e64 v2, s0, s0
1079; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
1080; VI-NEXT:    v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1081; VI-NEXT:    v_or_b32_e32 v0, v0, v2
1082; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1083; VI-NEXT:    s_endpgm
1084;
1085; GFX9-LABEL: fmin_v4f16_imm_a:
1086; GFX9:       ; %bb.0: ; %entry
1087; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1088; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
1089; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
1090; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1091; GFX9-NEXT:    s_mov_b32 s6, -1
1092; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1093; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1094; GFX9-NEXT:    s_mov_b32 s4, s0
1095; GFX9-NEXT:    s_mov_b32 s5, s1
1096; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1097; GFX9-NEXT:    v_pk_max_f16 v0, s3, s3
1098; GFX9-NEXT:    v_pk_max_f16 v2, s2, s2
1099; GFX9-NEXT:    v_pk_min_f16 v1, v0, s8
1100; GFX9-NEXT:    v_pk_min_f16 v0, v2, s9
1101; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1102; GFX9-NEXT:    s_endpgm
1103;
1104; GFX10-LABEL: fmin_v4f16_imm_a:
1105; GFX10:       ; %bb.0: ; %entry
1106; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1107; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1108; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1109; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1110; GFX10-NEXT:    v_pk_max_f16 v0, s3, s3
1111; GFX10-NEXT:    v_pk_max_f16 v2, s2, s2
1112; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1113; GFX10-NEXT:    s_mov_b32 s2, -1
1114; GFX10-NEXT:    v_pk_min_f16 v1, 0x44004200, v0
1115; GFX10-NEXT:    v_pk_min_f16 v0, 0x40004800, v2
1116; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1117; GFX10-NEXT:    s_endpgm
1118;
1119; GFX11-LABEL: fmin_v4f16_imm_a:
1120; GFX11:       ; %bb.0: ; %entry
1121; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1122; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1123; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
1124; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1125; GFX11-NEXT:    v_pk_max_f16 v0, s3, s3
1126; GFX11-NEXT:    v_pk_max_f16 v2, s2, s2
1127; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1128; GFX11-NEXT:    s_mov_b32 s2, -1
1129; GFX11-NEXT:    v_pk_min_f16 v1, 0x44004200, v0
1130; GFX11-NEXT:    v_pk_min_f16 v0, 0x40004800, v2
1131; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1132; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1133; GFX11-NEXT:    s_endpgm
1134    <4 x half> addrspace(1)* %r,
1135    <4 x half> addrspace(1)* %b) #0 {
1136entry:
1137  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  ; The constant vector is the FIRST minnum operand; %b.val is the second.
1138  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
1139  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
1140  ret void
1141}
1142
; Attribute group #0, attached to the kernels above via "#0" on their define
; lines. It sets "denormal-fp-math-f32" to "preserve-sign,preserve-sign";
; per the LLVM LangRef the two comma-separated fields are the output and
; input denormal modes respectively - NOTE(review): confirm field order
; against the LangRef for the LLVM version in use.
1143attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
1144