1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s
6
; Declarations of the llvm.minnum intrinsic for half and for 2-, 3- and
; 4-element half vectors.
7declare half @llvm.minnum.f16(half %a, half %b)
8declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
9declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b)
10declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
11
; Scalar half minnum in an amdgpu_kernel: loads %a and %b, calls
; llvm.minnum.f16 and stores the result to %r.
; In the expected output every loaded operand goes through one extra
; instruction before the min: v_mul_f32 by 1.0 (after conversion to f32) on
; tahiti, and v_max_f16 of the value with itself on fiji, gfx900 and gfx1010.
; NOTE(review): presumably this quiets/canonicalizes the inputs because IEEE
; mode is enabled for kernels; attribute group #0 is not visible here - confirm.
12define amdgpu_kernel void @minnum_f16_ieee(
13; SI-LABEL: minnum_f16_ieee:
14; SI:       ; %bb.0: ; %entry
15; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
16; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
17; SI-NEXT:    s_mov_b32 s3, 0xf000
18; SI-NEXT:    s_mov_b32 s2, -1
19; SI-NEXT:    s_mov_b32 s14, s2
20; SI-NEXT:    s_waitcnt lgkmcnt(0)
21; SI-NEXT:    s_mov_b32 s12, s6
22; SI-NEXT:    s_mov_b32 s13, s7
23; SI-NEXT:    s_mov_b32 s15, s3
24; SI-NEXT:    s_mov_b32 s10, s2
25; SI-NEXT:    s_mov_b32 s11, s3
26; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
27; SI-NEXT:    s_waitcnt vmcnt(0)
28; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
29; SI-NEXT:    s_waitcnt vmcnt(0)
30; SI-NEXT:    s_mov_b32 s0, s4
31; SI-NEXT:    s_mov_b32 s1, s5
32; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
33; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
34; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
35; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
36; SI-NEXT:    v_min_f32_e32 v0, v0, v1
37; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
38; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
39; SI-NEXT:    s_endpgm
40;
41; VI-LABEL: minnum_f16_ieee:
42; VI:       ; %bb.0: ; %entry
43; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
44; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
45; VI-NEXT:    s_mov_b32 s3, 0xf000
46; VI-NEXT:    s_mov_b32 s2, -1
47; VI-NEXT:    s_mov_b32 s14, s2
48; VI-NEXT:    s_waitcnt lgkmcnt(0)
49; VI-NEXT:    s_mov_b32 s12, s6
50; VI-NEXT:    s_mov_b32 s13, s7
51; VI-NEXT:    s_mov_b32 s15, s3
52; VI-NEXT:    s_mov_b32 s10, s2
53; VI-NEXT:    s_mov_b32 s11, s3
54; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
55; VI-NEXT:    s_waitcnt vmcnt(0)
56; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
57; VI-NEXT:    s_waitcnt vmcnt(0)
58; VI-NEXT:    s_mov_b32 s0, s4
59; VI-NEXT:    s_mov_b32 s1, s5
60; VI-NEXT:    v_max_f16_e32 v0, v0, v0
61; VI-NEXT:    v_max_f16_e32 v1, v1, v1
62; VI-NEXT:    v_min_f16_e32 v0, v0, v1
63; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
64; VI-NEXT:    s_endpgm
65;
66; GFX9-LABEL: minnum_f16_ieee:
67; GFX9:       ; %bb.0: ; %entry
68; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
69; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
70; GFX9-NEXT:    s_mov_b32 s3, 0xf000
71; GFX9-NEXT:    s_mov_b32 s2, -1
72; GFX9-NEXT:    s_mov_b32 s14, s2
73; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
74; GFX9-NEXT:    s_mov_b32 s12, s6
75; GFX9-NEXT:    s_mov_b32 s13, s7
76; GFX9-NEXT:    s_mov_b32 s15, s3
77; GFX9-NEXT:    s_mov_b32 s10, s2
78; GFX9-NEXT:    s_mov_b32 s11, s3
79; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
80; GFX9-NEXT:    s_waitcnt vmcnt(0)
81; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
82; GFX9-NEXT:    s_waitcnt vmcnt(0)
83; GFX9-NEXT:    s_mov_b32 s0, s4
84; GFX9-NEXT:    s_mov_b32 s1, s5
85; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
86; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
87; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
88; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
89; GFX9-NEXT:    s_endpgm
90;
91; GFX10-LABEL: minnum_f16_ieee:
92; GFX10:       ; %bb.0: ; %entry
93; GFX10-NEXT:    s_clause 0x1
94; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
95; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
96; GFX10-NEXT:    s_mov_b32 s2, -1
97; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
98; GFX10-NEXT:    s_mov_b32 s14, s2
99; GFX10-NEXT:    s_mov_b32 s15, s3
100; GFX10-NEXT:    s_mov_b32 s10, s2
101; GFX10-NEXT:    s_mov_b32 s11, s3
102; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
103; GFX10-NEXT:    s_mov_b32 s12, s6
104; GFX10-NEXT:    s_mov_b32 s13, s7
105; GFX10-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
106; GFX10-NEXT:    s_waitcnt vmcnt(0)
107; GFX10-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
108; GFX10-NEXT:    s_waitcnt vmcnt(0)
109; GFX10-NEXT:    s_mov_b32 s0, s4
110; GFX10-NEXT:    s_mov_b32 s1, s5
111; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
112; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
113; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
114; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
115; GFX10-NEXT:    s_endpgm
116    half addrspace(1)* %r,
117    half addrspace(1)* %a,
118    half addrspace(1)* %b) #0 {
119entry:
; Both operand loads are volatile; in the expected output each corresponding
; buffer_load_ushort carries glc and is followed by its own s_waitcnt vmcnt(0).
120  %a.val = load volatile half, half addrspace(1)* %a
121  %b.val = load volatile half, half addrspace(1)* %b
122  %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
123  store half %r.val, half addrspace(1)* %r
124  ret void
125}
126
; Scalar half minnum in an amdgpu_ps function that takes and returns half
; values directly (no memory accesses).
; Unlike minnum_f16_ieee, the expected output has no v_mul_f32 by 1.0 and no
; self v_max_f16 on the operands: fiji, gfx900 and gfx1010 expect a single
; v_min_f16, and tahiti expects only f16<->f32 conversions around a v_min_f32.
127define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 {
128; SI-LABEL: minnum_f16_no_ieee:
129; SI:       ; %bb.0:
130; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
131; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
132; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
133; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
134; SI-NEXT:    v_min_f32_e32 v0, v0, v1
135; SI-NEXT:    ; return to shader part epilog
136;
137; VI-LABEL: minnum_f16_no_ieee:
138; VI:       ; %bb.0:
139; VI-NEXT:    v_min_f16_e32 v0, v0, v1
140; VI-NEXT:    ; return to shader part epilog
141;
142; GFX9-LABEL: minnum_f16_no_ieee:
143; GFX9:       ; %bb.0:
144; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
145; GFX9-NEXT:    ; return to shader part epilog
146;
147; GFX10-LABEL: minnum_f16_no_ieee:
148; GFX10:       ; %bb.0:
149; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
150; GFX10-NEXT:    ; return to shader part epilog
151  %r.val = call half @llvm.minnum.f16(half %a, half %b)
152  ret half %r.val
153}
154
; Scalar half minnum in an amdgpu_kernel with the constant 3.0 as the first
; operand and a value loaded from %b as the second; the result is stored to %r.
; The constant appears in the expected min instruction as the literal
; 0x40400000 (3.0 as f32) on tahiti and 0x4200 (3.0 as half) on fiji, gfx900
; and gfx1010. Only the loaded operand gets the extra v_mul_f32 by 1.0 /
; self v_max_f16 before the min.
155define amdgpu_kernel void @minnum_f16_imm_a(
156; SI-LABEL: minnum_f16_imm_a:
157; SI:       ; %bb.0: ; %entry
158; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
159; SI-NEXT:    s_mov_b32 s7, 0xf000
160; SI-NEXT:    s_mov_b32 s6, -1
161; SI-NEXT:    s_mov_b32 s10, s6
162; SI-NEXT:    s_mov_b32 s11, s7
163; SI-NEXT:    s_waitcnt lgkmcnt(0)
164; SI-NEXT:    s_mov_b32 s8, s2
165; SI-NEXT:    s_mov_b32 s9, s3
166; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
167; SI-NEXT:    s_mov_b32 s4, s0
168; SI-NEXT:    s_mov_b32 s5, s1
169; SI-NEXT:    s_waitcnt vmcnt(0)
170; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
171; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
172; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
173; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
174; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
175; SI-NEXT:    s_endpgm
176;
177; VI-LABEL: minnum_f16_imm_a:
178; VI:       ; %bb.0: ; %entry
179; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
180; VI-NEXT:    s_mov_b32 s7, 0xf000
181; VI-NEXT:    s_mov_b32 s6, -1
182; VI-NEXT:    s_mov_b32 s10, s6
183; VI-NEXT:    s_mov_b32 s11, s7
184; VI-NEXT:    s_waitcnt lgkmcnt(0)
185; VI-NEXT:    s_mov_b32 s8, s2
186; VI-NEXT:    s_mov_b32 s9, s3
187; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
188; VI-NEXT:    s_mov_b32 s4, s0
189; VI-NEXT:    s_mov_b32 s5, s1
190; VI-NEXT:    s_waitcnt vmcnt(0)
191; VI-NEXT:    v_max_f16_e32 v0, v0, v0
192; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
193; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
194; VI-NEXT:    s_endpgm
195;
196; GFX9-LABEL: minnum_f16_imm_a:
197; GFX9:       ; %bb.0: ; %entry
198; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
199; GFX9-NEXT:    s_mov_b32 s7, 0xf000
200; GFX9-NEXT:    s_mov_b32 s6, -1
201; GFX9-NEXT:    s_mov_b32 s10, s6
202; GFX9-NEXT:    s_mov_b32 s11, s7
203; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
204; GFX9-NEXT:    s_mov_b32 s8, s2
205; GFX9-NEXT:    s_mov_b32 s9, s3
206; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
207; GFX9-NEXT:    s_mov_b32 s4, s0
208; GFX9-NEXT:    s_mov_b32 s5, s1
209; GFX9-NEXT:    s_waitcnt vmcnt(0)
210; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
211; GFX9-NEXT:    v_min_f16_e32 v0, 0x4200, v0
212; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
213; GFX9-NEXT:    s_endpgm
214;
215; GFX10-LABEL: minnum_f16_imm_a:
216; GFX10:       ; %bb.0: ; %entry
217; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
218; GFX10-NEXT:    s_mov_b32 s6, -1
219; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
220; GFX10-NEXT:    s_mov_b32 s10, s6
221; GFX10-NEXT:    s_mov_b32 s11, s7
222; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
223; GFX10-NEXT:    s_mov_b32 s8, s2
224; GFX10-NEXT:    s_mov_b32 s9, s3
225; GFX10-NEXT:    s_mov_b32 s4, s0
226; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
227; GFX10-NEXT:    s_mov_b32 s5, s1
228; GFX10-NEXT:    s_waitcnt vmcnt(0)
229; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
230; GFX10-NEXT:    v_min_f16_e32 v0, 0x4200, v0
231; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
232; GFX10-NEXT:    s_endpgm
233    half addrspace(1)* %r,
234    half addrspace(1)* %b) #0 {
235entry:
236  %b.val = load half, half addrspace(1)* %b
; Constant is the FIRST intrinsic operand here (see minnum_f16_imm_b for the
; mirrored case).
237  %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val)
238  store half %r.val, half addrspace(1)* %r
239  ret void
240}
241
; Scalar half minnum in an amdgpu_kernel with a value loaded from %a as the
; first operand and the constant 4.0 as the second; the result is stored to %r.
; In the expected output for every target the constant is printed as 4.0 and
; sits in the first source position of the min instruction, even though it is
; the second operand in the IR. Only the loaded operand gets the extra
; v_mul_f32 by 1.0 / self v_max_f16 before the min.
242define amdgpu_kernel void @minnum_f16_imm_b(
243; SI-LABEL: minnum_f16_imm_b:
244; SI:       ; %bb.0: ; %entry
245; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
246; SI-NEXT:    s_mov_b32 s7, 0xf000
247; SI-NEXT:    s_mov_b32 s6, -1
248; SI-NEXT:    s_mov_b32 s10, s6
249; SI-NEXT:    s_mov_b32 s11, s7
250; SI-NEXT:    s_waitcnt lgkmcnt(0)
251; SI-NEXT:    s_mov_b32 s8, s2
252; SI-NEXT:    s_mov_b32 s9, s3
253; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
254; SI-NEXT:    s_mov_b32 s4, s0
255; SI-NEXT:    s_mov_b32 s5, s1
256; SI-NEXT:    s_waitcnt vmcnt(0)
257; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
258; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
259; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
260; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
261; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
262; SI-NEXT:    s_endpgm
263;
264; VI-LABEL: minnum_f16_imm_b:
265; VI:       ; %bb.0: ; %entry
266; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
267; VI-NEXT:    s_mov_b32 s7, 0xf000
268; VI-NEXT:    s_mov_b32 s6, -1
269; VI-NEXT:    s_mov_b32 s10, s6
270; VI-NEXT:    s_mov_b32 s11, s7
271; VI-NEXT:    s_waitcnt lgkmcnt(0)
272; VI-NEXT:    s_mov_b32 s8, s2
273; VI-NEXT:    s_mov_b32 s9, s3
274; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
275; VI-NEXT:    s_mov_b32 s4, s0
276; VI-NEXT:    s_mov_b32 s5, s1
277; VI-NEXT:    s_waitcnt vmcnt(0)
278; VI-NEXT:    v_max_f16_e32 v0, v0, v0
279; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
280; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
281; VI-NEXT:    s_endpgm
282;
283; GFX9-LABEL: minnum_f16_imm_b:
284; GFX9:       ; %bb.0: ; %entry
285; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
286; GFX9-NEXT:    s_mov_b32 s7, 0xf000
287; GFX9-NEXT:    s_mov_b32 s6, -1
288; GFX9-NEXT:    s_mov_b32 s10, s6
289; GFX9-NEXT:    s_mov_b32 s11, s7
290; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
291; GFX9-NEXT:    s_mov_b32 s8, s2
292; GFX9-NEXT:    s_mov_b32 s9, s3
293; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
294; GFX9-NEXT:    s_mov_b32 s4, s0
295; GFX9-NEXT:    s_mov_b32 s5, s1
296; GFX9-NEXT:    s_waitcnt vmcnt(0)
297; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
298; GFX9-NEXT:    v_min_f16_e32 v0, 4.0, v0
299; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
300; GFX9-NEXT:    s_endpgm
301;
302; GFX10-LABEL: minnum_f16_imm_b:
303; GFX10:       ; %bb.0: ; %entry
304; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
305; GFX10-NEXT:    s_mov_b32 s6, -1
306; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
307; GFX10-NEXT:    s_mov_b32 s10, s6
308; GFX10-NEXT:    s_mov_b32 s11, s7
309; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
310; GFX10-NEXT:    s_mov_b32 s8, s2
311; GFX10-NEXT:    s_mov_b32 s9, s3
312; GFX10-NEXT:    s_mov_b32 s4, s0
313; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
314; GFX10-NEXT:    s_mov_b32 s5, s1
315; GFX10-NEXT:    s_waitcnt vmcnt(0)
316; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
317; GFX10-NEXT:    v_min_f16_e32 v0, 4.0, v0
318; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
319; GFX10-NEXT:    s_endpgm
320    half addrspace(1)* %r,
321    half addrspace(1)* %a) #0 {
322entry:
323  %a.val = load half, half addrspace(1)* %a
; Constant is the SECOND intrinsic operand here (see minnum_f16_imm_a for the
; mirrored case).
324  %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0)
325  store half %r.val, half addrspace(1)* %r
326  ret void
327}
328
; <2 x half> minnum in an amdgpu_kernel: loads both vectors (non-volatile),
; calls llvm.minnum.v2f16 and stores the result to %r.
; Expected lowering per target, as captured by the checks below:
;  - tahiti: each element is converted to f32, multiplied by 1.0, combined
;    with v_min_f32, converted back to f16 and repacked with a shift and an or.
;  - fiji: each element gets a self v_max_f16; the low halves use v_min_f16,
;    the high halves use v_min_f16_sdwa writing WORD_1, and the two results
;    are merged with v_or_b32.
;  - gfx900 / gfx1010: packed v_pk_max_f16 on each input, then one
;    v_pk_min_f16.
; In all four outputs the vector operands are fetched with scalar s_load_dword.
329define amdgpu_kernel void @minnum_v2f16_ieee(
330; SI-LABEL: minnum_v2f16_ieee:
331; SI:       ; %bb.0: ; %entry
332; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
333; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
334; SI-NEXT:    s_waitcnt lgkmcnt(0)
335; SI-NEXT:    s_load_dword s2, s[6:7], 0x0
336; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
337; SI-NEXT:    s_mov_b32 s7, 0xf000
338; SI-NEXT:    s_mov_b32 s6, -1
339; SI-NEXT:    s_waitcnt lgkmcnt(0)
340; SI-NEXT:    s_lshr_b32 s1, s2, 16
341; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
342; SI-NEXT:    s_lshr_b32 s0, s0, 16
343; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
344; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
345; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
346; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
347; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
348; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
349; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
350; SI-NEXT:    v_min_f32_e32 v2, v3, v2
351; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
352; SI-NEXT:    v_min_f32_e32 v0, v0, v1
353; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
354; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
355; SI-NEXT:    v_or_b32_e32 v0, v0, v1
356; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
357; SI-NEXT:    s_endpgm
358;
359; VI-LABEL: minnum_v2f16_ieee:
360; VI:       ; %bb.0: ; %entry
361; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
362; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
363; VI-NEXT:    s_mov_b32 s7, 0xf000
364; VI-NEXT:    s_mov_b32 s6, -1
365; VI-NEXT:    s_waitcnt lgkmcnt(0)
366; VI-NEXT:    s_load_dword s8, s[4:5], 0x0
367; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
368; VI-NEXT:    s_mov_b32 s4, s0
369; VI-NEXT:    s_mov_b32 s5, s1
370; VI-NEXT:    s_waitcnt lgkmcnt(0)
371; VI-NEXT:    v_max_f16_e64 v0, s8, s8
372; VI-NEXT:    v_max_f16_e64 v1, s2, s2
373; VI-NEXT:    s_lshr_b32 s0, s8, 16
374; VI-NEXT:    v_min_f16_e32 v0, v1, v0
375; VI-NEXT:    v_max_f16_e64 v1, s0, s0
376; VI-NEXT:    s_lshr_b32 s0, s2, 16
377; VI-NEXT:    v_max_f16_e64 v2, s0, s0
378; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
379; VI-NEXT:    v_or_b32_e32 v0, v0, v1
380; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
381; VI-NEXT:    s_endpgm
382;
383; GFX9-LABEL: minnum_v2f16_ieee:
384; GFX9:       ; %bb.0: ; %entry
385; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
386; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
387; GFX9-NEXT:    s_mov_b32 s3, 0xf000
388; GFX9-NEXT:    s_mov_b32 s2, -1
389; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
390; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
391; GFX9-NEXT:    s_load_dword s11, s[6:7], 0x0
392; GFX9-NEXT:    s_mov_b32 s0, s4
393; GFX9-NEXT:    s_mov_b32 s1, s5
394; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
395; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
396; GFX9-NEXT:    v_pk_max_f16 v1, s11, s11
397; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
398; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
399; GFX9-NEXT:    s_endpgm
400;
401; GFX10-LABEL: minnum_v2f16_ieee:
402; GFX10:       ; %bb.0: ; %entry
403; GFX10-NEXT:    s_clause 0x1
404; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
405; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
406; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
407; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
408; GFX10-NEXT:    s_load_dword s1, s[6:7], 0x0
409; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
410; GFX10-NEXT:    s_mov_b32 s6, -1
411; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
412; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
413; GFX10-NEXT:    v_pk_max_f16 v1, s1, s1
414; GFX10-NEXT:    v_pk_min_f16 v0, v1, v0
415; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
416; GFX10-NEXT:    s_endpgm
417    <2 x half> addrspace(1)* %r,
418    <2 x half> addrspace(1)* %a,
419    <2 x half> addrspace(1)* %b) #0 {
420entry:
421  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
422  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
423  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
424  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
425  ret void
426}
427
; <2 x half> minnum in an amdgpu_ps function that takes and returns the
; vectors directly (no memory accesses).
; Unlike minnum_v2f16_ieee, the expected output has no v_mul_f32 by 1.0,
; self v_max_f16 or v_pk_max_f16 on the operands: gfx900 and gfx1010 expect a
; single v_pk_min_f16, fiji expects v_min_f16 plus v_min_f16_sdwa merged with
; v_or_b32, and tahiti expects one v_min_f32 per element (operands in v0-v3)
; after f16<->f32 conversions.
428define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 {
429; SI-LABEL: minnum_v2f16_no_ieee:
430; SI:       ; %bb.0:
431; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
432; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
433; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
434; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
435; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
436; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
437; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
438; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
439; SI-NEXT:    v_min_f32_e32 v0, v0, v2
440; SI-NEXT:    v_min_f32_e32 v1, v1, v3
441; SI-NEXT:    ; return to shader part epilog
442;
443; VI-LABEL: minnum_v2f16_no_ieee:
444; VI:       ; %bb.0:
445; VI-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
446; VI-NEXT:    v_min_f16_e32 v0, v0, v1
447; VI-NEXT:    v_or_b32_e32 v0, v0, v2
448; VI-NEXT:    ; return to shader part epilog
449;
450; GFX9-LABEL: minnum_v2f16_no_ieee:
451; GFX9:       ; %bb.0:
452; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
453; GFX9-NEXT:    ; return to shader part epilog
454;
455; GFX10-LABEL: minnum_v2f16_no_ieee:
456; GFX10:       ; %bb.0:
457; GFX10-NEXT:    v_pk_min_f16 v0, v0, v1
458; GFX10-NEXT:    ; return to shader part epilog
459  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
460  ret <2 x half> %r.val
461}
462
; <2 x half> minnum in an amdgpu_kernel with the constant vector <3.0, 4.0> as
; the first operand and a vector loaded from %b as the second; the result is
; stored to %r.
; How the constant shows up in the expected output:
;  - tahiti: per element, as the f32 literal 0x40400000 (3.0) and as 4.0.
;  - fiji: 0x4200 (half 3.0) as a literal on the low-half v_min_f16, and
;    0x4400 (half 4.0) moved into v2 for the high-half v_min_f16_sdwa.
;  - gfx900: packed value 0x44004200 moved into an SGPR and used by
;    v_pk_min_f16.
;  - gfx1010: packed value 0x44004200 used directly as an operand of
;    v_pk_min_f16.
463define amdgpu_kernel void @minnum_v2f16_imm_a(
464; SI-LABEL: minnum_v2f16_imm_a:
465; SI:       ; %bb.0: ; %entry
466; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
467; SI-NEXT:    s_waitcnt lgkmcnt(0)
468; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
469; SI-NEXT:    s_mov_b32 s3, 0xf000
470; SI-NEXT:    s_waitcnt lgkmcnt(0)
471; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
472; SI-NEXT:    s_lshr_b32 s2, s2, 16
473; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
474; SI-NEXT:    s_mov_b32 s2, -1
475; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
476; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
477; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
478; SI-NEXT:    v_min_f32_e32 v1, 4.0, v1
479; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
480; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
481; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
482; SI-NEXT:    v_or_b32_e32 v0, v0, v1
483; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
484; SI-NEXT:    s_endpgm
485;
486; VI-LABEL: minnum_v2f16_imm_a:
487; VI:       ; %bb.0: ; %entry
488; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
489; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
490; VI-NEXT:    s_waitcnt lgkmcnt(0)
491; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
492; VI-NEXT:    s_mov_b32 s3, 0xf000
493; VI-NEXT:    s_mov_b32 s2, -1
494; VI-NEXT:    s_waitcnt lgkmcnt(0)
495; VI-NEXT:    v_max_f16_e64 v0, s4, s4
496; VI-NEXT:    s_lshr_b32 s4, s4, 16
497; VI-NEXT:    v_max_f16_e64 v1, s4, s4
498; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
499; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
500; VI-NEXT:    v_or_b32_e32 v0, v0, v1
501; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
502; VI-NEXT:    s_endpgm
503;
504; GFX9-LABEL: minnum_v2f16_imm_a:
505; GFX9:       ; %bb.0: ; %entry
506; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
507; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
508; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
509; GFX9-NEXT:    s_mov_b32 s3, 0xf000
510; GFX9-NEXT:    s_mov_b32 s2, -1
511; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
512; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
513; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
514; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
515; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
516; GFX9-NEXT:    s_endpgm
517;
518; GFX10-LABEL: minnum_v2f16_imm_a:
519; GFX10:       ; %bb.0: ; %entry
520; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
521; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
522; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
523; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
524; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
525; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
526; GFX10-NEXT:    s_mov_b32 s2, -1
527; GFX10-NEXT:    v_pk_min_f16 v0, 0x44004200, v0
528; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
529; GFX10-NEXT:    s_endpgm
530    <2 x half> addrspace(1)* %r,
531    <2 x half> addrspace(1)* %b) #0 {
532entry:
533  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
; Element 0 of the constant is 3.0 and element 1 is 4.0, matching the packed
; value 0x44004200 (0x4200 in the low half, 0x4400 in the high half).
534  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
535  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
536  ret void
537}
538
; <2 x half> minnum in an amdgpu_kernel with a vector loaded from %a as the
; first operand and the constant vector <4.0, 3.0> as the second; the result
; is stored to %r. This mirrors minnum_v2f16_imm_a with the constant on the
; other side and its elements swapped.
; How the constant shows up in the expected output:
;  - tahiti: per element, as 4.0 and as the f32 literal 0x40400000 (3.0).
;  - fiji: 4.0 on the low-half v_min_f16, and 0x4200 (half 3.0) moved into v2
;    for the high-half v_min_f16_sdwa.
;  - gfx900: packed value 0x42004400 moved into an SGPR and used by
;    v_pk_min_f16.
;  - gfx1010: packed value 0x42004400 used directly as an operand of
;    v_pk_min_f16.
539define amdgpu_kernel void @minnum_v2f16_imm_b(
540; SI-LABEL: minnum_v2f16_imm_b:
541; SI:       ; %bb.0: ; %entry
542; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
543; SI-NEXT:    s_waitcnt lgkmcnt(0)
544; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
545; SI-NEXT:    s_mov_b32 s3, 0xf000
546; SI-NEXT:    s_waitcnt lgkmcnt(0)
547; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
548; SI-NEXT:    s_lshr_b32 s2, s2, 16
549; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
550; SI-NEXT:    s_mov_b32 s2, -1
551; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
552; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
553; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
554; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
555; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
556; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
557; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
558; SI-NEXT:    v_or_b32_e32 v0, v0, v1
559; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
560; SI-NEXT:    s_endpgm
561;
562; VI-LABEL: minnum_v2f16_imm_b:
563; VI:       ; %bb.0: ; %entry
564; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
565; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
566; VI-NEXT:    s_waitcnt lgkmcnt(0)
567; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
568; VI-NEXT:    s_mov_b32 s3, 0xf000
569; VI-NEXT:    s_mov_b32 s2, -1
570; VI-NEXT:    s_waitcnt lgkmcnt(0)
571; VI-NEXT:    v_max_f16_e64 v0, s4, s4
572; VI-NEXT:    s_lshr_b32 s4, s4, 16
573; VI-NEXT:    v_max_f16_e64 v1, s4, s4
574; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
575; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
576; VI-NEXT:    v_or_b32_e32 v0, v0, v1
577; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
578; VI-NEXT:    s_endpgm
579;
580; GFX9-LABEL: minnum_v2f16_imm_b:
581; GFX9:       ; %bb.0: ; %entry
582; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
583; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
584; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
585; GFX9-NEXT:    s_mov_b32 s3, 0xf000
586; GFX9-NEXT:    s_mov_b32 s2, -1
587; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
588; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
589; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
590; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
591; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
592; GFX9-NEXT:    s_endpgm
593;
594; GFX10-LABEL: minnum_v2f16_imm_b:
595; GFX10:       ; %bb.0: ; %entry
596; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
597; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
598; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
599; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
600; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
601; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
602; GFX10-NEXT:    s_mov_b32 s2, -1
603; GFX10-NEXT:    v_pk_min_f16 v0, 0x42004400, v0
604; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
605; GFX10-NEXT:    s_endpgm
606    <2 x half> addrspace(1)* %r,
607    <2 x half> addrspace(1)* %a) #0 {
608entry:
609  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
; Element 0 of the constant is 4.0 and element 1 is 3.0, matching the packed
; value 0x42004400 (0x4400 in the low half, 0x4200 in the high half).
610  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
611  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
612  ret void
613}
614
615; FIXME: Scalarize with undef half
; <3 x half> minnum in an amdgpu_kernel: loads both vectors, calls
; llvm.minnum.v3f16 and stores the result to %r.
; In the expected output each operand is fetched as two dwords
; (s_load_dwordx2) and the result is written as a dword at offset 0 plus a
; short at offset 4. On gfx900 and gfx1010 the third element is still
; processed with the packed v_pk_max_f16 / v_pk_min_f16 instructions on the
; second dword, even though only its low short is stored.
616define amdgpu_kernel void @minnum_v3f16(
617; SI-LABEL: minnum_v3f16:
618; SI:       ; %bb.0: ; %entry
619; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
620; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
621; SI-NEXT:    s_waitcnt lgkmcnt(0)
622; SI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
623; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
624; SI-NEXT:    s_mov_b32 s7, 0xf000
625; SI-NEXT:    s_mov_b32 s6, -1
626; SI-NEXT:    s_waitcnt lgkmcnt(0)
627; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
628; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
629; SI-NEXT:    s_lshr_b32 s2, s2, 16
630; SI-NEXT:    s_lshr_b32 s3, s0, 16
631; SI-NEXT:    v_cvt_f32_f16_e32 v2, s3
632; SI-NEXT:    v_cvt_f32_f16_e32 v3, s2
633; SI-NEXT:    v_cvt_f32_f16_e32 v5, s0
634; SI-NEXT:    v_cvt_f32_f16_e32 v4, s1
635; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
636; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
637; SI-NEXT:    v_min_f32_e32 v2, v3, v2
638; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
639; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
640; SI-NEXT:    v_min_f32_e32 v1, v1, v3
641; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
642; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
643; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
644; SI-NEXT:    v_min_f32_e32 v0, v0, v3
645; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
646; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
647; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
648; SI-NEXT:    v_or_b32_e32 v1, v1, v2
649; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
650; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
651; SI-NEXT:    s_endpgm
652;
653; VI-LABEL: minnum_v3f16:
654; VI:       ; %bb.0: ; %entry
655; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
656; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
657; VI-NEXT:    s_mov_b32 s7, 0xf000
658; VI-NEXT:    s_mov_b32 s6, -1
659; VI-NEXT:    s_waitcnt lgkmcnt(0)
660; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
661; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
662; VI-NEXT:    s_mov_b32 s4, s0
663; VI-NEXT:    s_mov_b32 s5, s1
664; VI-NEXT:    s_waitcnt lgkmcnt(0)
665; VI-NEXT:    v_max_f16_e64 v0, s8, s8
666; VI-NEXT:    v_max_f16_e64 v1, s2, s2
667; VI-NEXT:    s_lshr_b32 s0, s8, 16
668; VI-NEXT:    v_min_f16_e32 v0, v1, v0
669; VI-NEXT:    v_max_f16_e64 v1, s0, s0
670; VI-NEXT:    s_lshr_b32 s0, s2, 16
671; VI-NEXT:    v_max_f16_e64 v2, s0, s0
672; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
673; VI-NEXT:    v_or_b32_e32 v0, v0, v1
674; VI-NEXT:    v_max_f16_e64 v1, s9, s9
675; VI-NEXT:    v_max_f16_e64 v2, s3, s3
676; VI-NEXT:    v_min_f16_e32 v1, v2, v1
677; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
678; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
679; VI-NEXT:    s_endpgm
680;
681; GFX9-LABEL: minnum_v3f16:
682; GFX9:       ; %bb.0: ; %entry
683; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
684; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
685; GFX9-NEXT:    s_mov_b32 s3, 0xf000
686; GFX9-NEXT:    s_mov_b32 s2, -1
687; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
688; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
689; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[6:7], 0x0
690; GFX9-NEXT:    s_mov_b32 s0, s4
691; GFX9-NEXT:    s_mov_b32 s1, s5
692; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
693; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
694; GFX9-NEXT:    v_pk_max_f16 v1, s12, s12
695; GFX9-NEXT:    v_pk_max_f16 v2, s11, s11
696; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
697; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
698; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2
699; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
700; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
701; GFX9-NEXT:    s_endpgm
702;
703; GFX10-LABEL: minnum_v3f16:
704; GFX10:       ; %bb.0: ; %entry
705; GFX10-NEXT:    s_clause 0x1
706; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
707; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
708; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
709; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
710; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
711; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
712; GFX10-NEXT:    s_mov_b32 s6, -1
713; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
714; GFX10-NEXT:    v_pk_max_f16 v1, s1, s1
715; GFX10-NEXT:    v_pk_max_f16 v2, s9, s9
716; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
717; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
718; GFX10-NEXT:    v_pk_min_f16 v1, v2, v1
719; GFX10-NEXT:    v_pk_min_f16 v0, v3, v0
720; GFX10-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
721; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
722; GFX10-NEXT:    s_endpgm
723    <3 x half> addrspace(1)* %r,
724    <3 x half> addrspace(1)* %a,
725    <3 x half> addrspace(1)* %b) #0 {
726entry:
727  %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
728  %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
729  %r.val = call <3 x half> @llvm.minnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
730  store <3 x half> %r.val, <3 x half> addrspace(1)* %r
731  ret void
732}
733
734define amdgpu_kernel void @minnum_v4f16(
735; SI-LABEL: minnum_v4f16:
736; SI:       ; %bb.0: ; %entry
737; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
738; SI-NEXT:    s_mov_b32 s3, 0xf000
739; SI-NEXT:    s_mov_b32 s2, -1
740; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
741; SI-NEXT:    s_waitcnt lgkmcnt(0)
742; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
743; SI-NEXT:    s_mov_b32 s0, s4
744; SI-NEXT:    s_mov_b32 s1, s5
745; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
746; SI-NEXT:    s_waitcnt lgkmcnt(0)
747; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
748; SI-NEXT:    s_lshr_b32 s6, s6, 16
749; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
750; SI-NEXT:    s_lshr_b32 s6, s7, 16
751; SI-NEXT:    v_cvt_f32_f16_e32 v3, s6
752; SI-NEXT:    s_lshr_b32 s6, s5, 16
753; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
754; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
755; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
756; SI-NEXT:    s_lshr_b32 s4, s4, 16
757; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
758; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
759; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
760; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
761; SI-NEXT:    v_min_f32_e32 v3, v3, v5
762; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
763; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
764; SI-NEXT:    v_min_f32_e32 v1, v1, v5
765; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
766; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
767; SI-NEXT:    v_min_f32_e32 v2, v2, v5
768; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
769; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
770; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
771; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
772; SI-NEXT:    v_min_f32_e32 v0, v0, v4
773; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
774; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
775; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
776; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
777; SI-NEXT:    v_or_b32_e32 v1, v1, v3
778; SI-NEXT:    v_or_b32_e32 v0, v0, v2
779; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
780; SI-NEXT:    s_endpgm
781;
782; VI-LABEL: minnum_v4f16:
783; VI:       ; %bb.0: ; %entry
784; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
785; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
786; VI-NEXT:    s_mov_b32 s7, 0xf000
787; VI-NEXT:    s_mov_b32 s6, -1
788; VI-NEXT:    s_waitcnt lgkmcnt(0)
789; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
790; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
791; VI-NEXT:    s_mov_b32 s4, s0
792; VI-NEXT:    s_mov_b32 s5, s1
793; VI-NEXT:    s_waitcnt lgkmcnt(0)
794; VI-NEXT:    v_max_f16_e64 v0, s9, s9
795; VI-NEXT:    v_max_f16_e64 v1, s3, s3
796; VI-NEXT:    s_lshr_b32 s0, s9, 16
797; VI-NEXT:    v_min_f16_e32 v0, v1, v0
798; VI-NEXT:    v_max_f16_e64 v1, s0, s0
799; VI-NEXT:    s_lshr_b32 s0, s3, 16
800; VI-NEXT:    v_max_f16_e64 v2, s0, s0
801; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
802; VI-NEXT:    v_or_b32_e32 v1, v0, v1
803; VI-NEXT:    v_max_f16_e64 v0, s8, s8
804; VI-NEXT:    v_max_f16_e64 v2, s2, s2
805; VI-NEXT:    s_lshr_b32 s0, s8, 16
806; VI-NEXT:    v_min_f16_e32 v0, v2, v0
807; VI-NEXT:    v_max_f16_e64 v2, s0, s0
808; VI-NEXT:    s_lshr_b32 s0, s2, 16
809; VI-NEXT:    v_max_f16_e64 v3, s0, s0
810; VI-NEXT:    v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
811; VI-NEXT:    v_or_b32_e32 v0, v0, v2
812; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
813; VI-NEXT:    s_endpgm
814;
815; GFX9-LABEL: minnum_v4f16:
816; GFX9:       ; %bb.0: ; %entry
817; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
818; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
819; GFX9-NEXT:    s_mov_b32 s3, 0xf000
820; GFX9-NEXT:    s_mov_b32 s2, -1
821; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
822; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
823; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[6:7], 0x0
824; GFX9-NEXT:    s_mov_b32 s0, s4
825; GFX9-NEXT:    s_mov_b32 s1, s5
826; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
827; GFX9-NEXT:    v_pk_max_f16 v0, s11, s11
828; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
829; GFX9-NEXT:    v_pk_max_f16 v2, s10, s10
830; GFX9-NEXT:    v_pk_min_f16 v1, v1, v0
831; GFX9-NEXT:    v_pk_max_f16 v0, s12, s12
832; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
833; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
834; GFX9-NEXT:    s_endpgm
835;
836; GFX10-LABEL: minnum_v4f16:
837; GFX10:       ; %bb.0: ; %entry
838; GFX10-NEXT:    s_clause 0x1
839; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
840; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
841; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
842; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
843; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
844; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
845; GFX10-NEXT:    s_mov_b32 s6, -1
846; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
847; GFX10-NEXT:    v_pk_max_f16 v0, s1, s1
848; GFX10-NEXT:    v_pk_max_f16 v1, s9, s9
849; GFX10-NEXT:    v_pk_max_f16 v2, s0, s0
850; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
851; GFX10-NEXT:    v_pk_min_f16 v1, v1, v0
852; GFX10-NEXT:    v_pk_min_f16 v0, v3, v2
853; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
854; GFX10-NEXT:    s_endpgm
855    <4 x half> addrspace(1)* %r,
856    <4 x half> addrspace(1)* %a,
857    <4 x half> addrspace(1)* %b) #0 {
858entry:
859  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
860  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
861  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
862  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
863  ret void
864}
865
; fmin_v4f16_imm_a: minnum on <4 x half> where the first operand is a constant.
; The kernel loads a <4 x half> from %b, calls @llvm.minnum.v4f16 with
; <8.0, 2.0, 3.0, 4.0> as the first argument and the loaded value as the second,
; and stores the result to %r.
;
; The SI/VI/GFX9/GFX10 check lines are autogenerated (see the NOTE at the top of
; this file); regenerate them with the update script instead of editing by hand.
;
; How the constant lanes show up in the checks:
;   SI:         f32 operands of v_min_f32 (2.0, 4.0, 0x40400000 = 3.0,
;               0x41000000 = 8.0).
;   VI:         f16 bit patterns (0x4800 = 8.0, 0x4000 = 2.0, 0x4200 = 3.0,
;               0x4400 = 4.0).
;   GFX9/GFX10: packed f16 pairs for v_pk_min_f16 (0x40004800 = 8.0 in the low
;               half and 2.0 in the high half; 0x44004200 = 3.0 low, 4.0 high).
866define amdgpu_kernel void @fmin_v4f16_imm_a(
867; SI-LABEL: fmin_v4f16_imm_a:
868; SI:       ; %bb.0: ; %entry
869; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
870; SI-NEXT:    s_waitcnt lgkmcnt(0)
871; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
872; SI-NEXT:    s_mov_b32 s3, 0xf000
873; SI-NEXT:    s_mov_b32 s2, -1
874; SI-NEXT:    s_waitcnt lgkmcnt(0)
875; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
876; SI-NEXT:    s_lshr_b32 s5, s5, 16
877; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
878; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
879; SI-NEXT:    s_lshr_b32 s4, s4, 16
880; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
881; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
882; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
883; SI-NEXT:    v_min_f32_e32 v2, 4.0, v2
884; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
885; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
886; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
887; SI-NEXT:    v_min_f32_e32 v3, 2.0, v3
888; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
889; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
890; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
891; SI-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
892; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
893; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
894; SI-NEXT:    v_or_b32_e32 v1, v1, v2
895; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
896; SI-NEXT:    v_or_b32_e32 v0, v0, v2
897; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
898; SI-NEXT:    s_endpgm
899;
900; VI-LABEL: fmin_v4f16_imm_a:
901; VI:       ; %bb.0: ; %entry
902; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
903; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
904; VI-NEXT:    s_mov_b32 s7, 0xf000
905; VI-NEXT:    s_mov_b32 s6, -1
906; VI-NEXT:    s_waitcnt lgkmcnt(0)
907; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
908; VI-NEXT:    s_mov_b32 s4, s0
909; VI-NEXT:    s_mov_b32 s5, s1
910; VI-NEXT:    s_waitcnt lgkmcnt(0)
911; VI-NEXT:    s_lshr_b32 s0, s3, 16
912; VI-NEXT:    v_max_f16_e64 v1, s3, s3
913; VI-NEXT:    v_max_f16_e64 v3, s0, s0
914; VI-NEXT:    v_max_f16_e64 v2, s2, s2
915; VI-NEXT:    v_min_f16_e32 v1, 0x4200, v1
916; VI-NEXT:    v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
917; VI-NEXT:    s_lshr_b32 s0, s2, 16
918; VI-NEXT:    v_or_b32_e32 v1, v1, v0
919; VI-NEXT:    v_min_f16_e32 v0, 0x4800, v2
920; VI-NEXT:    v_max_f16_e64 v2, s0, s0
921; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
922; VI-NEXT:    v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
923; VI-NEXT:    v_or_b32_e32 v0, v0, v2
924; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
925; VI-NEXT:    s_endpgm
926;
927; GFX9-LABEL: fmin_v4f16_imm_a:
928; GFX9:       ; %bb.0: ; %entry
929; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
930; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
931; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
932; GFX9-NEXT:    s_mov_b32 s7, 0xf000
933; GFX9-NEXT:    s_mov_b32 s6, -1
934; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
935; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
936; GFX9-NEXT:    s_mov_b32 s4, s0
937; GFX9-NEXT:    s_mov_b32 s5, s1
938; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
939; GFX9-NEXT:    v_pk_max_f16 v0, s3, s3
940; GFX9-NEXT:    v_pk_max_f16 v2, s2, s2
941; GFX9-NEXT:    v_pk_min_f16 v1, v0, s8
942; GFX9-NEXT:    v_pk_min_f16 v0, v2, s9
943; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
944; GFX9-NEXT:    s_endpgm
945;
946; GFX10-LABEL: fmin_v4f16_imm_a:
947; GFX10:       ; %bb.0: ; %entry
948; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
949; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
950; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
951; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
952; GFX10-NEXT:    v_pk_max_f16 v0, s3, s3
953; GFX10-NEXT:    v_pk_max_f16 v2, s2, s2
954; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
955; GFX10-NEXT:    s_mov_b32 s2, -1
956; GFX10-NEXT:    v_pk_min_f16 v1, 0x44004200, v0
957; GFX10-NEXT:    v_pk_min_f16 v0, 0x40004800, v2
958; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
959; GFX10-NEXT:    s_endpgm
960    <4 x half> addrspace(1)* %r,
961    <4 x half> addrspace(1)* %b) #0 {
962entry:
963  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
964  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
965  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
966  ret void
967}
968
; Attribute group #0, referenced by the kernel definitions above. It sets
; "denormal-fp-math-f32" to preserve-sign for both entries of the pair
; (per the LLVM LangRef: first entry = result flushing mode, second = handling
; of denormal inputs).
969attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
970