1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=SI %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
6
; Declarations of the llvm.maxnum overloads called by the kernels in this
; file: scalar half plus the 2-, 3- and 4-element half vector forms.
7declare half @llvm.maxnum.f16(half %a, half %b)
8declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
9declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b)
10declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)
11
; Scalar f16 maxnum: loads one half from %a and one from %b, calls
; llvm.maxnum.f16 on the pair and stores the result to %r.
; Expected code, per RUN line:
;   - tahiti (prefix SI): both inputs are converted to f32, multiplied by 1.0,
;     combined with v_max_f32 and the result converted back to f16.
;   - fiji, gfx900, gfx1010 (prefixes VI, GFX9, GFX10): each input is first
;     passed through v_max_f16 with itself, then one v_max_f16 joins the two.
12define amdgpu_kernel void @maxnum_f16(
13; SI-LABEL: maxnum_f16:
14; SI:       ; %bb.0: ; %entry
15; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
16; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
17; SI-NEXT:    s_mov_b32 s3, 0xf000
18; SI-NEXT:    s_mov_b32 s2, -1
19; SI-NEXT:    s_mov_b32 s14, s2
20; SI-NEXT:    s_waitcnt lgkmcnt(0)
21; SI-NEXT:    s_mov_b32 s12, s6
22; SI-NEXT:    s_mov_b32 s13, s7
23; SI-NEXT:    s_mov_b32 s15, s3
24; SI-NEXT:    s_mov_b32 s10, s2
25; SI-NEXT:    s_mov_b32 s11, s3
26; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
27; SI-NEXT:    s_waitcnt vmcnt(0)
28; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
29; SI-NEXT:    s_waitcnt vmcnt(0)
30; SI-NEXT:    s_mov_b32 s0, s4
31; SI-NEXT:    s_mov_b32 s1, s5
32; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
33; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
34; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
35; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
36; SI-NEXT:    v_max_f32_e32 v0, v0, v1
37; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
38; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
39; SI-NEXT:    s_endpgm
40;
41; VI-LABEL: maxnum_f16:
42; VI:       ; %bb.0: ; %entry
43; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
44; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
45; VI-NEXT:    s_mov_b32 s3, 0xf000
46; VI-NEXT:    s_mov_b32 s2, -1
47; VI-NEXT:    s_mov_b32 s14, s2
48; VI-NEXT:    s_waitcnt lgkmcnt(0)
49; VI-NEXT:    s_mov_b32 s12, s6
50; VI-NEXT:    s_mov_b32 s13, s7
51; VI-NEXT:    s_mov_b32 s15, s3
52; VI-NEXT:    s_mov_b32 s10, s2
53; VI-NEXT:    s_mov_b32 s11, s3
54; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
55; VI-NEXT:    s_waitcnt vmcnt(0)
56; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
57; VI-NEXT:    s_waitcnt vmcnt(0)
58; VI-NEXT:    s_mov_b32 s0, s4
59; VI-NEXT:    s_mov_b32 s1, s5
60; VI-NEXT:    v_max_f16_e32 v0, v0, v0
61; VI-NEXT:    v_max_f16_e32 v1, v1, v1
62; VI-NEXT:    v_max_f16_e32 v0, v0, v1
63; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
64; VI-NEXT:    s_endpgm
65;
66; GFX9-LABEL: maxnum_f16:
67; GFX9:       ; %bb.0: ; %entry
68; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
69; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
70; GFX9-NEXT:    s_mov_b32 s3, 0xf000
71; GFX9-NEXT:    s_mov_b32 s2, -1
72; GFX9-NEXT:    s_mov_b32 s14, s2
73; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
74; GFX9-NEXT:    s_mov_b32 s12, s6
75; GFX9-NEXT:    s_mov_b32 s13, s7
76; GFX9-NEXT:    s_mov_b32 s15, s3
77; GFX9-NEXT:    s_mov_b32 s10, s2
78; GFX9-NEXT:    s_mov_b32 s11, s3
79; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
80; GFX9-NEXT:    s_waitcnt vmcnt(0)
81; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
82; GFX9-NEXT:    s_waitcnt vmcnt(0)
83; GFX9-NEXT:    s_mov_b32 s0, s4
84; GFX9-NEXT:    s_mov_b32 s1, s5
85; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
86; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
87; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
88; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
89; GFX9-NEXT:    s_endpgm
90;
91; GFX10-LABEL: maxnum_f16:
92; GFX10:       ; %bb.0: ; %entry
93; GFX10-NEXT:    s_clause 0x1
94; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
95; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
96; GFX10-NEXT:    s_mov_b32 s2, -1
97; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
98; GFX10-NEXT:    s_mov_b32 s14, s2
99; GFX10-NEXT:    s_mov_b32 s15, s3
100; GFX10-NEXT:    s_mov_b32 s10, s2
101; GFX10-NEXT:    s_mov_b32 s11, s3
102; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
103; GFX10-NEXT:    s_mov_b32 s12, s6
104; GFX10-NEXT:    s_mov_b32 s13, s7
105; GFX10-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
106; GFX10-NEXT:    s_waitcnt vmcnt(0)
107; GFX10-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
108; GFX10-NEXT:    s_waitcnt vmcnt(0)
109; GFX10-NEXT:    s_mov_b32 s0, s4
110; GFX10-NEXT:    s_mov_b32 s1, s5
111; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
112; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
113; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
114; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
115; GFX10-NEXT:    s_endpgm
116    half addrspace(1)* %r,
117    half addrspace(1)* %a,
118    half addrspace(1)* %b) #0 {
119entry:
  ; Both loads are volatile; the checks above expect each one as a glc
  ; buffer_load_ushort followed by its own s_waitcnt vmcnt(0).
120  %a.val = load volatile half, half addrspace(1)* %a
121  %b.val = load volatile half, half addrspace(1)* %b
122  %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
123  store half %r.val, half addrspace(1)* %r
124  ret void
125}
126
; Scalar f16 maxnum with a constant first operand: computes
; llvm.maxnum.f16(3.0, load %b) and stores the result to %r.
; Expected code, per RUN line:
;   - tahiti (prefix SI): the loaded value is converted to f32, multiplied by
;     1.0 and combined with the literal 0x40400000 (3.0 as f32) by v_max_f32.
;   - fiji, gfx900, gfx1010 (prefixes VI, GFX9, GFX10): the loaded value goes
;     through v_max_f16 with itself, then v_max_f16 with the literal 0x4200
;     (3.0 as f16).
127define amdgpu_kernel void @maxnum_f16_imm_a(
128; SI-LABEL: maxnum_f16_imm_a:
129; SI:       ; %bb.0: ; %entry
130; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
131; SI-NEXT:    s_mov_b32 s7, 0xf000
132; SI-NEXT:    s_mov_b32 s6, -1
133; SI-NEXT:    s_mov_b32 s10, s6
134; SI-NEXT:    s_mov_b32 s11, s7
135; SI-NEXT:    s_waitcnt lgkmcnt(0)
136; SI-NEXT:    s_mov_b32 s8, s2
137; SI-NEXT:    s_mov_b32 s9, s3
138; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
139; SI-NEXT:    s_mov_b32 s4, s0
140; SI-NEXT:    s_mov_b32 s5, s1
141; SI-NEXT:    s_waitcnt vmcnt(0)
142; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
143; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
144; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
145; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
146; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
147; SI-NEXT:    s_endpgm
148;
149; VI-LABEL: maxnum_f16_imm_a:
150; VI:       ; %bb.0: ; %entry
151; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
152; VI-NEXT:    s_mov_b32 s7, 0xf000
153; VI-NEXT:    s_mov_b32 s6, -1
154; VI-NEXT:    s_mov_b32 s10, s6
155; VI-NEXT:    s_mov_b32 s11, s7
156; VI-NEXT:    s_waitcnt lgkmcnt(0)
157; VI-NEXT:    s_mov_b32 s8, s2
158; VI-NEXT:    s_mov_b32 s9, s3
159; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
160; VI-NEXT:    s_mov_b32 s4, s0
161; VI-NEXT:    s_mov_b32 s5, s1
162; VI-NEXT:    s_waitcnt vmcnt(0)
163; VI-NEXT:    v_max_f16_e32 v0, v0, v0
164; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
165; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
166; VI-NEXT:    s_endpgm
167;
168; GFX9-LABEL: maxnum_f16_imm_a:
169; GFX9:       ; %bb.0: ; %entry
170; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
171; GFX9-NEXT:    s_mov_b32 s7, 0xf000
172; GFX9-NEXT:    s_mov_b32 s6, -1
173; GFX9-NEXT:    s_mov_b32 s10, s6
174; GFX9-NEXT:    s_mov_b32 s11, s7
175; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
176; GFX9-NEXT:    s_mov_b32 s8, s2
177; GFX9-NEXT:    s_mov_b32 s9, s3
178; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
179; GFX9-NEXT:    s_mov_b32 s4, s0
180; GFX9-NEXT:    s_mov_b32 s5, s1
181; GFX9-NEXT:    s_waitcnt vmcnt(0)
182; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
183; GFX9-NEXT:    v_max_f16_e32 v0, 0x4200, v0
184; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
185; GFX9-NEXT:    s_endpgm
186;
187; GFX10-LABEL: maxnum_f16_imm_a:
188; GFX10:       ; %bb.0: ; %entry
189; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
190; GFX10-NEXT:    s_mov_b32 s6, -1
191; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
192; GFX10-NEXT:    s_mov_b32 s10, s6
193; GFX10-NEXT:    s_mov_b32 s11, s7
194; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
195; GFX10-NEXT:    s_mov_b32 s8, s2
196; GFX10-NEXT:    s_mov_b32 s9, s3
197; GFX10-NEXT:    s_mov_b32 s4, s0
198; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
199; GFX10-NEXT:    s_mov_b32 s5, s1
200; GFX10-NEXT:    s_waitcnt vmcnt(0)
201; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
202; GFX10-NEXT:    v_max_f16_e32 v0, 0x4200, v0
203; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
204; GFX10-NEXT:    s_endpgm
205    half addrspace(1)* %r,
206    half addrspace(1)* %b) #0 {
207entry:
208  %b.val = load half, half addrspace(1)* %b
  ; The constant 3.0 is the first maxnum operand here.
209  %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val)
210  store half %r.val, half addrspace(1)* %r
211  ret void
212}
213
; Scalar f16 maxnum with a constant second operand: computes
; llvm.maxnum.f16(load %a, 4.0) and stores the result to %r.
; For every RUN line the checks expect 4.0 to appear directly as an operand
; of the max instruction (v_max_f32 after an f16->f32 conversion on tahiti,
; v_max_f16 on fiji, gfx900 and gfx1010).
214define amdgpu_kernel void @maxnum_f16_imm_b(
215; SI-LABEL: maxnum_f16_imm_b:
216; SI:       ; %bb.0: ; %entry
217; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
218; SI-NEXT:    s_mov_b32 s7, 0xf000
219; SI-NEXT:    s_mov_b32 s6, -1
220; SI-NEXT:    s_mov_b32 s10, s6
221; SI-NEXT:    s_mov_b32 s11, s7
222; SI-NEXT:    s_waitcnt lgkmcnt(0)
223; SI-NEXT:    s_mov_b32 s8, s2
224; SI-NEXT:    s_mov_b32 s9, s3
225; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
226; SI-NEXT:    s_mov_b32 s4, s0
227; SI-NEXT:    s_mov_b32 s5, s1
228; SI-NEXT:    s_waitcnt vmcnt(0)
229; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
230; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
231; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
232; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
233; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
234; SI-NEXT:    s_endpgm
235;
236; VI-LABEL: maxnum_f16_imm_b:
237; VI:       ; %bb.0: ; %entry
238; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
239; VI-NEXT:    s_mov_b32 s7, 0xf000
240; VI-NEXT:    s_mov_b32 s6, -1
241; VI-NEXT:    s_mov_b32 s10, s6
242; VI-NEXT:    s_mov_b32 s11, s7
243; VI-NEXT:    s_waitcnt lgkmcnt(0)
244; VI-NEXT:    s_mov_b32 s8, s2
245; VI-NEXT:    s_mov_b32 s9, s3
246; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
247; VI-NEXT:    s_mov_b32 s4, s0
248; VI-NEXT:    s_mov_b32 s5, s1
249; VI-NEXT:    s_waitcnt vmcnt(0)
250; VI-NEXT:    v_max_f16_e32 v0, v0, v0
251; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
252; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
253; VI-NEXT:    s_endpgm
254;
255; GFX9-LABEL: maxnum_f16_imm_b:
256; GFX9:       ; %bb.0: ; %entry
257; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
258; GFX9-NEXT:    s_mov_b32 s7, 0xf000
259; GFX9-NEXT:    s_mov_b32 s6, -1
260; GFX9-NEXT:    s_mov_b32 s10, s6
261; GFX9-NEXT:    s_mov_b32 s11, s7
262; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
263; GFX9-NEXT:    s_mov_b32 s8, s2
264; GFX9-NEXT:    s_mov_b32 s9, s3
265; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
266; GFX9-NEXT:    s_mov_b32 s4, s0
267; GFX9-NEXT:    s_mov_b32 s5, s1
268; GFX9-NEXT:    s_waitcnt vmcnt(0)
269; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
270; GFX9-NEXT:    v_max_f16_e32 v0, 4.0, v0
271; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
272; GFX9-NEXT:    s_endpgm
273;
274; GFX10-LABEL: maxnum_f16_imm_b:
275; GFX10:       ; %bb.0: ; %entry
276; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
277; GFX10-NEXT:    s_mov_b32 s6, -1
278; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
279; GFX10-NEXT:    s_mov_b32 s10, s6
280; GFX10-NEXT:    s_mov_b32 s11, s7
281; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
282; GFX10-NEXT:    s_mov_b32 s8, s2
283; GFX10-NEXT:    s_mov_b32 s9, s3
284; GFX10-NEXT:    s_mov_b32 s4, s0
285; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
286; GFX10-NEXT:    s_mov_b32 s5, s1
287; GFX10-NEXT:    s_waitcnt vmcnt(0)
288; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
289; GFX10-NEXT:    v_max_f16_e32 v0, 4.0, v0
290; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
291; GFX10-NEXT:    s_endpgm
292    half addrspace(1)* %r,
293    half addrspace(1)* %a) #0 {
294entry:
295  %a.val = load half, half addrspace(1)* %a
  ; The constant 4.0 is the second maxnum operand here.
296  %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0)
297  store half %r.val, half addrspace(1)* %r
298  ret void
299}
300
; <2 x half> maxnum: loads a vector from %a and one from %b, calls
; llvm.maxnum.v2f16 on the pair and stores the result to %r.
; Expected code, per RUN line:
;   - tahiti (prefix SI): each element is handled separately in f32
;     (v_cvt_f32_f16, v_mul_f32 by 1.0, v_max_f32, v_cvt_f16_f32); the two
;     results are packed with a 16-bit left shift and v_or_b32.
;   - fiji (prefix VI): per-element v_max_f16; the high element is produced by
;     v_max_f16_sdwa writing WORD_1 and merged with v_or_b32.
;   - gfx900, gfx1010 (prefixes GFX9, GFX10): packed v_pk_max_f16 on the whole
;     dword.
301define amdgpu_kernel void @maxnum_v2f16(
302; SI-LABEL: maxnum_v2f16:
303; SI:       ; %bb.0: ; %entry
304; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
305; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
306; SI-NEXT:    s_waitcnt lgkmcnt(0)
307; SI-NEXT:    s_load_dword s2, s[6:7], 0x0
308; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
309; SI-NEXT:    s_mov_b32 s7, 0xf000
310; SI-NEXT:    s_mov_b32 s6, -1
311; SI-NEXT:    s_waitcnt lgkmcnt(0)
312; SI-NEXT:    s_lshr_b32 s1, s2, 16
313; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
314; SI-NEXT:    s_lshr_b32 s0, s0, 16
315; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
316; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
317; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
318; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
319; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
320; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
321; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
322; SI-NEXT:    v_max_f32_e32 v2, v3, v2
323; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
324; SI-NEXT:    v_max_f32_e32 v0, v0, v1
325; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
326; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
327; SI-NEXT:    v_or_b32_e32 v0, v0, v1
328; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
329; SI-NEXT:    s_endpgm
330;
331; VI-LABEL: maxnum_v2f16:
332; VI:       ; %bb.0: ; %entry
333; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
334; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
335; VI-NEXT:    s_mov_b32 s7, 0xf000
336; VI-NEXT:    s_mov_b32 s6, -1
337; VI-NEXT:    s_waitcnt lgkmcnt(0)
338; VI-NEXT:    s_load_dword s8, s[4:5], 0x0
339; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
340; VI-NEXT:    s_mov_b32 s4, s0
341; VI-NEXT:    s_mov_b32 s5, s1
342; VI-NEXT:    s_waitcnt lgkmcnt(0)
343; VI-NEXT:    v_max_f16_e64 v0, s8, s8
344; VI-NEXT:    v_max_f16_e64 v1, s2, s2
345; VI-NEXT:    s_lshr_b32 s0, s8, 16
346; VI-NEXT:    v_max_f16_e32 v0, v1, v0
347; VI-NEXT:    v_max_f16_e64 v1, s0, s0
348; VI-NEXT:    s_lshr_b32 s0, s2, 16
349; VI-NEXT:    v_max_f16_e64 v2, s0, s0
350; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
351; VI-NEXT:    v_or_b32_e32 v0, v0, v1
352; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
353; VI-NEXT:    s_endpgm
354;
355; GFX9-LABEL: maxnum_v2f16:
356; GFX9:       ; %bb.0: ; %entry
357; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
358; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
359; GFX9-NEXT:    s_mov_b32 s3, 0xf000
360; GFX9-NEXT:    s_mov_b32 s2, -1
361; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
362; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
363; GFX9-NEXT:    s_load_dword s11, s[6:7], 0x0
364; GFX9-NEXT:    s_mov_b32 s0, s4
365; GFX9-NEXT:    s_mov_b32 s1, s5
366; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
367; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
368; GFX9-NEXT:    v_pk_max_f16 v1, s11, s11
369; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
370; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
371; GFX9-NEXT:    s_endpgm
372;
373; GFX10-LABEL: maxnum_v2f16:
374; GFX10:       ; %bb.0: ; %entry
375; GFX10-NEXT:    s_clause 0x1
376; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
377; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
378; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
379; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
380; GFX10-NEXT:    s_load_dword s1, s[6:7], 0x0
381; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
382; GFX10-NEXT:    s_mov_b32 s6, -1
383; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
384; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
385; GFX10-NEXT:    v_pk_max_f16 v1, s1, s1
386; GFX10-NEXT:    v_pk_max_f16 v0, v1, v0
387; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
388; GFX10-NEXT:    s_endpgm
389    <2 x half> addrspace(1)* %r,
390    <2 x half> addrspace(1)* %a,
391    <2 x half> addrspace(1)* %b) #0 {
392entry:
393  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
394  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
395  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
396  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
397  ret void
398}
399
; <2 x half> maxnum with a constant first operand: computes
; llvm.maxnum.v2f16(<3.0, 4.0>, load %b) and stores the result to %r.
; Expected constant handling, per RUN line:
;   - tahiti (prefix SI): the low element is maxed with the literal 0x40400000
;     (3.0 as f32) and the high element with the operand 4.0, both in f32.
;   - fiji (prefix VI): the low element uses the literal 0x4200 (3.0 as f16);
;     0x4400 (4.0 as f16) is moved into v2 for the v_max_f16_sdwa on the high
;     element.
;   - gfx900 (prefix GFX9): the packed constant 0x44004200 is moved into s4
;     and used as a v_pk_max_f16 operand.
;   - gfx1010 (prefix GFX10): 0x44004200 is a literal operand of v_pk_max_f16.
400define amdgpu_kernel void @maxnum_v2f16_imm_a(
401; SI-LABEL: maxnum_v2f16_imm_a:
402; SI:       ; %bb.0: ; %entry
403; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
404; SI-NEXT:    s_waitcnt lgkmcnt(0)
405; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
406; SI-NEXT:    s_mov_b32 s3, 0xf000
407; SI-NEXT:    s_waitcnt lgkmcnt(0)
408; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
409; SI-NEXT:    s_lshr_b32 s2, s2, 16
410; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
411; SI-NEXT:    s_mov_b32 s2, -1
412; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
413; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
414; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
415; SI-NEXT:    v_max_f32_e32 v1, 4.0, v1
416; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
417; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
418; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
419; SI-NEXT:    v_or_b32_e32 v0, v0, v1
420; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
421; SI-NEXT:    s_endpgm
422;
423; VI-LABEL: maxnum_v2f16_imm_a:
424; VI:       ; %bb.0: ; %entry
425; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
426; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
427; VI-NEXT:    s_waitcnt lgkmcnt(0)
428; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
429; VI-NEXT:    s_mov_b32 s3, 0xf000
430; VI-NEXT:    s_mov_b32 s2, -1
431; VI-NEXT:    s_waitcnt lgkmcnt(0)
432; VI-NEXT:    v_max_f16_e64 v0, s4, s4
433; VI-NEXT:    s_lshr_b32 s4, s4, 16
434; VI-NEXT:    v_max_f16_e64 v1, s4, s4
435; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
436; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
437; VI-NEXT:    v_or_b32_e32 v0, v0, v1
438; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
439; VI-NEXT:    s_endpgm
440;
441; GFX9-LABEL: maxnum_v2f16_imm_a:
442; GFX9:       ; %bb.0: ; %entry
443; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
444; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
445; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
446; GFX9-NEXT:    s_mov_b32 s3, 0xf000
447; GFX9-NEXT:    s_mov_b32 s2, -1
448; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
450; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
451; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
452; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
453; GFX9-NEXT:    s_endpgm
454;
455; GFX10-LABEL: maxnum_v2f16_imm_a:
456; GFX10:       ; %bb.0: ; %entry
457; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
458; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
459; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
460; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
461; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
462; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
463; GFX10-NEXT:    s_mov_b32 s2, -1
464; GFX10-NEXT:    v_pk_max_f16 v0, 0x44004200, v0
465; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
466; GFX10-NEXT:    s_endpgm
467    <2 x half> addrspace(1)* %r,
468    <2 x half> addrspace(1)* %b) #0 {
469entry:
470  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  ; The constant vector <3.0, 4.0> is the first maxnum operand here.
471  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
472  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
473  ret void
474}
475
; <2 x half> maxnum with a constant second operand: computes
; llvm.maxnum.v2f16(load %a, <4.0, 3.0>) and stores the result to %r.
; The element order of the constant is swapped relative to the <3.0, 4.0>
; constant used by @maxnum_v2f16_imm_a, so the expected constants are swapped
; too:
;   - tahiti (prefix SI): low element maxed with 4.0, high element with the
;     literal 0x40400000 (3.0 as f32).
;   - fiji (prefix VI): low element maxed with 4.0; 0x4200 (3.0 as f16) is
;     moved into v2 for the v_max_f16_sdwa on the high element.
;   - gfx900, gfx1010 (prefixes GFX9, GFX10): packed constant 0x42004400, via
;     s4 on gfx900 and as a literal operand on gfx1010.
476define amdgpu_kernel void @maxnum_v2f16_imm_b(
477; SI-LABEL: maxnum_v2f16_imm_b:
478; SI:       ; %bb.0: ; %entry
479; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
480; SI-NEXT:    s_waitcnt lgkmcnt(0)
481; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
482; SI-NEXT:    s_mov_b32 s3, 0xf000
483; SI-NEXT:    s_waitcnt lgkmcnt(0)
484; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
485; SI-NEXT:    s_lshr_b32 s2, s2, 16
486; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
487; SI-NEXT:    s_mov_b32 s2, -1
488; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
489; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
490; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
491; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
492; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
493; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
494; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
495; SI-NEXT:    v_or_b32_e32 v0, v0, v1
496; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
497; SI-NEXT:    s_endpgm
498;
499; VI-LABEL: maxnum_v2f16_imm_b:
500; VI:       ; %bb.0: ; %entry
501; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
502; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
503; VI-NEXT:    s_waitcnt lgkmcnt(0)
504; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
505; VI-NEXT:    s_mov_b32 s3, 0xf000
506; VI-NEXT:    s_mov_b32 s2, -1
507; VI-NEXT:    s_waitcnt lgkmcnt(0)
508; VI-NEXT:    v_max_f16_e64 v0, s4, s4
509; VI-NEXT:    s_lshr_b32 s4, s4, 16
510; VI-NEXT:    v_max_f16_e64 v1, s4, s4
511; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
512; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
513; VI-NEXT:    v_or_b32_e32 v0, v0, v1
514; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
515; VI-NEXT:    s_endpgm
516;
517; GFX9-LABEL: maxnum_v2f16_imm_b:
518; GFX9:       ; %bb.0: ; %entry
519; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
520; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
521; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
522; GFX9-NEXT:    s_mov_b32 s3, 0xf000
523; GFX9-NEXT:    s_mov_b32 s2, -1
524; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
525; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
526; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
527; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
528; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
529; GFX9-NEXT:    s_endpgm
530;
531; GFX10-LABEL: maxnum_v2f16_imm_b:
532; GFX10:       ; %bb.0: ; %entry
533; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
534; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
535; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
536; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
537; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
538; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
539; GFX10-NEXT:    s_mov_b32 s2, -1
540; GFX10-NEXT:    v_pk_max_f16 v0, 0x42004400, v0
541; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
542; GFX10-NEXT:    s_endpgm
543    <2 x half> addrspace(1)* %r,
544    <2 x half> addrspace(1)* %a) #0 {
545entry:
546  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  ; The constant vector <4.0, 3.0> is the second maxnum operand here.
547  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
548  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
549  ret void
550}
551
552; FIXME: Scalarize with undef half
; <3 x half> maxnum: loads a vector from %a and one from %b, calls
; llvm.maxnum.v3f16 on the pair and stores the result to %r.
; For every RUN line the checks expect each input to be fetched with one
; s_load_dwordx2 and the result to be written as a dword store (first two
; elements) plus a short store at offset 4 (third element).
; Expected max sequence, per RUN line:
;   - tahiti (prefix SI): three independent f32 max chains, with the first two
;     results packed by shift + v_or_b32.
;   - fiji (prefix VI): v_max_f16 per element, using v_max_f16_sdwa for the
;     element in the high half of the first dword.
;   - gfx900, gfx1010 (prefixes GFX9, GFX10): v_pk_max_f16 on each of the two
;     loaded dwords.
553define amdgpu_kernel void @maxnum_v3f16(
554; SI-LABEL: maxnum_v3f16:
555; SI:       ; %bb.0: ; %entry
556; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
557; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
558; SI-NEXT:    s_waitcnt lgkmcnt(0)
559; SI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
560; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
561; SI-NEXT:    s_mov_b32 s7, 0xf000
562; SI-NEXT:    s_mov_b32 s6, -1
563; SI-NEXT:    s_waitcnt lgkmcnt(0)
564; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
565; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
566; SI-NEXT:    s_lshr_b32 s2, s2, 16
567; SI-NEXT:    s_lshr_b32 s3, s0, 16
568; SI-NEXT:    v_cvt_f32_f16_e32 v2, s3
569; SI-NEXT:    v_cvt_f32_f16_e32 v3, s2
570; SI-NEXT:    v_cvt_f32_f16_e32 v5, s0
571; SI-NEXT:    v_cvt_f32_f16_e32 v4, s1
572; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
573; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
574; SI-NEXT:    v_max_f32_e32 v2, v3, v2
575; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
576; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
577; SI-NEXT:    v_max_f32_e32 v1, v1, v3
578; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
579; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
580; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
581; SI-NEXT:    v_max_f32_e32 v0, v0, v3
582; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
583; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
584; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
585; SI-NEXT:    v_or_b32_e32 v1, v1, v2
586; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
587; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
588; SI-NEXT:    s_endpgm
589;
590; VI-LABEL: maxnum_v3f16:
591; VI:       ; %bb.0: ; %entry
592; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
593; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
594; VI-NEXT:    s_mov_b32 s7, 0xf000
595; VI-NEXT:    s_mov_b32 s6, -1
596; VI-NEXT:    s_waitcnt lgkmcnt(0)
597; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
598; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
599; VI-NEXT:    s_mov_b32 s4, s0
600; VI-NEXT:    s_mov_b32 s5, s1
601; VI-NEXT:    s_waitcnt lgkmcnt(0)
602; VI-NEXT:    v_max_f16_e64 v0, s8, s8
603; VI-NEXT:    v_max_f16_e64 v1, s2, s2
604; VI-NEXT:    s_lshr_b32 s0, s8, 16
605; VI-NEXT:    v_max_f16_e32 v0, v1, v0
606; VI-NEXT:    v_max_f16_e64 v1, s0, s0
607; VI-NEXT:    s_lshr_b32 s0, s2, 16
608; VI-NEXT:    v_max_f16_e64 v2, s0, s0
609; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
610; VI-NEXT:    v_or_b32_e32 v0, v0, v1
611; VI-NEXT:    v_max_f16_e64 v1, s9, s9
612; VI-NEXT:    v_max_f16_e64 v2, s3, s3
613; VI-NEXT:    v_max_f16_e32 v1, v2, v1
614; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
615; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
616; VI-NEXT:    s_endpgm
617;
618; GFX9-LABEL: maxnum_v3f16:
619; GFX9:       ; %bb.0: ; %entry
620; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
621; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
622; GFX9-NEXT:    s_mov_b32 s3, 0xf000
623; GFX9-NEXT:    s_mov_b32 s2, -1
624; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
625; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
626; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[6:7], 0x0
627; GFX9-NEXT:    s_mov_b32 s0, s4
628; GFX9-NEXT:    s_mov_b32 s1, s5
629; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
630; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
631; GFX9-NEXT:    v_pk_max_f16 v1, s12, s12
632; GFX9-NEXT:    v_pk_max_f16 v2, s11, s11
633; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
634; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
635; GFX9-NEXT:    v_pk_max_f16 v1, v1, v2
636; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
637; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
638; GFX9-NEXT:    s_endpgm
639;
640; GFX10-LABEL: maxnum_v3f16:
641; GFX10:       ; %bb.0: ; %entry
642; GFX10-NEXT:    s_clause 0x1
643; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
644; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
645; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
646; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
647; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
648; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
649; GFX10-NEXT:    s_mov_b32 s6, -1
650; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
651; GFX10-NEXT:    v_pk_max_f16 v1, s1, s1
652; GFX10-NEXT:    v_pk_max_f16 v2, s9, s9
653; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
654; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
655; GFX10-NEXT:    v_pk_max_f16 v1, v2, v1
656; GFX10-NEXT:    v_pk_max_f16 v0, v3, v0
657; GFX10-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
658; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
659; GFX10-NEXT:    s_endpgm
660    <3 x half> addrspace(1)* %r,
661    <3 x half> addrspace(1)* %a,
662    <3 x half> addrspace(1)* %b) #0 {
663entry:
664  %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
665  %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
666  %r.val = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
667  store <3 x half> %r.val, <3 x half> addrspace(1)* %r
668  ret void
669}
670
; <4 x half> maxnum: loads a vector from %a and one from %b, calls
; llvm.maxnum.v4f16 on the pair and stores the result to %r.
; For every RUN line the checks expect each input to be fetched with one
; s_load_dwordx2 and the result to be written with one buffer_store_dwordx2.
; Expected max sequence, per RUN line:
;   - tahiti (prefix SI): four independent f32 max chains (convert, multiply
;     by 1.0, v_max_f32, convert back); each pair of results is packed by
;     shift + v_or_b32.
;   - fiji (prefix VI): v_max_f16 per element, using v_max_f16_sdwa for the
;     high half of each dword, merged with v_or_b32.
;   - gfx900, gfx1010 (prefixes GFX9, GFX10): v_pk_max_f16 on each of the two
;     loaded dwords.
671define amdgpu_kernel void @maxnum_v4f16(
672; SI-LABEL: maxnum_v4f16:
673; SI:       ; %bb.0: ; %entry
674; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
675; SI-NEXT:    s_mov_b32 s3, 0xf000
676; SI-NEXT:    s_mov_b32 s2, -1
677; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
678; SI-NEXT:    s_waitcnt lgkmcnt(0)
679; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
680; SI-NEXT:    s_mov_b32 s0, s4
681; SI-NEXT:    s_mov_b32 s1, s5
682; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
683; SI-NEXT:    s_waitcnt lgkmcnt(0)
684; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
685; SI-NEXT:    s_lshr_b32 s6, s6, 16
686; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
687; SI-NEXT:    s_lshr_b32 s6, s7, 16
688; SI-NEXT:    v_cvt_f32_f16_e32 v3, s6
689; SI-NEXT:    s_lshr_b32 s6, s5, 16
690; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
691; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
692; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
693; SI-NEXT:    s_lshr_b32 s4, s4, 16
694; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
695; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
696; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
697; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
698; SI-NEXT:    v_max_f32_e32 v3, v3, v5
699; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
700; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
701; SI-NEXT:    v_max_f32_e32 v1, v1, v5
702; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
703; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
704; SI-NEXT:    v_max_f32_e32 v2, v2, v5
705; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
706; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
707; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
708; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
709; SI-NEXT:    v_max_f32_e32 v0, v0, v4
710; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
711; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
712; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
713; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
714; SI-NEXT:    v_or_b32_e32 v1, v1, v3
715; SI-NEXT:    v_or_b32_e32 v0, v0, v2
716; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
717; SI-NEXT:    s_endpgm
718;
719; VI-LABEL: maxnum_v4f16:
720; VI:       ; %bb.0: ; %entry
721; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
722; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
723; VI-NEXT:    s_mov_b32 s7, 0xf000
724; VI-NEXT:    s_mov_b32 s6, -1
725; VI-NEXT:    s_waitcnt lgkmcnt(0)
726; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
727; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
728; VI-NEXT:    s_mov_b32 s4, s0
729; VI-NEXT:    s_mov_b32 s5, s1
730; VI-NEXT:    s_waitcnt lgkmcnt(0)
731; VI-NEXT:    v_max_f16_e64 v0, s9, s9
732; VI-NEXT:    v_max_f16_e64 v1, s3, s3
733; VI-NEXT:    s_lshr_b32 s0, s9, 16
734; VI-NEXT:    v_max_f16_e32 v0, v1, v0
735; VI-NEXT:    v_max_f16_e64 v1, s0, s0
736; VI-NEXT:    s_lshr_b32 s0, s3, 16
737; VI-NEXT:    v_max_f16_e64 v2, s0, s0
738; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
739; VI-NEXT:    v_or_b32_e32 v1, v0, v1
740; VI-NEXT:    v_max_f16_e64 v0, s8, s8
741; VI-NEXT:    v_max_f16_e64 v2, s2, s2
742; VI-NEXT:    s_lshr_b32 s0, s8, 16
743; VI-NEXT:    v_max_f16_e32 v0, v2, v0
744; VI-NEXT:    v_max_f16_e64 v2, s0, s0
745; VI-NEXT:    s_lshr_b32 s0, s2, 16
746; VI-NEXT:    v_max_f16_e64 v3, s0, s0
747; VI-NEXT:    v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
748; VI-NEXT:    v_or_b32_e32 v0, v0, v2
749; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
750; VI-NEXT:    s_endpgm
751;
752; GFX9-LABEL: maxnum_v4f16:
753; GFX9:       ; %bb.0: ; %entry
754; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
755; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
756; GFX9-NEXT:    s_mov_b32 s3, 0xf000
757; GFX9-NEXT:    s_mov_b32 s2, -1
758; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
759; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
760; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[6:7], 0x0
761; GFX9-NEXT:    s_mov_b32 s0, s4
762; GFX9-NEXT:    s_mov_b32 s1, s5
763; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
764; GFX9-NEXT:    v_pk_max_f16 v0, s11, s11
765; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
766; GFX9-NEXT:    v_pk_max_f16 v2, s10, s10
767; GFX9-NEXT:    v_pk_max_f16 v1, v1, v0
768; GFX9-NEXT:    v_pk_max_f16 v0, s12, s12
769; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
770; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
771; GFX9-NEXT:    s_endpgm
772;
773; GFX10-LABEL: maxnum_v4f16:
774; GFX10:       ; %bb.0: ; %entry
775; GFX10-NEXT:    s_clause 0x1
776; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
777; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
778; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
779; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
780; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
781; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
782; GFX10-NEXT:    s_mov_b32 s6, -1
783; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
784; GFX10-NEXT:    v_pk_max_f16 v0, s1, s1
785; GFX10-NEXT:    v_pk_max_f16 v1, s9, s9
786; GFX10-NEXT:    v_pk_max_f16 v2, s0, s0
787; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
788; GFX10-NEXT:    v_pk_max_f16 v1, v1, v0
789; GFX10-NEXT:    v_pk_max_f16 v0, v3, v2
790; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
791; GFX10-NEXT:    s_endpgm
792    <4 x half> addrspace(1)* %r,
793    <4 x half> addrspace(1)* %a,
794    <4 x half> addrspace(1)* %b) #0 {
795entry:
796  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
797  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
798  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
799  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
800  ret void
801}
802
; Kernel under test: loads a <4 x half> from %b, takes llvm.maxnum.v4f16 of the
; constant vector <8.0, 2.0, 3.0, 4.0> (first operand) and the loaded value, and
; stores the result to %r.
;
; The expected-output lines inside the signature are autogenerated (see the NOTE
; on the first line of the file); regenerate them with that script instead of
; editing them by hand.
;
; Immediates that appear in the expected output are the bit patterns of the IR
; constants: as half, 0x4800 / 0x4000 / 0x4200 / 0x4400 are 8.0 / 2.0 / 3.0 / 4.0;
; as float, 0x41000000 / 0x40400000 are 8.0 / 3.0. The packed 32-bit immediates
; 0x40004800 and 0x44004200 hold the half pairs (8.0, 2.0) and (3.0, 4.0), with
; the lower-indexed element in the low 16 bits.
803define amdgpu_kernel void @fmax_v4f16_imm_a(
804; SI-LABEL: fmax_v4f16_imm_a:
805; SI:       ; %bb.0: ; %entry
806; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
807; SI-NEXT:    s_waitcnt lgkmcnt(0)
808; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
809; SI-NEXT:    s_mov_b32 s3, 0xf000
810; SI-NEXT:    s_mov_b32 s2, -1
811; SI-NEXT:    s_waitcnt lgkmcnt(0)
812; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
813; SI-NEXT:    s_lshr_b32 s5, s5, 16
814; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
815; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
816; SI-NEXT:    s_lshr_b32 s4, s4, 16
817; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
818; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
819; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
820; SI-NEXT:    v_max_f32_e32 v2, 4.0, v2
821; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
822; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
823; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
824; SI-NEXT:    v_max_f32_e32 v3, 2.0, v3
825; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
826; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
827; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
828; SI-NEXT:    v_max_f32_e32 v0, 0x41000000, v0
829; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
830; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
831; SI-NEXT:    v_or_b32_e32 v1, v1, v2
832; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
833; SI-NEXT:    v_or_b32_e32 v0, v0, v2
834; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
835; SI-NEXT:    s_endpgm
836;
837; VI-LABEL: fmax_v4f16_imm_a:
838; VI:       ; %bb.0: ; %entry
839; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
840; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
841; VI-NEXT:    s_mov_b32 s7, 0xf000
842; VI-NEXT:    s_mov_b32 s6, -1
843; VI-NEXT:    s_waitcnt lgkmcnt(0)
844; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
845; VI-NEXT:    s_mov_b32 s4, s0
846; VI-NEXT:    s_mov_b32 s5, s1
847; VI-NEXT:    s_waitcnt lgkmcnt(0)
848; VI-NEXT:    s_lshr_b32 s0, s3, 16
849; VI-NEXT:    v_max_f16_e64 v1, s3, s3
850; VI-NEXT:    v_max_f16_e64 v3, s0, s0
851; VI-NEXT:    v_max_f16_e64 v2, s2, s2
852; VI-NEXT:    v_max_f16_e32 v1, 0x4200, v1
853; VI-NEXT:    v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
854; VI-NEXT:    s_lshr_b32 s0, s2, 16
855; VI-NEXT:    v_or_b32_e32 v1, v1, v0
856; VI-NEXT:    v_max_f16_e32 v0, 0x4800, v2
857; VI-NEXT:    v_max_f16_e64 v2, s0, s0
858; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
859; VI-NEXT:    v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
860; VI-NEXT:    v_or_b32_e32 v0, v0, v2
861; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
862; VI-NEXT:    s_endpgm
863;
864; GFX9-LABEL: fmax_v4f16_imm_a:
865; GFX9:       ; %bb.0: ; %entry
866; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
867; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
868; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
869; GFX9-NEXT:    s_mov_b32 s7, 0xf000
870; GFX9-NEXT:    s_mov_b32 s6, -1
871; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
872; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
873; GFX9-NEXT:    s_mov_b32 s4, s0
874; GFX9-NEXT:    s_mov_b32 s5, s1
875; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
876; GFX9-NEXT:    v_pk_max_f16 v0, s3, s3
877; GFX9-NEXT:    v_pk_max_f16 v2, s2, s2
878; GFX9-NEXT:    v_pk_max_f16 v1, v0, s8
879; GFX9-NEXT:    v_pk_max_f16 v0, v2, s9
880; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
881; GFX9-NEXT:    s_endpgm
882;
883; GFX10-LABEL: fmax_v4f16_imm_a:
884; GFX10:       ; %bb.0: ; %entry
885; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
886; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
887; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
888; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
889; GFX10-NEXT:    v_pk_max_f16 v0, s3, s3
890; GFX10-NEXT:    v_pk_max_f16 v2, s2, s2
891; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
892; GFX10-NEXT:    s_mov_b32 s2, -1
893; GFX10-NEXT:    v_pk_max_f16 v1, 0x44004200, v0
894; GFX10-NEXT:    v_pk_max_f16 v0, 0x40004800, v2
895; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
896; GFX10-NEXT:    s_endpgm
897    <4 x half> addrspace(1)* %r,
898    <4 x half> addrspace(1)* %b) #0 {
899entry:
; Only %b is loaded; the other maxnum operand is the constant vector.
900  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
901  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
902  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
903  ret void
904}
905
; Attribute group #0, attached to the kernel definitions that end their
; signature with "#0": sets the "denormal-fp-math-f32" string attribute to
; "preserve-sign,preserve-sign".
906attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
907