1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=SI %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
7
; Declarations of the llvm.maxnum intrinsic overloads for half and for
; 2-, 3- and 4-element half vectors, used by the test kernels that follow.
8declare half @llvm.maxnum.f16(half %a, half %b)
9declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
10declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b)
11declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)
12
; Kernel test: scalar half maxnum of two values loaded from %a and %b, with
; the result stored to %r. Both loads are volatile.
; Expected code per the assertions below: the tahiti run extends both
; operands to f32, multiplies each by 1.0, takes v_max_f32 and converts the
; result back to f16; the fiji, gfx900, gfx1010 and gfx1100 runs apply
; v_max_f16 x, x to each operand and then a final v_max_f16.
; NOTE(review): the multiply-by-1.0 and self-max steps look like operand
; canonicalization ahead of the real max - confirm against the lowering code.
13define amdgpu_kernel void @maxnum_f16(
14; SI-LABEL: maxnum_f16:
15; SI:       ; %bb.0: ; %entry
16; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
17; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
18; SI-NEXT:    s_mov_b32 s3, 0xf000
19; SI-NEXT:    s_mov_b32 s2, -1
20; SI-NEXT:    s_mov_b32 s14, s2
21; SI-NEXT:    s_waitcnt lgkmcnt(0)
22; SI-NEXT:    s_mov_b32 s12, s6
23; SI-NEXT:    s_mov_b32 s13, s7
24; SI-NEXT:    s_mov_b32 s15, s3
25; SI-NEXT:    s_mov_b32 s10, s2
26; SI-NEXT:    s_mov_b32 s11, s3
27; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
28; SI-NEXT:    s_waitcnt vmcnt(0)
29; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
30; SI-NEXT:    s_waitcnt vmcnt(0)
31; SI-NEXT:    s_mov_b32 s0, s4
32; SI-NEXT:    s_mov_b32 s1, s5
33; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
34; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
35; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
36; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
37; SI-NEXT:    v_max_f32_e32 v0, v0, v1
38; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
39; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
40; SI-NEXT:    s_endpgm
41;
42; VI-LABEL: maxnum_f16:
43; VI:       ; %bb.0: ; %entry
44; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
45; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
46; VI-NEXT:    s_mov_b32 s3, 0xf000
47; VI-NEXT:    s_mov_b32 s2, -1
48; VI-NEXT:    s_mov_b32 s14, s2
49; VI-NEXT:    s_waitcnt lgkmcnt(0)
50; VI-NEXT:    s_mov_b32 s12, s6
51; VI-NEXT:    s_mov_b32 s13, s7
52; VI-NEXT:    s_mov_b32 s15, s3
53; VI-NEXT:    s_mov_b32 s10, s2
54; VI-NEXT:    s_mov_b32 s11, s3
55; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
56; VI-NEXT:    s_waitcnt vmcnt(0)
57; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
58; VI-NEXT:    s_waitcnt vmcnt(0)
59; VI-NEXT:    s_mov_b32 s0, s4
60; VI-NEXT:    s_mov_b32 s1, s5
61; VI-NEXT:    v_max_f16_e32 v0, v0, v0
62; VI-NEXT:    v_max_f16_e32 v1, v1, v1
63; VI-NEXT:    v_max_f16_e32 v0, v0, v1
64; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
65; VI-NEXT:    s_endpgm
66;
67; GFX9-LABEL: maxnum_f16:
68; GFX9:       ; %bb.0: ; %entry
69; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
70; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
71; GFX9-NEXT:    s_mov_b32 s3, 0xf000
72; GFX9-NEXT:    s_mov_b32 s2, -1
73; GFX9-NEXT:    s_mov_b32 s14, s2
74; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
75; GFX9-NEXT:    s_mov_b32 s12, s6
76; GFX9-NEXT:    s_mov_b32 s13, s7
77; GFX9-NEXT:    s_mov_b32 s15, s3
78; GFX9-NEXT:    s_mov_b32 s10, s2
79; GFX9-NEXT:    s_mov_b32 s11, s3
80; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
81; GFX9-NEXT:    s_waitcnt vmcnt(0)
82; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
83; GFX9-NEXT:    s_waitcnt vmcnt(0)
84; GFX9-NEXT:    s_mov_b32 s0, s4
85; GFX9-NEXT:    s_mov_b32 s1, s5
86; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
87; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
88; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
89; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
90; GFX9-NEXT:    s_endpgm
91;
92; GFX10-LABEL: maxnum_f16:
93; GFX10:       ; %bb.0: ; %entry
94; GFX10-NEXT:    s_clause 0x1
95; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
96; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
97; GFX10-NEXT:    s_mov_b32 s2, -1
98; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
99; GFX10-NEXT:    s_mov_b32 s14, s2
100; GFX10-NEXT:    s_mov_b32 s15, s3
101; GFX10-NEXT:    s_mov_b32 s10, s2
102; GFX10-NEXT:    s_mov_b32 s11, s3
103; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
104; GFX10-NEXT:    s_mov_b32 s12, s6
105; GFX10-NEXT:    s_mov_b32 s13, s7
106; GFX10-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
107; GFX10-NEXT:    s_waitcnt vmcnt(0)
108; GFX10-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
109; GFX10-NEXT:    s_waitcnt vmcnt(0)
110; GFX10-NEXT:    s_mov_b32 s0, s4
111; GFX10-NEXT:    s_mov_b32 s1, s5
112; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
113; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
114; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
115; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
116; GFX10-NEXT:    s_endpgm
117;
118; GFX11-LABEL: maxnum_f16:
119; GFX11:       ; %bb.0: ; %entry
120; GFX11-NEXT:    s_clause 0x1
121; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
122; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
123; GFX11-NEXT:    s_mov_b32 s10, -1
124; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
125; GFX11-NEXT:    s_mov_b32 s14, s10
126; GFX11-NEXT:    s_mov_b32 s15, s11
127; GFX11-NEXT:    s_mov_b32 s2, s10
128; GFX11-NEXT:    s_mov_b32 s3, s11
129; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
130; GFX11-NEXT:    s_mov_b32 s12, s6
131; GFX11-NEXT:    s_mov_b32 s13, s7
132; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
133; GFX11-NEXT:    s_waitcnt vmcnt(0)
134; GFX11-NEXT:    buffer_load_u16 v1, off, s[0:3], 0 glc dlc
135; GFX11-NEXT:    s_waitcnt vmcnt(0)
136; GFX11-NEXT:    s_mov_b32 s8, s4
137; GFX11-NEXT:    s_mov_b32 s9, s5
138; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
139; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
140; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
141; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
142; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
143; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
144; GFX11-NEXT:    s_endpgm
145    half addrspace(1)* %r,
146    half addrspace(1)* %a,
147    half addrspace(1)* %b) #0 {
148entry:
  ; Volatile loads: the expected output above shows each load followed by
  ; its own s_waitcnt vmcnt(0).
149  %a.val = load volatile half, half addrspace(1)* %a
150  %b.val = load volatile half, half addrspace(1)* %b
151  %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
152  store half %r.val, half addrspace(1)* %r
153  ret void
154}
155
; Kernel test: scalar half maxnum where the first operand is the constant
; 3.0 and the second is loaded from %b; the result is stored to %r.
; In the expected output the constant shows up as a literal operand of the
; final max: 0x40400000 on the tahiti run (which performs the max in f32)
; and 0x4200 on the other runs (which use v_max_f16).
156define amdgpu_kernel void @maxnum_f16_imm_a(
157; SI-LABEL: maxnum_f16_imm_a:
158; SI:       ; %bb.0: ; %entry
159; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
160; SI-NEXT:    s_mov_b32 s7, 0xf000
161; SI-NEXT:    s_mov_b32 s6, -1
162; SI-NEXT:    s_mov_b32 s10, s6
163; SI-NEXT:    s_mov_b32 s11, s7
164; SI-NEXT:    s_waitcnt lgkmcnt(0)
165; SI-NEXT:    s_mov_b32 s8, s2
166; SI-NEXT:    s_mov_b32 s9, s3
167; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
168; SI-NEXT:    s_mov_b32 s4, s0
169; SI-NEXT:    s_mov_b32 s5, s1
170; SI-NEXT:    s_waitcnt vmcnt(0)
171; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
172; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
173; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
174; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
175; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
176; SI-NEXT:    s_endpgm
177;
178; VI-LABEL: maxnum_f16_imm_a:
179; VI:       ; %bb.0: ; %entry
180; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
181; VI-NEXT:    s_mov_b32 s7, 0xf000
182; VI-NEXT:    s_mov_b32 s6, -1
183; VI-NEXT:    s_mov_b32 s10, s6
184; VI-NEXT:    s_mov_b32 s11, s7
185; VI-NEXT:    s_waitcnt lgkmcnt(0)
186; VI-NEXT:    s_mov_b32 s8, s2
187; VI-NEXT:    s_mov_b32 s9, s3
188; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
189; VI-NEXT:    s_mov_b32 s4, s0
190; VI-NEXT:    s_mov_b32 s5, s1
191; VI-NEXT:    s_waitcnt vmcnt(0)
192; VI-NEXT:    v_max_f16_e32 v0, v0, v0
193; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
194; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
195; VI-NEXT:    s_endpgm
196;
197; GFX9-LABEL: maxnum_f16_imm_a:
198; GFX9:       ; %bb.0: ; %entry
199; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
200; GFX9-NEXT:    s_mov_b32 s7, 0xf000
201; GFX9-NEXT:    s_mov_b32 s6, -1
202; GFX9-NEXT:    s_mov_b32 s10, s6
203; GFX9-NEXT:    s_mov_b32 s11, s7
204; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX9-NEXT:    s_mov_b32 s8, s2
206; GFX9-NEXT:    s_mov_b32 s9, s3
207; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
208; GFX9-NEXT:    s_mov_b32 s4, s0
209; GFX9-NEXT:    s_mov_b32 s5, s1
210; GFX9-NEXT:    s_waitcnt vmcnt(0)
211; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
212; GFX9-NEXT:    v_max_f16_e32 v0, 0x4200, v0
213; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
214; GFX9-NEXT:    s_endpgm
215;
216; GFX10-LABEL: maxnum_f16_imm_a:
217; GFX10:       ; %bb.0: ; %entry
218; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
219; GFX10-NEXT:    s_mov_b32 s6, -1
220; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
221; GFX10-NEXT:    s_mov_b32 s10, s6
222; GFX10-NEXT:    s_mov_b32 s11, s7
223; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
224; GFX10-NEXT:    s_mov_b32 s8, s2
225; GFX10-NEXT:    s_mov_b32 s9, s3
226; GFX10-NEXT:    s_mov_b32 s4, s0
227; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
228; GFX10-NEXT:    s_mov_b32 s5, s1
229; GFX10-NEXT:    s_waitcnt vmcnt(0)
230; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
231; GFX10-NEXT:    v_max_f16_e32 v0, 0x4200, v0
232; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
233; GFX10-NEXT:    s_endpgm
234;
235; GFX11-LABEL: maxnum_f16_imm_a:
236; GFX11:       ; %bb.0: ; %entry
237; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
238; GFX11-NEXT:    s_mov_b32 s6, -1
239; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
240; GFX11-NEXT:    s_mov_b32 s10, s6
241; GFX11-NEXT:    s_mov_b32 s11, s7
242; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
243; GFX11-NEXT:    s_mov_b32 s8, s2
244; GFX11-NEXT:    s_mov_b32 s9, s3
245; GFX11-NEXT:    s_mov_b32 s4, s0
246; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
247; GFX11-NEXT:    s_mov_b32 s5, s1
248; GFX11-NEXT:    s_waitcnt vmcnt(0)
249; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
250; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
251; GFX11-NEXT:    v_max_f16_e32 v0, 0x4200, v0
252; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
253; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
254; GFX11-NEXT:    s_endpgm
255    half addrspace(1)* %r,
256    half addrspace(1)* %b) #0 {
257entry:
258  %b.val = load half, half addrspace(1)* %b
  ; The constant is passed as the first intrinsic operand.
259  %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val)
260  store half %r.val, half addrspace(1)* %r
261  ret void
262}
263
; Kernel test: scalar half maxnum where the first operand is loaded from %a
; and the second is the constant 4.0; the result is stored to %r.
; In the expected output every run encodes the constant as the operand 4.0
; directly in the final max instruction (v_max_f32 on the tahiti run,
; v_max_f16 on the others), and it is placed as the first source operand.
264define amdgpu_kernel void @maxnum_f16_imm_b(
265; SI-LABEL: maxnum_f16_imm_b:
266; SI:       ; %bb.0: ; %entry
267; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
268; SI-NEXT:    s_mov_b32 s7, 0xf000
269; SI-NEXT:    s_mov_b32 s6, -1
270; SI-NEXT:    s_mov_b32 s10, s6
271; SI-NEXT:    s_mov_b32 s11, s7
272; SI-NEXT:    s_waitcnt lgkmcnt(0)
273; SI-NEXT:    s_mov_b32 s8, s2
274; SI-NEXT:    s_mov_b32 s9, s3
275; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
276; SI-NEXT:    s_mov_b32 s4, s0
277; SI-NEXT:    s_mov_b32 s5, s1
278; SI-NEXT:    s_waitcnt vmcnt(0)
279; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
280; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
281; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
282; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
283; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
284; SI-NEXT:    s_endpgm
285;
286; VI-LABEL: maxnum_f16_imm_b:
287; VI:       ; %bb.0: ; %entry
288; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
289; VI-NEXT:    s_mov_b32 s7, 0xf000
290; VI-NEXT:    s_mov_b32 s6, -1
291; VI-NEXT:    s_mov_b32 s10, s6
292; VI-NEXT:    s_mov_b32 s11, s7
293; VI-NEXT:    s_waitcnt lgkmcnt(0)
294; VI-NEXT:    s_mov_b32 s8, s2
295; VI-NEXT:    s_mov_b32 s9, s3
296; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
297; VI-NEXT:    s_mov_b32 s4, s0
298; VI-NEXT:    s_mov_b32 s5, s1
299; VI-NEXT:    s_waitcnt vmcnt(0)
300; VI-NEXT:    v_max_f16_e32 v0, v0, v0
301; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
302; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
303; VI-NEXT:    s_endpgm
304;
305; GFX9-LABEL: maxnum_f16_imm_b:
306; GFX9:       ; %bb.0: ; %entry
307; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
308; GFX9-NEXT:    s_mov_b32 s7, 0xf000
309; GFX9-NEXT:    s_mov_b32 s6, -1
310; GFX9-NEXT:    s_mov_b32 s10, s6
311; GFX9-NEXT:    s_mov_b32 s11, s7
312; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
313; GFX9-NEXT:    s_mov_b32 s8, s2
314; GFX9-NEXT:    s_mov_b32 s9, s3
315; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
316; GFX9-NEXT:    s_mov_b32 s4, s0
317; GFX9-NEXT:    s_mov_b32 s5, s1
318; GFX9-NEXT:    s_waitcnt vmcnt(0)
319; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
320; GFX9-NEXT:    v_max_f16_e32 v0, 4.0, v0
321; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
322; GFX9-NEXT:    s_endpgm
323;
324; GFX10-LABEL: maxnum_f16_imm_b:
325; GFX10:       ; %bb.0: ; %entry
326; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
327; GFX10-NEXT:    s_mov_b32 s6, -1
328; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
329; GFX10-NEXT:    s_mov_b32 s10, s6
330; GFX10-NEXT:    s_mov_b32 s11, s7
331; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
332; GFX10-NEXT:    s_mov_b32 s8, s2
333; GFX10-NEXT:    s_mov_b32 s9, s3
334; GFX10-NEXT:    s_mov_b32 s4, s0
335; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
336; GFX10-NEXT:    s_mov_b32 s5, s1
337; GFX10-NEXT:    s_waitcnt vmcnt(0)
338; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
339; GFX10-NEXT:    v_max_f16_e32 v0, 4.0, v0
340; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
341; GFX10-NEXT:    s_endpgm
342;
343; GFX11-LABEL: maxnum_f16_imm_b:
344; GFX11:       ; %bb.0: ; %entry
345; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
346; GFX11-NEXT:    s_mov_b32 s6, -1
347; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
348; GFX11-NEXT:    s_mov_b32 s10, s6
349; GFX11-NEXT:    s_mov_b32 s11, s7
350; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
351; GFX11-NEXT:    s_mov_b32 s8, s2
352; GFX11-NEXT:    s_mov_b32 s9, s3
353; GFX11-NEXT:    s_mov_b32 s4, s0
354; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
355; GFX11-NEXT:    s_mov_b32 s5, s1
356; GFX11-NEXT:    s_waitcnt vmcnt(0)
357; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
358; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
359; GFX11-NEXT:    v_max_f16_e32 v0, 4.0, v0
360; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
361; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
362; GFX11-NEXT:    s_endpgm
363    half addrspace(1)* %r,
364    half addrspace(1)* %a) #0 {
365entry:
366  %a.val = load half, half addrspace(1)* %a
  ; The constant is passed as the second intrinsic operand.
367  %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0)
368  store half %r.val, half addrspace(1)* %r
369  ret void
370}
371
; Kernel test: <2 x half> maxnum of two vectors loaded from %a and %b, with
; the result stored to %r.
; Expected code per the assertions below:
;  - tahiti run: each element is handled separately in f32 (convert,
;    multiply by 1.0, v_max_f32, convert back) and the two halves are
;    recombined with a shift-left by 16 and an OR.
;  - fiji run: two v_max_f16 results, the upper one produced by the sdwa
;    form with dst_sel:WORD_1, are combined with an OR.
;  - gfx900/gfx1010/gfx1100 runs: packed v_pk_max_f16, applied first to
;    each operand with itself and then to the pair.
372define amdgpu_kernel void @maxnum_v2f16(
373; SI-LABEL: maxnum_v2f16:
374; SI:       ; %bb.0: ; %entry
375; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
376; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
377; SI-NEXT:    s_waitcnt lgkmcnt(0)
378; SI-NEXT:    s_load_dword s2, s[6:7], 0x0
379; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
380; SI-NEXT:    s_mov_b32 s7, 0xf000
381; SI-NEXT:    s_mov_b32 s6, -1
382; SI-NEXT:    s_waitcnt lgkmcnt(0)
383; SI-NEXT:    s_lshr_b32 s1, s2, 16
384; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
385; SI-NEXT:    s_lshr_b32 s0, s0, 16
386; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
387; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
388; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
389; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
390; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
391; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
392; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
393; SI-NEXT:    v_max_f32_e32 v2, v3, v2
394; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
395; SI-NEXT:    v_max_f32_e32 v0, v0, v1
396; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
397; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
398; SI-NEXT:    v_or_b32_e32 v0, v0, v1
399; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
400; SI-NEXT:    s_endpgm
401;
402; VI-LABEL: maxnum_v2f16:
403; VI:       ; %bb.0: ; %entry
404; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
405; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
406; VI-NEXT:    s_mov_b32 s7, 0xf000
407; VI-NEXT:    s_mov_b32 s6, -1
408; VI-NEXT:    s_waitcnt lgkmcnt(0)
409; VI-NEXT:    s_load_dword s8, s[4:5], 0x0
410; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
411; VI-NEXT:    s_mov_b32 s4, s0
412; VI-NEXT:    s_mov_b32 s5, s1
413; VI-NEXT:    s_waitcnt lgkmcnt(0)
414; VI-NEXT:    v_max_f16_e64 v0, s8, s8
415; VI-NEXT:    v_max_f16_e64 v1, s2, s2
416; VI-NEXT:    s_lshr_b32 s0, s8, 16
417; VI-NEXT:    v_max_f16_e32 v0, v1, v0
418; VI-NEXT:    v_max_f16_e64 v1, s0, s0
419; VI-NEXT:    s_lshr_b32 s0, s2, 16
420; VI-NEXT:    v_max_f16_e64 v2, s0, s0
421; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
422; VI-NEXT:    v_or_b32_e32 v0, v0, v1
423; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
424; VI-NEXT:    s_endpgm
425;
426; GFX9-LABEL: maxnum_v2f16:
427; GFX9:       ; %bb.0: ; %entry
428; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
429; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
430; GFX9-NEXT:    s_mov_b32 s3, 0xf000
431; GFX9-NEXT:    s_mov_b32 s2, -1
432; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
433; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
434; GFX9-NEXT:    s_load_dword s11, s[6:7], 0x0
435; GFX9-NEXT:    s_mov_b32 s0, s4
436; GFX9-NEXT:    s_mov_b32 s1, s5
437; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
438; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
439; GFX9-NEXT:    v_pk_max_f16 v1, s11, s11
440; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
441; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
442; GFX9-NEXT:    s_endpgm
443;
444; GFX10-LABEL: maxnum_v2f16:
445; GFX10:       ; %bb.0: ; %entry
446; GFX10-NEXT:    s_clause 0x1
447; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
448; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
449; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
450; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
451; GFX10-NEXT:    s_load_dword s1, s[6:7], 0x0
452; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
453; GFX10-NEXT:    s_mov_b32 s6, -1
454; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
455; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
456; GFX10-NEXT:    v_pk_max_f16 v1, s1, s1
457; GFX10-NEXT:    v_pk_max_f16 v0, v1, v0
458; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
459; GFX10-NEXT:    s_endpgm
460;
461; GFX11-LABEL: maxnum_v2f16:
462; GFX11:       ; %bb.0: ; %entry
463; GFX11-NEXT:    s_clause 0x1
464; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
465; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
466; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
467; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0
468; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
469; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
470; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
471; GFX11-NEXT:    v_pk_max_f16 v0, s4, s4
472; GFX11-NEXT:    v_pk_max_f16 v1, s2, s2
473; GFX11-NEXT:    s_mov_b32 s2, -1
474; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
475; GFX11-NEXT:    v_pk_max_f16 v0, v1, v0
476; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
477; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
478; GFX11-NEXT:    s_endpgm
479    <2 x half> addrspace(1)* %r,
480    <2 x half> addrspace(1)* %a,
481    <2 x half> addrspace(1)* %b) #0 {
482entry:
  ; Unlike the scalar test these loads are not volatile; the expected output
  ; above fetches both vectors with scalar s_load instructions.
483  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
484  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
485  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
486  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
487  ret void
488}
489
; Kernel test: <2 x half> maxnum where the first operand is the constant
; vector <3.0, 4.0> and the second is loaded from %b; the result is stored
; to %r.
; How the constant appears in the expected output:
;  - tahiti run: per-element f32 max against 0x40400000 (low element) and
;    4.0 (high element).
;  - fiji run: 0x4200 as a literal for the low element, and 0x4400 moved
;    into v2 for the sdwa max that produces the high element.
;  - gfx900 run: the packed value 0x44004200 is first moved into an SGPR.
;  - gfx1010/gfx1100 runs: 0x44004200 is used directly as a literal operand
;    of v_pk_max_f16.
490define amdgpu_kernel void @maxnum_v2f16_imm_a(
491; SI-LABEL: maxnum_v2f16_imm_a:
492; SI:       ; %bb.0: ; %entry
493; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
494; SI-NEXT:    s_waitcnt lgkmcnt(0)
495; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
496; SI-NEXT:    s_mov_b32 s3, 0xf000
497; SI-NEXT:    s_waitcnt lgkmcnt(0)
498; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
499; SI-NEXT:    s_lshr_b32 s2, s2, 16
500; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
501; SI-NEXT:    s_mov_b32 s2, -1
502; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
503; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
504; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
505; SI-NEXT:    v_max_f32_e32 v1, 4.0, v1
506; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
507; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
508; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
509; SI-NEXT:    v_or_b32_e32 v0, v0, v1
510; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
511; SI-NEXT:    s_endpgm
512;
513; VI-LABEL: maxnum_v2f16_imm_a:
514; VI:       ; %bb.0: ; %entry
515; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
516; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
517; VI-NEXT:    s_waitcnt lgkmcnt(0)
518; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
519; VI-NEXT:    s_mov_b32 s3, 0xf000
520; VI-NEXT:    s_mov_b32 s2, -1
521; VI-NEXT:    s_waitcnt lgkmcnt(0)
522; VI-NEXT:    v_max_f16_e64 v0, s4, s4
523; VI-NEXT:    s_lshr_b32 s4, s4, 16
524; VI-NEXT:    v_max_f16_e64 v1, s4, s4
525; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
526; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
527; VI-NEXT:    v_or_b32_e32 v0, v0, v1
528; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
529; VI-NEXT:    s_endpgm
530;
531; GFX9-LABEL: maxnum_v2f16_imm_a:
532; GFX9:       ; %bb.0: ; %entry
533; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
534; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
535; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
536; GFX9-NEXT:    s_mov_b32 s3, 0xf000
537; GFX9-NEXT:    s_mov_b32 s2, -1
538; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
539; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
540; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
541; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
542; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
543; GFX9-NEXT:    s_endpgm
544;
545; GFX10-LABEL: maxnum_v2f16_imm_a:
546; GFX10:       ; %bb.0: ; %entry
547; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
548; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
549; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
550; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
551; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
552; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
553; GFX10-NEXT:    s_mov_b32 s2, -1
554; GFX10-NEXT:    v_pk_max_f16 v0, 0x44004200, v0
555; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
556; GFX10-NEXT:    s_endpgm
557;
558; GFX11-LABEL: maxnum_v2f16_imm_a:
559; GFX11:       ; %bb.0: ; %entry
560; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
561; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
562; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
563; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
564; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
565; GFX11-NEXT:    v_pk_max_f16 v0, s2, s2
566; GFX11-NEXT:    s_mov_b32 s2, -1
567; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
568; GFX11-NEXT:    v_pk_max_f16 v0, 0x44004200, v0
569; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
570; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
571; GFX11-NEXT:    s_endpgm
572    <2 x half> addrspace(1)* %r,
573    <2 x half> addrspace(1)* %b) #0 {
574entry:
575  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  ; The constant vector is passed as the first intrinsic operand.
576  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
577  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
578  ret void
579}
580
; Kernel test: <2 x half> maxnum where the first operand is loaded from %a
; and the second is the constant vector <4.0, 3.0>; the result is stored to
; %r. Compared with maxnum_v2f16_imm_a, both the operand position and the
; element order of the constant are swapped.
; How the constant appears in the expected output:
;  - tahiti run: per-element f32 max against 4.0 (low element) and
;    0x40400000 (high element).
;  - fiji run: 4.0 as the operand for the low element, and 0x4200 moved into
;    v2 for the sdwa max that produces the high element.
;  - gfx900 run: the packed value 0x42004400 is first moved into an SGPR.
;  - gfx1010/gfx1100 runs: 0x42004400 is used directly as a literal operand
;    of v_pk_max_f16.
581define amdgpu_kernel void @maxnum_v2f16_imm_b(
582; SI-LABEL: maxnum_v2f16_imm_b:
583; SI:       ; %bb.0: ; %entry
584; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
585; SI-NEXT:    s_waitcnt lgkmcnt(0)
586; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
587; SI-NEXT:    s_mov_b32 s3, 0xf000
588; SI-NEXT:    s_waitcnt lgkmcnt(0)
589; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
590; SI-NEXT:    s_lshr_b32 s2, s2, 16
591; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
592; SI-NEXT:    s_mov_b32 s2, -1
593; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
594; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
595; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
596; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
597; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
598; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
599; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
600; SI-NEXT:    v_or_b32_e32 v0, v0, v1
601; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
602; SI-NEXT:    s_endpgm
603;
604; VI-LABEL: maxnum_v2f16_imm_b:
605; VI:       ; %bb.0: ; %entry
606; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
607; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
608; VI-NEXT:    s_waitcnt lgkmcnt(0)
609; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
610; VI-NEXT:    s_mov_b32 s3, 0xf000
611; VI-NEXT:    s_mov_b32 s2, -1
612; VI-NEXT:    s_waitcnt lgkmcnt(0)
613; VI-NEXT:    v_max_f16_e64 v0, s4, s4
614; VI-NEXT:    s_lshr_b32 s4, s4, 16
615; VI-NEXT:    v_max_f16_e64 v1, s4, s4
616; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
617; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
618; VI-NEXT:    v_or_b32_e32 v0, v0, v1
619; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
620; VI-NEXT:    s_endpgm
621;
622; GFX9-LABEL: maxnum_v2f16_imm_b:
623; GFX9:       ; %bb.0: ; %entry
624; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
625; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
626; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
627; GFX9-NEXT:    s_mov_b32 s3, 0xf000
628; GFX9-NEXT:    s_mov_b32 s2, -1
629; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
630; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
631; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
632; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
633; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
634; GFX9-NEXT:    s_endpgm
635;
636; GFX10-LABEL: maxnum_v2f16_imm_b:
637; GFX10:       ; %bb.0: ; %entry
638; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
639; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
640; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
641; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
642; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
643; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
644; GFX10-NEXT:    s_mov_b32 s2, -1
645; GFX10-NEXT:    v_pk_max_f16 v0, 0x42004400, v0
646; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
647; GFX10-NEXT:    s_endpgm
648;
649; GFX11-LABEL: maxnum_v2f16_imm_b:
650; GFX11:       ; %bb.0: ; %entry
651; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
652; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
653; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
654; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
655; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
656; GFX11-NEXT:    v_pk_max_f16 v0, s2, s2
657; GFX11-NEXT:    s_mov_b32 s2, -1
658; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
659; GFX11-NEXT:    v_pk_max_f16 v0, 0x42004400, v0
660; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
661; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
662; GFX11-NEXT:    s_endpgm
663    <2 x half> addrspace(1)* %r,
664    <2 x half> addrspace(1)* %a) #0 {
665entry:
666  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  ; The constant vector is passed as the second intrinsic operand.
667  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
668  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
669  ret void
670}
671
672; FIXME: Scalarize with undef half
; Kernel test: <3 x half> maxnum of two vectors loaded from %a and %b, with
; the result stored to %r.
; Expected code per the assertions below:
;  - Each operand vector is fetched as two dwords.
;  - tahiti run: three separate f32 max operations, one per element.
;  - fiji run: three v_max_f16 operations, one of them in sdwa form writing
;    WORD_1 of the low dword.
;  - gfx900/gfx1010/gfx1100 runs: v_pk_max_f16 on both dwords, so the dword
;    holding the third element also goes through a packed max.
;  - All runs store the result as a dword at offset 0 plus a 16-bit store at
;    offset 4.
673define amdgpu_kernel void @maxnum_v3f16(
674; SI-LABEL: maxnum_v3f16:
675; SI:       ; %bb.0: ; %entry
676; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
677; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
678; SI-NEXT:    s_waitcnt lgkmcnt(0)
679; SI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
680; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
681; SI-NEXT:    s_mov_b32 s7, 0xf000
682; SI-NEXT:    s_mov_b32 s6, -1
683; SI-NEXT:    s_waitcnt lgkmcnt(0)
684; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
685; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
686; SI-NEXT:    s_lshr_b32 s2, s2, 16
687; SI-NEXT:    s_lshr_b32 s3, s0, 16
688; SI-NEXT:    v_cvt_f32_f16_e32 v2, s3
689; SI-NEXT:    v_cvt_f32_f16_e32 v3, s2
690; SI-NEXT:    v_cvt_f32_f16_e32 v5, s0
691; SI-NEXT:    v_cvt_f32_f16_e32 v4, s1
692; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
693; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
694; SI-NEXT:    v_max_f32_e32 v2, v3, v2
695; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
696; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
697; SI-NEXT:    v_max_f32_e32 v1, v1, v3
698; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
699; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
700; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
701; SI-NEXT:    v_max_f32_e32 v0, v0, v3
702; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
703; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
704; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
705; SI-NEXT:    v_or_b32_e32 v1, v1, v2
706; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
707; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
708; SI-NEXT:    s_endpgm
709;
710; VI-LABEL: maxnum_v3f16:
711; VI:       ; %bb.0: ; %entry
712; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
713; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
714; VI-NEXT:    s_mov_b32 s7, 0xf000
715; VI-NEXT:    s_mov_b32 s6, -1
716; VI-NEXT:    s_waitcnt lgkmcnt(0)
717; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
718; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
719; VI-NEXT:    s_mov_b32 s4, s0
720; VI-NEXT:    s_mov_b32 s5, s1
721; VI-NEXT:    s_waitcnt lgkmcnt(0)
722; VI-NEXT:    v_max_f16_e64 v0, s8, s8
723; VI-NEXT:    v_max_f16_e64 v1, s2, s2
724; VI-NEXT:    s_lshr_b32 s0, s8, 16
725; VI-NEXT:    v_max_f16_e32 v0, v1, v0
726; VI-NEXT:    v_max_f16_e64 v1, s0, s0
727; VI-NEXT:    s_lshr_b32 s0, s2, 16
728; VI-NEXT:    v_max_f16_e64 v2, s0, s0
729; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
730; VI-NEXT:    v_or_b32_e32 v0, v0, v1
731; VI-NEXT:    v_max_f16_e64 v1, s9, s9
732; VI-NEXT:    v_max_f16_e64 v2, s3, s3
733; VI-NEXT:    v_max_f16_e32 v1, v2, v1
734; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
735; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
736; VI-NEXT:    s_endpgm
737;
738; GFX9-LABEL: maxnum_v3f16:
739; GFX9:       ; %bb.0: ; %entry
740; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
741; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
742; GFX9-NEXT:    s_mov_b32 s3, 0xf000
743; GFX9-NEXT:    s_mov_b32 s2, -1
744; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
745; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
746; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[6:7], 0x0
747; GFX9-NEXT:    s_mov_b32 s0, s4
748; GFX9-NEXT:    s_mov_b32 s1, s5
749; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
750; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
751; GFX9-NEXT:    v_pk_max_f16 v1, s12, s12
752; GFX9-NEXT:    v_pk_max_f16 v2, s11, s11
753; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
754; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
755; GFX9-NEXT:    v_pk_max_f16 v1, v1, v2
756; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
757; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
758; GFX9-NEXT:    s_endpgm
759;
760; GFX10-LABEL: maxnum_v3f16:
761; GFX10:       ; %bb.0: ; %entry
762; GFX10-NEXT:    s_clause 0x1
763; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
764; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
765; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
766; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
767; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
768; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
769; GFX10-NEXT:    s_mov_b32 s6, -1
770; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
771; GFX10-NEXT:    v_pk_max_f16 v1, s1, s1
772; GFX10-NEXT:    v_pk_max_f16 v2, s9, s9
773; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
774; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
775; GFX10-NEXT:    v_pk_max_f16 v1, v2, v1
776; GFX10-NEXT:    v_pk_max_f16 v0, v3, v0
777; GFX10-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
778; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
779; GFX10-NEXT:    s_endpgm
780;
781; GFX11-LABEL: maxnum_v3f16:
782; GFX11:       ; %bb.0: ; %entry
783; GFX11-NEXT:    s_clause 0x1
784; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
785; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
786; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
787; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
788; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
789; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX11-NEXT:    v_pk_max_f16 v1, s5, s5
791; GFX11-NEXT:    v_pk_max_f16 v2, s3, s3
792; GFX11-NEXT:    v_pk_max_f16 v0, s4, s4
793; GFX11-NEXT:    v_pk_max_f16 v3, s2, s2
794; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
795; GFX11-NEXT:    s_mov_b32 s2, -1
796; GFX11-NEXT:    v_pk_max_f16 v1, v2, v1
797; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
798; GFX11-NEXT:    v_pk_max_f16 v0, v3, v0
799; GFX11-NEXT:    s_clause 0x1
800; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 offset:4
801; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
802; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
803; GFX11-NEXT:    s_endpgm
804    <3 x half> addrspace(1)* %r,
805    <3 x half> addrspace(1)* %a,
806    <3 x half> addrspace(1)* %b) #0 {
807entry:
808  %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
809  %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
810  %r.val = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
811  store <3 x half> %r.val, <3 x half> addrspace(1)* %r
812  ret void
813}
814
; Kernel under test -- llvm.maxnum on <4 x half> with both operands loaded from
; addrspace(1) pointers (%a, %b) and the result stored through %r.
;
; What the autogenerated assertions below expect, per RUN line in the file header
;   * tahiti (SI prefix) -- every lane is widened with v_cvt_f32_f16, combined with
;     v_max_f32, narrowed again with v_cvt_f16_f32 and repacked with
;     v_lshlrev_b32 / v_or_b32 before one dwordx2 store.
;   * fiji (VI prefix) -- one v_max_f16 per lane; the high lane of each dword uses
;     the SDWA form writing WORD_1 and is merged with v_or_b32.
;   * gfx900, gfx1010, gfx1100 -- one v_pk_max_f16 per 32-bit pair of lanes.
;
; In every configuration each loaded input is first multiplied by 1.0 (tahiti) or
; max'ed with itself (all others) before the real max.
; NOTE(review) -- presumably this canonicalizes/quiets the inputs as maxnum
; requires; confirm against the AMDGPU lowering before relying on this wording.
;
; The assertion lines are produced by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them instead of editing by hand.
815define amdgpu_kernel void @maxnum_v4f16(
816; SI-LABEL: maxnum_v4f16:
817; SI:       ; %bb.0: ; %entry
818; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
819; SI-NEXT:    s_mov_b32 s3, 0xf000
820; SI-NEXT:    s_mov_b32 s2, -1
821; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
822; SI-NEXT:    s_waitcnt lgkmcnt(0)
823; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
824; SI-NEXT:    s_mov_b32 s0, s4
825; SI-NEXT:    s_mov_b32 s1, s5
826; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
827; SI-NEXT:    s_waitcnt lgkmcnt(0)
828; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
829; SI-NEXT:    s_lshr_b32 s6, s6, 16
830; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
831; SI-NEXT:    s_lshr_b32 s6, s7, 16
832; SI-NEXT:    v_cvt_f32_f16_e32 v3, s6
833; SI-NEXT:    s_lshr_b32 s6, s5, 16
834; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
835; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
836; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
837; SI-NEXT:    s_lshr_b32 s4, s4, 16
838; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
839; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
840; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
841; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
842; SI-NEXT:    v_max_f32_e32 v3, v3, v5
843; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
844; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
845; SI-NEXT:    v_max_f32_e32 v1, v1, v5
846; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
847; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
848; SI-NEXT:    v_max_f32_e32 v2, v2, v5
849; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
850; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
851; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
852; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
853; SI-NEXT:    v_max_f32_e32 v0, v0, v4
854; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
855; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
856; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
857; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
858; SI-NEXT:    v_or_b32_e32 v1, v1, v3
859; SI-NEXT:    v_or_b32_e32 v0, v0, v2
860; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
861; SI-NEXT:    s_endpgm
862;
863; VI-LABEL: maxnum_v4f16:
864; VI:       ; %bb.0: ; %entry
865; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
866; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
867; VI-NEXT:    s_mov_b32 s7, 0xf000
868; VI-NEXT:    s_mov_b32 s6, -1
869; VI-NEXT:    s_waitcnt lgkmcnt(0)
870; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
871; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
872; VI-NEXT:    s_mov_b32 s4, s0
873; VI-NEXT:    s_mov_b32 s5, s1
874; VI-NEXT:    s_waitcnt lgkmcnt(0)
875; VI-NEXT:    v_max_f16_e64 v0, s9, s9
876; VI-NEXT:    v_max_f16_e64 v1, s3, s3
877; VI-NEXT:    s_lshr_b32 s0, s9, 16
878; VI-NEXT:    v_max_f16_e32 v0, v1, v0
879; VI-NEXT:    v_max_f16_e64 v1, s0, s0
880; VI-NEXT:    s_lshr_b32 s0, s3, 16
881; VI-NEXT:    v_max_f16_e64 v2, s0, s0
882; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
883; VI-NEXT:    v_or_b32_e32 v1, v0, v1
884; VI-NEXT:    v_max_f16_e64 v0, s8, s8
885; VI-NEXT:    v_max_f16_e64 v2, s2, s2
886; VI-NEXT:    s_lshr_b32 s0, s8, 16
887; VI-NEXT:    v_max_f16_e32 v0, v2, v0
888; VI-NEXT:    v_max_f16_e64 v2, s0, s0
889; VI-NEXT:    s_lshr_b32 s0, s2, 16
890; VI-NEXT:    v_max_f16_e64 v3, s0, s0
891; VI-NEXT:    v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
892; VI-NEXT:    v_or_b32_e32 v0, v0, v2
893; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
894; VI-NEXT:    s_endpgm
895;
896; GFX9-LABEL: maxnum_v4f16:
897; GFX9:       ; %bb.0: ; %entry
898; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
899; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
900; GFX9-NEXT:    s_mov_b32 s3, 0xf000
901; GFX9-NEXT:    s_mov_b32 s2, -1
902; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
903; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
904; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[6:7], 0x0
905; GFX9-NEXT:    s_mov_b32 s0, s4
906; GFX9-NEXT:    s_mov_b32 s1, s5
907; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
908; GFX9-NEXT:    v_pk_max_f16 v0, s11, s11
909; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
910; GFX9-NEXT:    v_pk_max_f16 v2, s10, s10
911; GFX9-NEXT:    v_pk_max_f16 v1, v1, v0
912; GFX9-NEXT:    v_pk_max_f16 v0, s12, s12
913; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
914; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
915; GFX9-NEXT:    s_endpgm
916;
917; GFX10-LABEL: maxnum_v4f16:
918; GFX10:       ; %bb.0: ; %entry
919; GFX10-NEXT:    s_clause 0x1
920; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
921; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
922; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
923; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
924; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
925; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
926; GFX10-NEXT:    s_mov_b32 s6, -1
927; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
928; GFX10-NEXT:    v_pk_max_f16 v0, s1, s1
929; GFX10-NEXT:    v_pk_max_f16 v1, s9, s9
930; GFX10-NEXT:    v_pk_max_f16 v2, s0, s0
931; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
932; GFX10-NEXT:    v_pk_max_f16 v1, v1, v0
933; GFX10-NEXT:    v_pk_max_f16 v0, v3, v2
934; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
935; GFX10-NEXT:    s_endpgm
936;
937; GFX11-LABEL: maxnum_v4f16:
938; GFX11:       ; %bb.0: ; %entry
939; GFX11-NEXT:    s_clause 0x1
940; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
941; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
942; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
943; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
944; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
945; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
946; GFX11-NEXT:    v_pk_max_f16 v0, s5, s5
947; GFX11-NEXT:    v_pk_max_f16 v1, s3, s3
948; GFX11-NEXT:    v_pk_max_f16 v2, s4, s4
949; GFX11-NEXT:    v_pk_max_f16 v3, s2, s2
950; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
951; GFX11-NEXT:    s_mov_b32 s2, -1
952; GFX11-NEXT:    v_pk_max_f16 v1, v1, v0
953; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
954; GFX11-NEXT:    v_pk_max_f16 v0, v3, v2
955; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
956; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
957; GFX11-NEXT:    s_endpgm
958    <4 x half> addrspace(1)* %r,
959    <4 x half> addrspace(1)* %a,
960    <4 x half> addrspace(1)* %b) #0 {
961entry:
; Load both vector operands, take the lane-wise maxnum, store the result to %r.
962  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
963  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
964  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
965  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
966  ret void
967}
968
; Kernel under test -- llvm.maxnum on <4 x half> where the FIRST operand is the
; constant vector <8.0, 2.0, 3.0, 4.0> and the second is loaded from %b; the
; result is stored through %r.
;
; How the constant shows up in the expected output below
;   * tahiti (SI prefix) -- as f32 operands of v_max_f32, either inline (2.0, 4.0)
;     or as literals 0x40400000 (3.0) and 0x41000000 (8.0).
;   * fiji (VI prefix) -- as f16 bit patterns 0x4800 (8.0), 0x4000 (2.0),
;     0x4200 (3.0) and 0x4400 (4.0); the two high-lane values are first moved
;     into VGPRs for the SDWA max.
;   * gfx900 -- as packed pairs 0x40004800 (lanes 0-1) and 0x44004200 (lanes 2-3)
;     materialized in SGPRs with s_mov_b32.
;   * gfx1010, gfx1100 -- the same packed pairs used directly as literal operands
;     of v_pk_max_f16.
;
; Only the loaded operand gets the extra multiply-by-1.0 / self-max step; the
; constant operand is used as-is.
; NOTE(review) -- presumably because a constant is already known canonical;
; confirm against the AMDGPU lowering.
;
; The assertion lines are produced by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them instead of editing by hand.
969define amdgpu_kernel void @fmax_v4f16_imm_a(
970; SI-LABEL: fmax_v4f16_imm_a:
971; SI:       ; %bb.0: ; %entry
972; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
973; SI-NEXT:    s_waitcnt lgkmcnt(0)
974; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
975; SI-NEXT:    s_mov_b32 s3, 0xf000
976; SI-NEXT:    s_mov_b32 s2, -1
977; SI-NEXT:    s_waitcnt lgkmcnt(0)
978; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
979; SI-NEXT:    s_lshr_b32 s5, s5, 16
980; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
981; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
982; SI-NEXT:    s_lshr_b32 s4, s4, 16
983; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
984; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
985; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
986; SI-NEXT:    v_max_f32_e32 v2, 4.0, v2
987; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
988; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
989; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
990; SI-NEXT:    v_max_f32_e32 v3, 2.0, v3
991; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
992; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
993; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
994; SI-NEXT:    v_max_f32_e32 v0, 0x41000000, v0
995; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
996; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
997; SI-NEXT:    v_or_b32_e32 v1, v1, v2
998; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
999; SI-NEXT:    v_or_b32_e32 v0, v0, v2
1000; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1001; SI-NEXT:    s_endpgm
1002;
1003; VI-LABEL: fmax_v4f16_imm_a:
1004; VI:       ; %bb.0: ; %entry
1005; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1006; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
1007; VI-NEXT:    s_mov_b32 s7, 0xf000
1008; VI-NEXT:    s_mov_b32 s6, -1
1009; VI-NEXT:    s_waitcnt lgkmcnt(0)
1010; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1011; VI-NEXT:    s_mov_b32 s4, s0
1012; VI-NEXT:    s_mov_b32 s5, s1
1013; VI-NEXT:    s_waitcnt lgkmcnt(0)
1014; VI-NEXT:    s_lshr_b32 s0, s3, 16
1015; VI-NEXT:    v_max_f16_e64 v1, s3, s3
1016; VI-NEXT:    v_max_f16_e64 v3, s0, s0
1017; VI-NEXT:    v_max_f16_e64 v2, s2, s2
1018; VI-NEXT:    v_max_f16_e32 v1, 0x4200, v1
1019; VI-NEXT:    v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1020; VI-NEXT:    s_lshr_b32 s0, s2, 16
1021; VI-NEXT:    v_or_b32_e32 v1, v1, v0
1022; VI-NEXT:    v_max_f16_e32 v0, 0x4800, v2
1023; VI-NEXT:    v_max_f16_e64 v2, s0, s0
1024; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
1025; VI-NEXT:    v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1026; VI-NEXT:    v_or_b32_e32 v0, v0, v2
1027; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1028; VI-NEXT:    s_endpgm
1029;
1030; GFX9-LABEL: fmax_v4f16_imm_a:
1031; GFX9:       ; %bb.0: ; %entry
1032; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1033; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
1034; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
1035; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1036; GFX9-NEXT:    s_mov_b32 s6, -1
1037; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1038; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1039; GFX9-NEXT:    s_mov_b32 s4, s0
1040; GFX9-NEXT:    s_mov_b32 s5, s1
1041; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1042; GFX9-NEXT:    v_pk_max_f16 v0, s3, s3
1043; GFX9-NEXT:    v_pk_max_f16 v2, s2, s2
1044; GFX9-NEXT:    v_pk_max_f16 v1, v0, s8
1045; GFX9-NEXT:    v_pk_max_f16 v0, v2, s9
1046; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1047; GFX9-NEXT:    s_endpgm
1048;
1049; GFX10-LABEL: fmax_v4f16_imm_a:
1050; GFX10:       ; %bb.0: ; %entry
1051; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1052; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1053; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1054; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1055; GFX10-NEXT:    v_pk_max_f16 v0, s3, s3
1056; GFX10-NEXT:    v_pk_max_f16 v2, s2, s2
1057; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1058; GFX10-NEXT:    s_mov_b32 s2, -1
1059; GFX10-NEXT:    v_pk_max_f16 v1, 0x44004200, v0
1060; GFX10-NEXT:    v_pk_max_f16 v0, 0x40004800, v2
1061; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1062; GFX10-NEXT:    s_endpgm
1063;
1064; GFX11-LABEL: fmax_v4f16_imm_a:
1065; GFX11:       ; %bb.0: ; %entry
1066; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1067; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1068; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
1069; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1070; GFX11-NEXT:    v_pk_max_f16 v0, s3, s3
1071; GFX11-NEXT:    v_pk_max_f16 v2, s2, s2
1072; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1073; GFX11-NEXT:    s_mov_b32 s2, -1
1074; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1075; GFX11-NEXT:    v_pk_max_f16 v1, 0x44004200, v0
1076; GFX11-NEXT:    v_pk_max_f16 v0, 0x40004800, v2
1077; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1078; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1079; GFX11-NEXT:    s_endpgm
1080    <4 x half> addrspace(1)* %r,
1081    <4 x half> addrspace(1)* %b) #0 {
1082entry:
; Only %b is loaded; the other maxnum operand is the immediate vector.
1083  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
1084  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
1085  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
1086  ret void
1087}
1088
; Attribute group #0, referenced by the kernels above through the `#0` suffix on
; their definitions. It sets the "denormal-fp-math-f32" function attribute to
; preserve-sign for both comma-separated components of the mode.
; NOTE(review) -- this presumably matters mainly for the tahiti run, whose expected
; code does the max in f32 (v_mul_f32 / v_max_f32); confirm before relying on it.
1089attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
1090