1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=SI %s 3; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s 4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s 5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s 6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s 7 8declare half @llvm.maxnum.f16(half %a, half %b) 9declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) 10declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b) 11declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b) 12 13define amdgpu_kernel void @maxnum_f16( 14; SI-LABEL: maxnum_f16: 15; SI: ; %bb.0: ; %entry 16; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 17; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 18; SI-NEXT: s_mov_b32 s3, 0xf000 19; SI-NEXT: s_mov_b32 s2, -1 20; SI-NEXT: s_mov_b32 s14, s2 21; SI-NEXT: s_waitcnt lgkmcnt(0) 22; SI-NEXT: s_mov_b32 s12, s6 23; SI-NEXT: s_mov_b32 s13, s7 24; SI-NEXT: s_mov_b32 s15, s3 25; SI-NEXT: s_mov_b32 s10, s2 26; SI-NEXT: s_mov_b32 s11, s3 27; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 28; SI-NEXT: s_waitcnt vmcnt(0) 29; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 30; SI-NEXT: s_waitcnt vmcnt(0) 31; SI-NEXT: s_mov_b32 s0, s4 32; SI-NEXT: s_mov_b32 s1, s5 33; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 34; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 35; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 36; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 37; SI-NEXT: v_max_f32_e32 v0, v0, v1 38; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 39; 
SI-NEXT: buffer_store_short v0, off, s[0:3], 0 40; SI-NEXT: s_endpgm 41; 42; VI-LABEL: maxnum_f16: 43; VI: ; %bb.0: ; %entry 44; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 45; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 46; VI-NEXT: s_mov_b32 s3, 0xf000 47; VI-NEXT: s_mov_b32 s2, -1 48; VI-NEXT: s_mov_b32 s14, s2 49; VI-NEXT: s_waitcnt lgkmcnt(0) 50; VI-NEXT: s_mov_b32 s12, s6 51; VI-NEXT: s_mov_b32 s13, s7 52; VI-NEXT: s_mov_b32 s15, s3 53; VI-NEXT: s_mov_b32 s10, s2 54; VI-NEXT: s_mov_b32 s11, s3 55; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 56; VI-NEXT: s_waitcnt vmcnt(0) 57; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 58; VI-NEXT: s_waitcnt vmcnt(0) 59; VI-NEXT: s_mov_b32 s0, s4 60; VI-NEXT: s_mov_b32 s1, s5 61; VI-NEXT: v_max_f16_e32 v0, v0, v0 62; VI-NEXT: v_max_f16_e32 v1, v1, v1 63; VI-NEXT: v_max_f16_e32 v0, v0, v1 64; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 65; VI-NEXT: s_endpgm 66; 67; GFX9-LABEL: maxnum_f16: 68; GFX9: ; %bb.0: ; %entry 69; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 70; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 71; GFX9-NEXT: s_mov_b32 s3, 0xf000 72; GFX9-NEXT: s_mov_b32 s2, -1 73; GFX9-NEXT: s_mov_b32 s14, s2 74; GFX9-NEXT: s_waitcnt lgkmcnt(0) 75; GFX9-NEXT: s_mov_b32 s12, s6 76; GFX9-NEXT: s_mov_b32 s13, s7 77; GFX9-NEXT: s_mov_b32 s15, s3 78; GFX9-NEXT: s_mov_b32 s10, s2 79; GFX9-NEXT: s_mov_b32 s11, s3 80; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 81; GFX9-NEXT: s_waitcnt vmcnt(0) 82; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 83; GFX9-NEXT: s_waitcnt vmcnt(0) 84; GFX9-NEXT: s_mov_b32 s0, s4 85; GFX9-NEXT: s_mov_b32 s1, s5 86; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 87; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 88; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 89; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 90; GFX9-NEXT: s_endpgm 91; 92; GFX10-LABEL: maxnum_f16: 93; GFX10: ; %bb.0: ; %entry 94; GFX10-NEXT: s_clause 0x1 95; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 96; GFX10-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0x34 97; GFX10-NEXT: s_mov_b32 s2, -1 98; GFX10-NEXT: s_mov_b32 s3, 0x31016000 99; GFX10-NEXT: s_mov_b32 s14, s2 100; GFX10-NEXT: s_mov_b32 s15, s3 101; GFX10-NEXT: s_mov_b32 s10, s2 102; GFX10-NEXT: s_mov_b32 s11, s3 103; GFX10-NEXT: s_waitcnt lgkmcnt(0) 104; GFX10-NEXT: s_mov_b32 s12, s6 105; GFX10-NEXT: s_mov_b32 s13, s7 106; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc 107; GFX10-NEXT: s_waitcnt vmcnt(0) 108; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc 109; GFX10-NEXT: s_waitcnt vmcnt(0) 110; GFX10-NEXT: s_mov_b32 s0, s4 111; GFX10-NEXT: s_mov_b32 s1, s5 112; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 113; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 114; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 115; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 116; GFX10-NEXT: s_endpgm 117; 118; GFX11-LABEL: maxnum_f16: 119; GFX11: ; %bb.0: ; %entry 120; GFX11-NEXT: s_clause 0x1 121; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 122; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 123; GFX11-NEXT: s_mov_b32 s10, -1 124; GFX11-NEXT: s_mov_b32 s11, 0x31016000 125; GFX11-NEXT: s_mov_b32 s14, s10 126; GFX11-NEXT: s_mov_b32 s15, s11 127; GFX11-NEXT: s_mov_b32 s2, s10 128; GFX11-NEXT: s_mov_b32 s3, s11 129; GFX11-NEXT: s_waitcnt lgkmcnt(0) 130; GFX11-NEXT: s_mov_b32 s12, s6 131; GFX11-NEXT: s_mov_b32 s13, s7 132; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 133; GFX11-NEXT: s_waitcnt vmcnt(0) 134; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc 135; GFX11-NEXT: s_waitcnt vmcnt(0) 136; GFX11-NEXT: s_mov_b32 s8, s4 137; GFX11-NEXT: s_mov_b32 s9, s5 138; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 139; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 140; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 141; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 142; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 143; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 144; GFX11-NEXT: s_endpgm 145 half addrspace(1)* %r, 146 half addrspace(1)* %a, 147 half 
addrspace(1)* %b) #0 { 148entry: 149 %a.val = load volatile half, half addrspace(1)* %a 150 %b.val = load volatile half, half addrspace(1)* %b 151 %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val) 152 store half %r.val, half addrspace(1)* %r 153 ret void 154} 155 156define amdgpu_kernel void @maxnum_f16_imm_a( 157; SI-LABEL: maxnum_f16_imm_a: 158; SI: ; %bb.0: ; %entry 159; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 160; SI-NEXT: s_mov_b32 s7, 0xf000 161; SI-NEXT: s_mov_b32 s6, -1 162; SI-NEXT: s_mov_b32 s10, s6 163; SI-NEXT: s_mov_b32 s11, s7 164; SI-NEXT: s_waitcnt lgkmcnt(0) 165; SI-NEXT: s_mov_b32 s8, s2 166; SI-NEXT: s_mov_b32 s9, s3 167; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 168; SI-NEXT: s_mov_b32 s4, s0 169; SI-NEXT: s_mov_b32 s5, s1 170; SI-NEXT: s_waitcnt vmcnt(0) 171; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 172; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 173; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 174; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 175; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 176; SI-NEXT: s_endpgm 177; 178; VI-LABEL: maxnum_f16_imm_a: 179; VI: ; %bb.0: ; %entry 180; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 181; VI-NEXT: s_mov_b32 s7, 0xf000 182; VI-NEXT: s_mov_b32 s6, -1 183; VI-NEXT: s_mov_b32 s10, s6 184; VI-NEXT: s_mov_b32 s11, s7 185; VI-NEXT: s_waitcnt lgkmcnt(0) 186; VI-NEXT: s_mov_b32 s8, s2 187; VI-NEXT: s_mov_b32 s9, s3 188; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 189; VI-NEXT: s_mov_b32 s4, s0 190; VI-NEXT: s_mov_b32 s5, s1 191; VI-NEXT: s_waitcnt vmcnt(0) 192; VI-NEXT: v_max_f16_e32 v0, v0, v0 193; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0 194; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 195; VI-NEXT: s_endpgm 196; 197; GFX9-LABEL: maxnum_f16_imm_a: 198; GFX9: ; %bb.0: ; %entry 199; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 200; GFX9-NEXT: s_mov_b32 s7, 0xf000 201; GFX9-NEXT: s_mov_b32 s6, -1 202; GFX9-NEXT: s_mov_b32 s10, s6 203; GFX9-NEXT: s_mov_b32 s11, s7 204; GFX9-NEXT: s_waitcnt lgkmcnt(0) 205; 
GFX9-NEXT: s_mov_b32 s8, s2 206; GFX9-NEXT: s_mov_b32 s9, s3 207; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 208; GFX9-NEXT: s_mov_b32 s4, s0 209; GFX9-NEXT: s_mov_b32 s5, s1 210; GFX9-NEXT: s_waitcnt vmcnt(0) 211; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 212; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0 213; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 214; GFX9-NEXT: s_endpgm 215; 216; GFX10-LABEL: maxnum_f16_imm_a: 217; GFX10: ; %bb.0: ; %entry 218; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 219; GFX10-NEXT: s_mov_b32 s6, -1 220; GFX10-NEXT: s_mov_b32 s7, 0x31016000 221; GFX10-NEXT: s_mov_b32 s10, s6 222; GFX10-NEXT: s_mov_b32 s11, s7 223; GFX10-NEXT: s_waitcnt lgkmcnt(0) 224; GFX10-NEXT: s_mov_b32 s8, s2 225; GFX10-NEXT: s_mov_b32 s9, s3 226; GFX10-NEXT: s_mov_b32 s4, s0 227; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 228; GFX10-NEXT: s_mov_b32 s5, s1 229; GFX10-NEXT: s_waitcnt vmcnt(0) 230; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 231; GFX10-NEXT: v_max_f16_e32 v0, 0x4200, v0 232; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 233; GFX10-NEXT: s_endpgm 234; 235; GFX11-LABEL: maxnum_f16_imm_a: 236; GFX11: ; %bb.0: ; %entry 237; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 238; GFX11-NEXT: s_mov_b32 s6, -1 239; GFX11-NEXT: s_mov_b32 s7, 0x31016000 240; GFX11-NEXT: s_mov_b32 s10, s6 241; GFX11-NEXT: s_mov_b32 s11, s7 242; GFX11-NEXT: s_waitcnt lgkmcnt(0) 243; GFX11-NEXT: s_mov_b32 s8, s2 244; GFX11-NEXT: s_mov_b32 s9, s3 245; GFX11-NEXT: s_mov_b32 s4, s0 246; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 247; GFX11-NEXT: s_mov_b32 s5, s1 248; GFX11-NEXT: s_waitcnt vmcnt(0) 249; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 250; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 251; GFX11-NEXT: v_max_f16_e32 v0, 0x4200, v0 252; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 253; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 254; GFX11-NEXT: s_endpgm 255 half addrspace(1)* %r, 256 half addrspace(1)* %b) #0 { 257entry: 258 %b.val = load half, half addrspace(1)* 
%b 259 %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val) 260 store half %r.val, half addrspace(1)* %r 261 ret void 262} 263 264define amdgpu_kernel void @maxnum_f16_imm_b( 265; SI-LABEL: maxnum_f16_imm_b: 266; SI: ; %bb.0: ; %entry 267; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 268; SI-NEXT: s_mov_b32 s7, 0xf000 269; SI-NEXT: s_mov_b32 s6, -1 270; SI-NEXT: s_mov_b32 s10, s6 271; SI-NEXT: s_mov_b32 s11, s7 272; SI-NEXT: s_waitcnt lgkmcnt(0) 273; SI-NEXT: s_mov_b32 s8, s2 274; SI-NEXT: s_mov_b32 s9, s3 275; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 276; SI-NEXT: s_mov_b32 s4, s0 277; SI-NEXT: s_mov_b32 s5, s1 278; SI-NEXT: s_waitcnt vmcnt(0) 279; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 280; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 281; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 282; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 283; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 284; SI-NEXT: s_endpgm 285; 286; VI-LABEL: maxnum_f16_imm_b: 287; VI: ; %bb.0: ; %entry 288; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 289; VI-NEXT: s_mov_b32 s7, 0xf000 290; VI-NEXT: s_mov_b32 s6, -1 291; VI-NEXT: s_mov_b32 s10, s6 292; VI-NEXT: s_mov_b32 s11, s7 293; VI-NEXT: s_waitcnt lgkmcnt(0) 294; VI-NEXT: s_mov_b32 s8, s2 295; VI-NEXT: s_mov_b32 s9, s3 296; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 297; VI-NEXT: s_mov_b32 s4, s0 298; VI-NEXT: s_mov_b32 s5, s1 299; VI-NEXT: s_waitcnt vmcnt(0) 300; VI-NEXT: v_max_f16_e32 v0, v0, v0 301; VI-NEXT: v_max_f16_e32 v0, 4.0, v0 302; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 303; VI-NEXT: s_endpgm 304; 305; GFX9-LABEL: maxnum_f16_imm_b: 306; GFX9: ; %bb.0: ; %entry 307; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 308; GFX9-NEXT: s_mov_b32 s7, 0xf000 309; GFX9-NEXT: s_mov_b32 s6, -1 310; GFX9-NEXT: s_mov_b32 s10, s6 311; GFX9-NEXT: s_mov_b32 s11, s7 312; GFX9-NEXT: s_waitcnt lgkmcnt(0) 313; GFX9-NEXT: s_mov_b32 s8, s2 314; GFX9-NEXT: s_mov_b32 s9, s3 315; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 316; GFX9-NEXT: s_mov_b32 s4, s0 317; 
GFX9-NEXT: s_mov_b32 s5, s1 318; GFX9-NEXT: s_waitcnt vmcnt(0) 319; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 320; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0 321; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 322; GFX9-NEXT: s_endpgm 323; 324; GFX10-LABEL: maxnum_f16_imm_b: 325; GFX10: ; %bb.0: ; %entry 326; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 327; GFX10-NEXT: s_mov_b32 s6, -1 328; GFX10-NEXT: s_mov_b32 s7, 0x31016000 329; GFX10-NEXT: s_mov_b32 s10, s6 330; GFX10-NEXT: s_mov_b32 s11, s7 331; GFX10-NEXT: s_waitcnt lgkmcnt(0) 332; GFX10-NEXT: s_mov_b32 s8, s2 333; GFX10-NEXT: s_mov_b32 s9, s3 334; GFX10-NEXT: s_mov_b32 s4, s0 335; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 336; GFX10-NEXT: s_mov_b32 s5, s1 337; GFX10-NEXT: s_waitcnt vmcnt(0) 338; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 339; GFX10-NEXT: v_max_f16_e32 v0, 4.0, v0 340; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 341; GFX10-NEXT: s_endpgm 342; 343; GFX11-LABEL: maxnum_f16_imm_b: 344; GFX11: ; %bb.0: ; %entry 345; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 346; GFX11-NEXT: s_mov_b32 s6, -1 347; GFX11-NEXT: s_mov_b32 s7, 0x31016000 348; GFX11-NEXT: s_mov_b32 s10, s6 349; GFX11-NEXT: s_mov_b32 s11, s7 350; GFX11-NEXT: s_waitcnt lgkmcnt(0) 351; GFX11-NEXT: s_mov_b32 s8, s2 352; GFX11-NEXT: s_mov_b32 s9, s3 353; GFX11-NEXT: s_mov_b32 s4, s0 354; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 355; GFX11-NEXT: s_mov_b32 s5, s1 356; GFX11-NEXT: s_waitcnt vmcnt(0) 357; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 358; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 359; GFX11-NEXT: v_max_f16_e32 v0, 4.0, v0 360; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 361; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 362; GFX11-NEXT: s_endpgm 363 half addrspace(1)* %r, 364 half addrspace(1)* %a) #0 { 365entry: 366 %a.val = load half, half addrspace(1)* %a 367 %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0) 368 store half %r.val, half addrspace(1)* %r 369 ret void 370} 371 372define amdgpu_kernel void 
@maxnum_v2f16( 373; SI-LABEL: maxnum_v2f16: 374; SI: ; %bb.0: ; %entry 375; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 376; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 377; SI-NEXT: s_waitcnt lgkmcnt(0) 378; SI-NEXT: s_load_dword s2, s[6:7], 0x0 379; SI-NEXT: s_load_dword s0, s[0:1], 0x0 380; SI-NEXT: s_mov_b32 s7, 0xf000 381; SI-NEXT: s_mov_b32 s6, -1 382; SI-NEXT: s_waitcnt lgkmcnt(0) 383; SI-NEXT: s_lshr_b32 s1, s2, 16 384; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 385; SI-NEXT: s_lshr_b32 s0, s0, 16 386; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 387; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 388; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 389; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 390; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 391; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 392; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 393; SI-NEXT: v_max_f32_e32 v2, v3, v2 394; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 395; SI-NEXT: v_max_f32_e32 v0, v0, v1 396; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 397; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 398; SI-NEXT: v_or_b32_e32 v0, v0, v1 399; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 400; SI-NEXT: s_endpgm 401; 402; VI-LABEL: maxnum_v2f16: 403; VI: ; %bb.0: ; %entry 404; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 405; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 406; VI-NEXT: s_mov_b32 s7, 0xf000 407; VI-NEXT: s_mov_b32 s6, -1 408; VI-NEXT: s_waitcnt lgkmcnt(0) 409; VI-NEXT: s_load_dword s8, s[4:5], 0x0 410; VI-NEXT: s_load_dword s2, s[2:3], 0x0 411; VI-NEXT: s_mov_b32 s4, s0 412; VI-NEXT: s_mov_b32 s5, s1 413; VI-NEXT: s_waitcnt lgkmcnt(0) 414; VI-NEXT: v_max_f16_e64 v0, s8, s8 415; VI-NEXT: v_max_f16_e64 v1, s2, s2 416; VI-NEXT: s_lshr_b32 s0, s8, 16 417; VI-NEXT: v_max_f16_e32 v0, v1, v0 418; VI-NEXT: v_max_f16_e64 v1, s0, s0 419; VI-NEXT: s_lshr_b32 s0, s2, 16 420; VI-NEXT: v_max_f16_e64 v2, s0, s0 421; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 422; VI-NEXT: v_or_b32_e32 v0, v0, v1 423; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 
424; VI-NEXT: s_endpgm 425; 426; GFX9-LABEL: maxnum_v2f16: 427; GFX9: ; %bb.0: ; %entry 428; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 429; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 430; GFX9-NEXT: s_mov_b32 s3, 0xf000 431; GFX9-NEXT: s_mov_b32 s2, -1 432; GFX9-NEXT: s_waitcnt lgkmcnt(0) 433; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 434; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 435; GFX9-NEXT: s_mov_b32 s0, s4 436; GFX9-NEXT: s_mov_b32 s1, s5 437; GFX9-NEXT: s_waitcnt lgkmcnt(0) 438; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 439; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 440; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 441; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 442; GFX9-NEXT: s_endpgm 443; 444; GFX10-LABEL: maxnum_v2f16: 445; GFX10: ; %bb.0: ; %entry 446; GFX10-NEXT: s_clause 0x1 447; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 448; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 449; GFX10-NEXT: s_waitcnt lgkmcnt(0) 450; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 451; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 452; GFX10-NEXT: s_mov_b32 s7, 0x31016000 453; GFX10-NEXT: s_mov_b32 s6, -1 454; GFX10-NEXT: s_waitcnt lgkmcnt(0) 455; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 456; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 457; GFX10-NEXT: v_pk_max_f16 v0, v1, v0 458; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 459; GFX10-NEXT: s_endpgm 460; 461; GFX11-LABEL: maxnum_v2f16: 462; GFX11: ; %bb.0: ; %entry 463; GFX11-NEXT: s_clause 0x1 464; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 465; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 466; GFX11-NEXT: s_waitcnt lgkmcnt(0) 467; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 468; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 469; GFX11-NEXT: s_mov_b32 s3, 0x31016000 470; GFX11-NEXT: s_waitcnt lgkmcnt(0) 471; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 472; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 473; GFX11-NEXT: s_mov_b32 s2, -1 474; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 475; GFX11-NEXT: v_pk_max_f16 v0, v1, v0 476; GFX11-NEXT: buffer_store_b32 v0, off, 
s[0:3], 0 477; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 478; GFX11-NEXT: s_endpgm 479 <2 x half> addrspace(1)* %r, 480 <2 x half> addrspace(1)* %a, 481 <2 x half> addrspace(1)* %b) #0 { 482entry: 483 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a 484 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b 485 %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> %b.val) 486 store <2 x half> %r.val, <2 x half> addrspace(1)* %r 487 ret void 488} 489 490define amdgpu_kernel void @maxnum_v2f16_imm_a( 491; SI-LABEL: maxnum_v2f16_imm_a: 492; SI: ; %bb.0: ; %entry 493; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 494; SI-NEXT: s_waitcnt lgkmcnt(0) 495; SI-NEXT: s_load_dword s2, s[2:3], 0x0 496; SI-NEXT: s_mov_b32 s3, 0xf000 497; SI-NEXT: s_waitcnt lgkmcnt(0) 498; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 499; SI-NEXT: s_lshr_b32 s2, s2, 16 500; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 501; SI-NEXT: s_mov_b32 s2, -1 502; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 503; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 504; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 505; SI-NEXT: v_max_f32_e32 v1, 4.0, v1 506; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 507; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 508; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 509; SI-NEXT: v_or_b32_e32 v0, v0, v1 510; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 511; SI-NEXT: s_endpgm 512; 513; VI-LABEL: maxnum_v2f16_imm_a: 514; VI: ; %bb.0: ; %entry 515; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 516; VI-NEXT: v_mov_b32_e32 v2, 0x4400 517; VI-NEXT: s_waitcnt lgkmcnt(0) 518; VI-NEXT: s_load_dword s4, s[2:3], 0x0 519; VI-NEXT: s_mov_b32 s3, 0xf000 520; VI-NEXT: s_mov_b32 s2, -1 521; VI-NEXT: s_waitcnt lgkmcnt(0) 522; VI-NEXT: v_max_f16_e64 v0, s4, s4 523; VI-NEXT: s_lshr_b32 s4, s4, 16 524; VI-NEXT: v_max_f16_e64 v1, s4, s4 525; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0 526; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 527; VI-NEXT: v_or_b32_e32 v0, v0, v1 528; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 529; VI-NEXT: s_endpgm 530; 531; GFX9-LABEL: maxnum_v2f16_imm_a: 532; GFX9: ; %bb.0: ; %entry 533; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 534; GFX9-NEXT: s_waitcnt lgkmcnt(0) 535; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 536; GFX9-NEXT: s_mov_b32 s3, 0xf000 537; GFX9-NEXT: s_mov_b32 s2, -1 538; GFX9-NEXT: s_waitcnt lgkmcnt(0) 539; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 540; GFX9-NEXT: s_mov_b32 s4, 0x44004200 541; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 542; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 543; GFX9-NEXT: s_endpgm 544; 545; GFX10-LABEL: maxnum_v2f16_imm_a: 546; GFX10: ; %bb.0: ; %entry 547; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 548; GFX10-NEXT: s_waitcnt lgkmcnt(0) 549; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 550; GFX10-NEXT: s_mov_b32 s3, 0x31016000 551; GFX10-NEXT: s_waitcnt lgkmcnt(0) 552; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 553; GFX10-NEXT: s_mov_b32 s2, -1 554; GFX10-NEXT: v_pk_max_f16 v0, 0x44004200, v0 555; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 556; GFX10-NEXT: s_endpgm 557; 558; GFX11-LABEL: maxnum_v2f16_imm_a: 559; GFX11: ; %bb.0: ; %entry 560; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 561; GFX11-NEXT: s_waitcnt lgkmcnt(0) 562; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 563; GFX11-NEXT: s_mov_b32 s3, 0x31016000 564; GFX11-NEXT: s_waitcnt lgkmcnt(0) 565; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 566; GFX11-NEXT: s_mov_b32 s2, -1 567; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 568; GFX11-NEXT: v_pk_max_f16 v0, 0x44004200, v0 569; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 570; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 571; GFX11-NEXT: s_endpgm 572 <2 x half> addrspace(1)* %r, 573 <2 x half> addrspace(1)* %b) #0 { 574entry: 575 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b 576 %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val) 577 store <2 x half> %r.val, <2 x half> addrspace(1)* %r 578 ret void 579} 580 581define 
amdgpu_kernel void @maxnum_v2f16_imm_b( 582; SI-LABEL: maxnum_v2f16_imm_b: 583; SI: ; %bb.0: ; %entry 584; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 585; SI-NEXT: s_waitcnt lgkmcnt(0) 586; SI-NEXT: s_load_dword s2, s[2:3], 0x0 587; SI-NEXT: s_mov_b32 s3, 0xf000 588; SI-NEXT: s_waitcnt lgkmcnt(0) 589; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 590; SI-NEXT: s_lshr_b32 s2, s2, 16 591; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 592; SI-NEXT: s_mov_b32 s2, -1 593; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 594; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 595; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 596; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 597; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 598; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 599; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 600; SI-NEXT: v_or_b32_e32 v0, v0, v1 601; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 602; SI-NEXT: s_endpgm 603; 604; VI-LABEL: maxnum_v2f16_imm_b: 605; VI: ; %bb.0: ; %entry 606; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 607; VI-NEXT: v_mov_b32_e32 v2, 0x4200 608; VI-NEXT: s_waitcnt lgkmcnt(0) 609; VI-NEXT: s_load_dword s4, s[2:3], 0x0 610; VI-NEXT: s_mov_b32 s3, 0xf000 611; VI-NEXT: s_mov_b32 s2, -1 612; VI-NEXT: s_waitcnt lgkmcnt(0) 613; VI-NEXT: v_max_f16_e64 v0, s4, s4 614; VI-NEXT: s_lshr_b32 s4, s4, 16 615; VI-NEXT: v_max_f16_e64 v1, s4, s4 616; VI-NEXT: v_max_f16_e32 v0, 4.0, v0 617; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 618; VI-NEXT: v_or_b32_e32 v0, v0, v1 619; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 620; VI-NEXT: s_endpgm 621; 622; GFX9-LABEL: maxnum_v2f16_imm_b: 623; GFX9: ; %bb.0: ; %entry 624; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 625; GFX9-NEXT: s_waitcnt lgkmcnt(0) 626; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 627; GFX9-NEXT: s_mov_b32 s3, 0xf000 628; GFX9-NEXT: s_mov_b32 s2, -1 629; GFX9-NEXT: s_waitcnt lgkmcnt(0) 630; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 631; GFX9-NEXT: s_mov_b32 s4, 0x42004400 632; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 633; 
GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 634; GFX9-NEXT: s_endpgm 635; 636; GFX10-LABEL: maxnum_v2f16_imm_b: 637; GFX10: ; %bb.0: ; %entry 638; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 639; GFX10-NEXT: s_waitcnt lgkmcnt(0) 640; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 641; GFX10-NEXT: s_mov_b32 s3, 0x31016000 642; GFX10-NEXT: s_waitcnt lgkmcnt(0) 643; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 644; GFX10-NEXT: s_mov_b32 s2, -1 645; GFX10-NEXT: v_pk_max_f16 v0, 0x42004400, v0 646; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 647; GFX10-NEXT: s_endpgm 648; 649; GFX11-LABEL: maxnum_v2f16_imm_b: 650; GFX11: ; %bb.0: ; %entry 651; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 652; GFX11-NEXT: s_waitcnt lgkmcnt(0) 653; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 654; GFX11-NEXT: s_mov_b32 s3, 0x31016000 655; GFX11-NEXT: s_waitcnt lgkmcnt(0) 656; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 657; GFX11-NEXT: s_mov_b32 s2, -1 658; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 659; GFX11-NEXT: v_pk_max_f16 v0, 0x42004400, v0 660; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 661; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 662; GFX11-NEXT: s_endpgm 663 <2 x half> addrspace(1)* %r, 664 <2 x half> addrspace(1)* %a) #0 { 665entry: 666 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a 667 %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>) 668 store <2 x half> %r.val, <2 x half> addrspace(1)* %r 669 ret void 670} 671 672; FIXME: Scalarize with undef half 673define amdgpu_kernel void @maxnum_v3f16( 674; SI-LABEL: maxnum_v3f16: 675; SI: ; %bb.0: ; %entry 676; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 677; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 678; SI-NEXT: s_waitcnt lgkmcnt(0) 679; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 680; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 681; SI-NEXT: s_mov_b32 s7, 0xf000 682; SI-NEXT: s_mov_b32 s6, -1 683; SI-NEXT: s_waitcnt lgkmcnt(0) 684; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 685; 
SI-NEXT: v_cvt_f32_f16_e32 v1, s2 686; SI-NEXT: s_lshr_b32 s2, s2, 16 687; SI-NEXT: s_lshr_b32 s3, s0, 16 688; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 689; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 690; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 691; SI-NEXT: v_cvt_f32_f16_e32 v4, s1 692; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 693; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 694; SI-NEXT: v_max_f32_e32 v2, v3, v2 695; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 696; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 697; SI-NEXT: v_max_f32_e32 v1, v1, v3 698; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 699; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 700; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 701; SI-NEXT: v_max_f32_e32 v0, v0, v3 702; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 703; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 704; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 705; SI-NEXT: v_or_b32_e32 v1, v1, v2 706; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 707; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 708; SI-NEXT: s_endpgm 709; 710; VI-LABEL: maxnum_v3f16: 711; VI: ; %bb.0: ; %entry 712; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 713; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 714; VI-NEXT: s_mov_b32 s7, 0xf000 715; VI-NEXT: s_mov_b32 s6, -1 716; VI-NEXT: s_waitcnt lgkmcnt(0) 717; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 718; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 719; VI-NEXT: s_mov_b32 s4, s0 720; VI-NEXT: s_mov_b32 s5, s1 721; VI-NEXT: s_waitcnt lgkmcnt(0) 722; VI-NEXT: v_max_f16_e64 v0, s8, s8 723; VI-NEXT: v_max_f16_e64 v1, s2, s2 724; VI-NEXT: s_lshr_b32 s0, s8, 16 725; VI-NEXT: v_max_f16_e32 v0, v1, v0 726; VI-NEXT: v_max_f16_e64 v1, s0, s0 727; VI-NEXT: s_lshr_b32 s0, s2, 16 728; VI-NEXT: v_max_f16_e64 v2, s0, s0 729; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 730; VI-NEXT: v_or_b32_e32 v0, v0, v1 731; VI-NEXT: v_max_f16_e64 v1, s9, s9 732; VI-NEXT: v_max_f16_e64 v2, s3, s3 733; VI-NEXT: v_max_f16_e32 v1, v2, v1 734; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 
735; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 736; VI-NEXT: s_endpgm 737; 738; GFX9-LABEL: maxnum_v3f16: 739; GFX9: ; %bb.0: ; %entry 740; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 741; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 742; GFX9-NEXT: s_mov_b32 s3, 0xf000 743; GFX9-NEXT: s_mov_b32 s2, -1 744; GFX9-NEXT: s_waitcnt lgkmcnt(0) 745; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 746; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 747; GFX9-NEXT: s_mov_b32 s0, s4 748; GFX9-NEXT: s_mov_b32 s1, s5 749; GFX9-NEXT: s_waitcnt lgkmcnt(0) 750; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 751; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 752; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 753; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 754; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 755; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 756; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 757; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 758; GFX9-NEXT: s_endpgm 759; 760; GFX10-LABEL: maxnum_v3f16: 761; GFX10: ; %bb.0: ; %entry 762; GFX10-NEXT: s_clause 0x1 763; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 764; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 765; GFX10-NEXT: s_waitcnt lgkmcnt(0) 766; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 767; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 768; GFX10-NEXT: s_mov_b32 s7, 0x31016000 769; GFX10-NEXT: s_mov_b32 s6, -1 770; GFX10-NEXT: s_waitcnt lgkmcnt(0) 771; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 772; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 773; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 774; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 775; GFX10-NEXT: v_pk_max_f16 v1, v2, v1 776; GFX10-NEXT: v_pk_max_f16 v0, v3, v0 777; GFX10-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 778; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 779; GFX10-NEXT: s_endpgm 780; 781; GFX11-LABEL: maxnum_v3f16: 782; GFX11: ; %bb.0: ; %entry 783; GFX11-NEXT: s_clause 0x1 784; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 785; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 
786; GFX11-NEXT: s_waitcnt lgkmcnt(0) 787; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 788; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 789; GFX11-NEXT: s_waitcnt lgkmcnt(0) 790; GFX11-NEXT: v_pk_max_f16 v1, s5, s5 791; GFX11-NEXT: v_pk_max_f16 v2, s3, s3 792; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 793; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 794; GFX11-NEXT: s_mov_b32 s3, 0x31016000 795; GFX11-NEXT: s_mov_b32 s2, -1 796; GFX11-NEXT: v_pk_max_f16 v1, v2, v1 797; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 798; GFX11-NEXT: v_pk_max_f16 v0, v3, v0 799; GFX11-NEXT: s_clause 0x1 800; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4 801; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 802; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 803; GFX11-NEXT: s_endpgm 804 <3 x half> addrspace(1)* %r, 805 <3 x half> addrspace(1)* %a, 806 <3 x half> addrspace(1)* %b) #0 { 807entry: 808 %a.val = load <3 x half>, <3 x half> addrspace(1)* %a 809 %b.val = load <3 x half>, <3 x half> addrspace(1)* %b 810 %r.val = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %a.val, <3 x half> %b.val) 811 store <3 x half> %r.val, <3 x half> addrspace(1)* %r 812 ret void 813} 814 815define amdgpu_kernel void @maxnum_v4f16( 816; SI-LABEL: maxnum_v4f16: 817; SI: ; %bb.0: ; %entry 818; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 819; SI-NEXT: s_mov_b32 s3, 0xf000 820; SI-NEXT: s_mov_b32 s2, -1 821; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 822; SI-NEXT: s_waitcnt lgkmcnt(0) 823; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 824; SI-NEXT: s_mov_b32 s0, s4 825; SI-NEXT: s_mov_b32 s1, s5 826; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 827; SI-NEXT: s_waitcnt lgkmcnt(0) 828; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 829; SI-NEXT: s_lshr_b32 s6, s6, 16 830; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 831; SI-NEXT: s_lshr_b32 s6, s7, 16 832; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 833; SI-NEXT: s_lshr_b32 s6, s5, 16 834; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 835; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 836; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 
837; SI-NEXT: s_lshr_b32 s4, s4, 16 838; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 839; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 840; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 841; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 842; SI-NEXT: v_max_f32_e32 v3, v3, v5 843; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 844; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 845; SI-NEXT: v_max_f32_e32 v1, v1, v5 846; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 847; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 848; SI-NEXT: v_max_f32_e32 v2, v2, v5 849; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 850; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 851; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 852; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 853; SI-NEXT: v_max_f32_e32 v0, v0, v4 854; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 855; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 856; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 857; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 858; SI-NEXT: v_or_b32_e32 v1, v1, v3 859; SI-NEXT: v_or_b32_e32 v0, v0, v2 860; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 861; SI-NEXT: s_endpgm 862; 863; VI-LABEL: maxnum_v4f16: 864; VI: ; %bb.0: ; %entry 865; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 866; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 867; VI-NEXT: s_mov_b32 s7, 0xf000 868; VI-NEXT: s_mov_b32 s6, -1 869; VI-NEXT: s_waitcnt lgkmcnt(0) 870; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 871; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 872; VI-NEXT: s_mov_b32 s4, s0 873; VI-NEXT: s_mov_b32 s5, s1 874; VI-NEXT: s_waitcnt lgkmcnt(0) 875; VI-NEXT: v_max_f16_e64 v0, s9, s9 876; VI-NEXT: v_max_f16_e64 v1, s3, s3 877; VI-NEXT: s_lshr_b32 s0, s9, 16 878; VI-NEXT: v_max_f16_e32 v0, v1, v0 879; VI-NEXT: v_max_f16_e64 v1, s0, s0 880; VI-NEXT: s_lshr_b32 s0, s3, 16 881; VI-NEXT: v_max_f16_e64 v2, s0, s0 882; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 883; VI-NEXT: v_or_b32_e32 v1, v0, v1 884; VI-NEXT: v_max_f16_e64 v0, s8, s8 885; VI-NEXT: v_max_f16_e64 v2, s2, s2 886; VI-NEXT: s_lshr_b32 s0, s8, 16 887; VI-NEXT: v_max_f16_e32 
v0, v2, v0 888; VI-NEXT: v_max_f16_e64 v2, s0, s0 889; VI-NEXT: s_lshr_b32 s0, s2, 16 890; VI-NEXT: v_max_f16_e64 v3, s0, s0 891; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 892; VI-NEXT: v_or_b32_e32 v0, v0, v2 893; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 894; VI-NEXT: s_endpgm 895; 896; GFX9-LABEL: maxnum_v4f16: 897; GFX9: ; %bb.0: ; %entry 898; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 899; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 900; GFX9-NEXT: s_mov_b32 s3, 0xf000 901; GFX9-NEXT: s_mov_b32 s2, -1 902; GFX9-NEXT: s_waitcnt lgkmcnt(0) 903; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 904; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 905; GFX9-NEXT: s_mov_b32 s0, s4 906; GFX9-NEXT: s_mov_b32 s1, s5 907; GFX9-NEXT: s_waitcnt lgkmcnt(0) 908; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 909; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 910; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 911; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 912; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 913; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 914; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 915; GFX9-NEXT: s_endpgm 916; 917; GFX10-LABEL: maxnum_v4f16: 918; GFX10: ; %bb.0: ; %entry 919; GFX10-NEXT: s_clause 0x1 920; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 921; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 922; GFX10-NEXT: s_waitcnt lgkmcnt(0) 923; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 924; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 925; GFX10-NEXT: s_mov_b32 s7, 0x31016000 926; GFX10-NEXT: s_mov_b32 s6, -1 927; GFX10-NEXT: s_waitcnt lgkmcnt(0) 928; GFX10-NEXT: v_pk_max_f16 v0, s1, s1 929; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 930; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 931; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 932; GFX10-NEXT: v_pk_max_f16 v1, v1, v0 933; GFX10-NEXT: v_pk_max_f16 v0, v3, v2 934; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 935; GFX10-NEXT: s_endpgm 936; 937; GFX11-LABEL: maxnum_v4f16: 
; ---------------------------------------------------------------------------
; Reviewer notes for the text that follows. The assertion lines in this file
; are autogenerated by utils/update_llc_test_checks.py; regenerate them with
; that script instead of editing them by hand.
;
; 1. The remaining GFX11 assertions and the IR body of @maxnum_v4f16 come
;    first. That kernel loads a <4 x half> from %a and from %b, calls
;    @llvm.maxnum.v4f16 on the two values and stores the result through %r.
;
; 2. @fmax_v4f16_imm_a performs the same call with the first operand replaced
;    by the constant vector <8.0, 2.0, 3.0, 4.0>, so only %b is loaded.
;    The immediates in its assertions are those constants re-encoded.
;      half   0x4800 = 8.0, 0x4000 = 2.0, 0x4200 = 3.0, 0x4400 = 4.0
;      packed 0x40004800 = elements 1 and 0, 0x44004200 = elements 3 and 2
;      float  0x41000000 = 8.0, 0x40400000 = 3.0
;    The float forms appear only for SI, whose assertions convert each half
;    to f32, take the max there and convert the result back to f16.
;
; 3. Attribute group #0, referenced by both kernels, sets the string attribute
;    "denormal-fp-math-f32" to "preserve-sign,preserve-sign".
; ---------------------------------------------------------------------------
938; GFX11: ; %bb.0: ; %entry 939; GFX11-NEXT: s_clause 0x1 940; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 941; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 942; GFX11-NEXT: s_waitcnt lgkmcnt(0) 943; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 944; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 945; GFX11-NEXT: s_waitcnt lgkmcnt(0) 946; GFX11-NEXT: v_pk_max_f16 v0, s5, s5 947; GFX11-NEXT: v_pk_max_f16 v1, s3, s3 948; GFX11-NEXT: v_pk_max_f16 v2, s4, s4 949; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 950; GFX11-NEXT: s_mov_b32 s3, 0x31016000 951; GFX11-NEXT: s_mov_b32 s2, -1 952; GFX11-NEXT: v_pk_max_f16 v1, v1, v0 953; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 954; GFX11-NEXT: v_pk_max_f16 v0, v3, v2 955; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 956; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 957; GFX11-NEXT: s_endpgm 958 <4 x half> addrspace(1)* %r, 959 <4 x half> addrspace(1)* %a, 960 <4 x half> addrspace(1)* %b) #0 { 961entry: 962 %a.val = load <4 x half>, <4 x half> addrspace(1)* %a 963 %b.val = load <4 x half>, <4 x half> addrspace(1)* %b 964 %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %a.val, <4 x half> %b.val) 965 store <4 x half> %r.val, <4 x half> addrspace(1)* %r 966 ret void 967} 968 969define amdgpu_kernel void @fmax_v4f16_imm_a( 970; SI-LABEL: fmax_v4f16_imm_a: 971; SI: ; %bb.0: ; %entry 972; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 973; SI-NEXT: s_waitcnt lgkmcnt(0) 974; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 975; SI-NEXT: s_mov_b32 s3, 0xf000 976; SI-NEXT: s_mov_b32 s2, -1 977; SI-NEXT: s_waitcnt lgkmcnt(0) 978; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 979; SI-NEXT: s_lshr_b32 s5, s5, 16 980; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 981; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 982; SI-NEXT: s_lshr_b32 s4, s4, 16 983; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 984; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 985; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 986; SI-NEXT: v_max_f32_e32 v2, 4.0, v2 987; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 988; SI-NEXT: v_max_f32_e32 v1, 
0x40400000, v1 989; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 990; SI-NEXT: v_max_f32_e32 v3, 2.0, v3 991; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 992; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 993; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 994; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0 995; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 996; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 997; SI-NEXT: v_or_b32_e32 v1, v1, v2 998; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 999; SI-NEXT: v_or_b32_e32 v0, v0, v2 1000; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1001; SI-NEXT: s_endpgm 1002; 1003; VI-LABEL: fmax_v4f16_imm_a: 1004; VI: ; %bb.0: ; %entry 1005; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1006; VI-NEXT: v_mov_b32_e32 v0, 0x4400 1007; VI-NEXT: s_mov_b32 s7, 0xf000 1008; VI-NEXT: s_mov_b32 s6, -1 1009; VI-NEXT: s_waitcnt lgkmcnt(0) 1010; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1011; VI-NEXT: s_mov_b32 s4, s0 1012; VI-NEXT: s_mov_b32 s5, s1 1013; VI-NEXT: s_waitcnt lgkmcnt(0) 1014; VI-NEXT: s_lshr_b32 s0, s3, 16 1015; VI-NEXT: v_max_f16_e64 v1, s3, s3 1016; VI-NEXT: v_max_f16_e64 v3, s0, s0 1017; VI-NEXT: v_max_f16_e64 v2, s2, s2 1018; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1 1019; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1020; VI-NEXT: s_lshr_b32 s0, s2, 16 1021; VI-NEXT: v_or_b32_e32 v1, v1, v0 1022; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2 1023; VI-NEXT: v_max_f16_e64 v2, s0, s0 1024; VI-NEXT: v_mov_b32_e32 v3, 0x4000 1025; VI-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1026; VI-NEXT: v_or_b32_e32 v0, v0, v2 1027; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1028; VI-NEXT: s_endpgm 1029; 1030; GFX9-LABEL: fmax_v4f16_imm_a: 1031; GFX9: ; %bb.0: ; %entry 1032; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1033; GFX9-NEXT: s_mov_b32 s8, 0x44004200 1034; GFX9-NEXT: s_mov_b32 s9, 0x40004800 1035; GFX9-NEXT: s_mov_b32 s7, 0xf000 1036; GFX9-NEXT: s_mov_b32 s6, -1 1037; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) 1038; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1039; GFX9-NEXT: s_mov_b32 s4, s0 1040; GFX9-NEXT: s_mov_b32 s5, s1 1041; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1042; GFX9-NEXT: v_pk_max_f16 v0, s3, s3 1043; GFX9-NEXT: v_pk_max_f16 v2, s2, s2 1044; GFX9-NEXT: v_pk_max_f16 v1, v0, s8 1045; GFX9-NEXT: v_pk_max_f16 v0, v2, s9 1046; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1047; GFX9-NEXT: s_endpgm 1048; 1049; GFX10-LABEL: fmax_v4f16_imm_a: 1050; GFX10: ; %bb.0: ; %entry 1051; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1052; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1053; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1054; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1055; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 1056; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 1057; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1058; GFX10-NEXT: s_mov_b32 s2, -1 1059; GFX10-NEXT: v_pk_max_f16 v1, 0x44004200, v0 1060; GFX10-NEXT: v_pk_max_f16 v0, 0x40004800, v2 1061; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1062; GFX10-NEXT: s_endpgm 1063; 1064; GFX11-LABEL: fmax_v4f16_imm_a: 1065; GFX11: ; %bb.0: ; %entry 1066; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1067; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1068; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 1069; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1070; GFX11-NEXT: v_pk_max_f16 v0, s3, s3 1071; GFX11-NEXT: v_pk_max_f16 v2, s2, s2 1072; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1073; GFX11-NEXT: s_mov_b32 s2, -1 1074; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1075; GFX11-NEXT: v_pk_max_f16 v1, 0x44004200, v0 1076; GFX11-NEXT: v_pk_max_f16 v0, 0x40004800, v2 1077; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1078; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1079; GFX11-NEXT: s_endpgm 1080 <4 x half> addrspace(1)* %r, 1081 <4 x half> addrspace(1)* %b) #0 { 1082entry: 1083 %b.val = load <4 x half>, <4 x half> addrspace(1)* %b 1084 %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> <half 8.0, 
half 2.0, half 3.0, half 4.0>, <4 x half> %b.val) 1085 store <4 x half> %r.val, <4 x half> addrspace(1)* %r 1086 ret void 1087} 1088 1089attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 1090