; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s

; Code generation tests for the llvm.minnum intrinsic on half, <2 x half>,
; <3 x half> and <4 x half> operands. Every function is compiled for the five
; processors named on the llc command lines above (tahiti, fiji, gfx900,
; gfx1010, gfx1100). The expected-output comments that follow each signature
; are produced by utils/update_llc_test_checks.py; regenerate them with that
; script instead of editing them by hand.

declare half @llvm.minnum.f16(half %a, half %b)
declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b)
declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)

; Scalar half minnum inside a compute kernel (amdgpu_kernel). Both operands
; come from volatile global loads. In the expected output each operand goes
; through a self-operation before the min: v_mul_f32 1.0, x on tahiti (where
; the value is first converted to f32) and v_max_f16 x, x on the other
; processors.
; NOTE(review): that pre-pass is presumably there to quiet signaling NaNs
; because kernels run with IEEE mode on -- confirm against the backend.
define amdgpu_kernel void @minnum_f16_ieee(
; SI-LABEL: minnum_f16_ieee:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s6
; SI-NEXT:    s_mov_b32 s13, s7
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_min_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_f16_ieee:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s6
; VI-NEXT:    s_mov_b32 s13, s7
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_max_f16_e32 v1, v1, v1
; VI-NEXT:    v_min_f16_e32 v0, v0, v1
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_f16_ieee:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_mov_b32 s14, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s12, s6
; GFX9-NEXT:    s_mov_b32 s13, s7
; GFX9-NEXT:    s_mov_b32 s15, s3
; GFX9-NEXT:    s_mov_b32 s10, s2
; GFX9-NEXT:    s_mov_b32 s11, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_f16_ieee:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s14, s2
; GFX10-NEXT:    s_mov_b32 s15, s3
; GFX10-NEXT:    s_mov_b32 s10, s2
; GFX10-NEXT:    s_mov_b32 s11, s3
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s12, s6
; GFX10-NEXT:    s_mov_b32 s13, s7
; GFX10-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, s4
; GFX10-NEXT:    s_mov_b32 s1, s5
; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: minnum_f16_ieee:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    s_mov_b32 s10, -1
; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-NEXT:    s_mov_b32 s14, s10
; GFX11-NEXT:    s_mov_b32 s15, s11
; GFX11-NEXT:    s_mov_b32 s2, s10
; GFX11-NEXT:    s_mov_b32 s3, s11
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s12, s6
; GFX11-NEXT:    s_mov_b32 s13, s7
; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_load_u16 v1, off, s[0:3], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s4
; GFX11-NEXT:    s_mov_b32 s9, s5
; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) #0 {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar half minnum in a pixel-shader function (amdgpu_ps) that receives
; both operands as arguments and returns the result. The expected output is
; a bare min with no v_max_f16 / v_mul_f32 pre-pass on the operands; tahiti
; only adds the f16 <-> f32 conversions around an f32 min.
define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 {
; SI-LABEL: minnum_f16_no_ieee:
; SI:       ; %bb.0:
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_min_f32_e32 v0, v0, v1
; SI-NEXT:    ; return to shader part epilog
;
; VI-LABEL: minnum_f16_no_ieee:
; VI:       ; %bb.0:
; VI-NEXT:    v_min_f16_e32 v0, v0, v1
; VI-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: minnum_f16_no_ieee:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: minnum_f16_no_ieee:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    v_min_f16_e32 v0, v0, v1
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %r.val = call half @llvm.minnum.f16(half %a, half %b)
  ret half %r.val
}

; Kernel variant with the constant 3.0 as the first operand. In the expected
; output the constant is encoded as a literal (0x40400000 as f32 on tahiti,
; 0x4200 as f16 elsewhere) and only the loaded operand gets the pre-pass.
define amdgpu_kernel void @minnum_f16_imm_a(
; SI-LABEL: minnum_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    v_min_f16_e32 v0, 0x4200, v0
; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_f16_imm_a:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s10, s6
; GFX10-NEXT:    s_mov_b32 s11, s7
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s8, s2
; GFX10-NEXT:    s_mov_b32 s9, s3
; GFX10-NEXT:    s_mov_b32 s4, s0
; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX10-NEXT:    s_mov_b32 s5, s1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX10-NEXT:    v_min_f16_e32 v0, 0x4200, v0
; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: minnum_f16_imm_a:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX11-NEXT:    v_min_f16_e32 v0, 0x4200, v0
; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %b) #0 {
entry:
  %b.val = load half, half addrspace(1)* %b
  %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Kernel variant with the constant 4.0 as the second operand. In the expected
; output the constant appears directly as the operand 4.0 (no hex literal)
; and is placed first in the min instruction, ahead of the loaded value.
define amdgpu_kernel void @minnum_f16_imm_b(
; SI-LABEL: minnum_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_f16_imm_b:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    v_min_f16_e32 v0, 4.0, v0
; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_f16_imm_b:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s10, s6
; GFX10-NEXT:    s_mov_b32 s11, s7
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s8, s2
; GFX10-NEXT:    s_mov_b32 s9, s3
; GFX10-NEXT:    s_mov_b32 s4, s0
; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX10-NEXT:    s_mov_b32 s5, s1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX10-NEXT:    v_min_f16_e32 v0, 4.0, v0
; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: minnum_f16_imm_b:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX11-NEXT:    v_min_f16_e32 v0, 4.0, v0
; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; <2 x half> minnum inside a kernel; the operand loads are not volatile and
; show up as scalar loads (s_load_dword / s_load_b32) in the expected output.
; tahiti and fiji process the two halves separately (fiji uses an SDWA min
; for the high half and ORs the halves together), while gfx900 and later use
; the packed v_pk_max_f16 / v_pk_min_f16 instructions.
define amdgpu_kernel void @minnum_v2f16_ieee(
; SI-LABEL: minnum_v2f16_ieee:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[6:7], 0x0
; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s2, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; SI-NEXT:    s_lshr_b32 s0, s0, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v2, v3, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_min_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_v2f16_ieee:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s8, s[4:5], 0x0
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s8, s8
; VI-NEXT:    v_max_f16_e64 v1, s2, s2
; VI-NEXT:    s_lshr_b32 s0, s8, 16
; VI-NEXT:    v_min_f16_e32 v0, v1, v0
; VI-NEXT:    v_max_f16_e64 v1, s0, s0
; VI-NEXT:    s_lshr_b32 s0, s2, 16
; VI-NEXT:    v_max_f16_e64 v2, s0, s0
; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_v2f16_ieee:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s11, s[6:7], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
; GFX9-NEXT:    v_pk_max_f16 v1, s11, s11
; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_v2f16_ieee:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT:    s_load_dword s1, s[6:7], 0x0
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
; GFX10-NEXT:    v_pk_max_f16 v1, s1, s1
; GFX10-NEXT:    v_pk_min_f16 v0, v1, v0
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: minnum_v2f16_ieee:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_pk_max_f16 v0, s4, s4
; GFX11-NEXT:    v_pk_max_f16 v1, s2, s2
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    v_pk_min_f16 v0, v1, v0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> minnum in a pixel-shader function taking both vectors as
; arguments. The expected output has no pre-pass on the operands: a single
; v_pk_min_f16 on gfx900 and later, an SDWA min plus a plain min combined
; with v_or_b32 on fiji, and two independent f32 mins on tahiti.
define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 {
; SI-LABEL: minnum_v2f16_no_ieee:
; SI:       ; %bb.0:
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_min_f32_e32 v0, v0, v2
; SI-NEXT:    v_min_f32_e32 v1, v1, v3
; SI-NEXT:    ; return to shader part epilog
;
; VI-LABEL: minnum_v2f16_no_ieee:
; VI:       ; %bb.0:
; VI-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_min_f16_e32 v0, v0, v1
; VI-NEXT:    v_or_b32_e32 v0, v0, v2
; VI-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: minnum_v2f16_no_ieee:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: minnum_v2f16_no_ieee:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    v_pk_min_f16 v0, v0, v1
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
  ret <2 x half> %r.val
}

; Kernel variant with the constant vector <3.0, 4.0> as the first operand.
; gfx900 and later fold it into the single 32-bit literal 0x44004200 (4.0 in
; the high half, 3.0 in the low half); fiji and tahiti apply the two
; constants to the two halves separately.
define amdgpu_kernel void @minnum_v2f16_imm_a(
; SI-LABEL: minnum_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    s_lshr_b32 s2, s2, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_min_f32_e32 v1, 4.0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s4, s4
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    v_max_f16_e64 v1, s4, s4
; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_v2f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_v2f16_imm_a:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_min_f16 v0, 0x44004200, v0
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: minnum_v2f16_imm_a:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_pk_max_f16 v0, s2, s2
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    v_pk_min_f16 v0, 0x44004200, v0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b) #0 {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Kernel variant with the constant vector <4.0, 3.0> as the second operand.
; This mirrors the previous function with the lanes swapped: gfx900 and
; later use the packed literal 0x42004400 (3.0 in the high half, 4.0 in the
; low half).
define amdgpu_kernel void @minnum_v2f16_imm_b(
; SI-LABEL: minnum_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    s_lshr_b32 s2, s2, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s4, s4
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    v_max_f16_e64 v1, s4, s4
; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_v2f16_imm_b:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_v2f16_imm_b:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_min_f16 v0, 0x42004400, v0
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: minnum_v2f16_imm_b:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_pk_max_f16 v0, s2, s2
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    v_pk_min_f16 v0, 0x42004400, v0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <3 x half> minnum inside a kernel. Each operand is fetched as 64 bits
; (s_load_dwordx2 / s_load_b64). In the expected output elements 0 and 1 are
; combined into a dword and written with a dword store, and element 2 is
; written by a separate 16-bit store at offset 4. On gfx900 and later the
; third element still goes through the packed v_pk_max_f16 / v_pk_min_f16
; instructions.
; NOTE(review): the FIXME below presumably refers to that packed handling of
; the odd element -- confirm with the original author.
; FIXME: Scalarize with undef half
define amdgpu_kernel void @minnum_v3f16(
; SI-LABEL: minnum_v3f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; SI-NEXT:    s_lshr_b32 s2, s2, 16
; SI-NEXT:    s_lshr_b32 s3, s0, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s3
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s2
; SI-NEXT:    v_cvt_f32_f16_e32 v5, s0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, s1
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_min_f32_e32 v2, v3, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_min_f32_e32 v1, v1, v3
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_min_f32_e32 v0, v0, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_v3f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s8, s8
; VI-NEXT:    v_max_f16_e64 v1, s2, s2
; VI-NEXT:    s_lshr_b32 s0, s8, 16
; VI-NEXT:    v_min_f16_e32 v0, v1, v0
; VI-NEXT:    v_max_f16_e64 v1, s0, s0
; VI-NEXT:    s_lshr_b32 s0, s2, 16
; VI-NEXT:    v_max_f16_e64 v2, s0, s0
; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    v_max_f16_e64 v1, s9, s9
; VI-NEXT:    v_max_f16_e64 v2, s3, s3
; VI-NEXT:    v_min_f16_e32 v1, v2, v1
; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_v3f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[6:7], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
; GFX9-NEXT:    v_pk_max_f16 v1, s12, s12
; GFX9-NEXT:    v_pk_max_f16 v2, s11, s11
; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2
; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_v3f16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v1, s1, s1
; GFX10-NEXT:    v_pk_max_f16 v2, s9, s9
; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
; GFX10-NEXT:    v_pk_min_f16 v1, v2, v1
; GFX10-NEXT:    v_pk_min_f16 v0, v3, v0
; GFX10-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: minnum_v3f16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_pk_max_f16 v1, s5, s5
; GFX11-NEXT:    v_pk_max_f16 v2, s3, s3
; GFX11-NEXT:    v_pk_max_f16 v0, s4, s4
; GFX11-NEXT:    v_pk_max_f16 v3, s2, s2
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    v_pk_min_f16 v1, v2, v1
; GFX11-NEXT:    v_pk_min_f16 v0, v3, v0
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 offset:4
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
    <3 x half> addrspace(1)* %r,
    <3 x half> addrspace(1)* %a,
    <3 x half> addrspace(1)* %b) #0 {
entry:
  %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
  %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
  %r.val = call <3 x half> @llvm.minnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
  store <3 x half> %r.val, <3 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @minnum_v4f16(
; SI-LABEL: minnum_v4f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
; SI-NEXT:    s_lshr_b32 s6, s6, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
; SI-NEXT:    s_lshr_b32 s6, s7, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s6
; SI-NEXT:    s_lshr_b32 s6, s5, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
; SI-NEXT:    v_cvt_f32_f16_e32
v4, s4 893; SI-NEXT: s_lshr_b32 s4, s4, 16 894; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 895; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 896; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 897; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 898; SI-NEXT: v_min_f32_e32 v3, v3, v5 899; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 900; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 901; SI-NEXT: v_min_f32_e32 v1, v1, v5 902; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 903; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 904; SI-NEXT: v_min_f32_e32 v2, v2, v5 905; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 906; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 907; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 908; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 909; SI-NEXT: v_min_f32_e32 v0, v0, v4 910; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 911; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 912; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 913; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 914; SI-NEXT: v_or_b32_e32 v1, v1, v3 915; SI-NEXT: v_or_b32_e32 v0, v0, v2 916; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 917; SI-NEXT: s_endpgm 918; 919; VI-LABEL: minnum_v4f16: 920; VI: ; %bb.0: ; %entry 921; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 922; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 923; VI-NEXT: s_mov_b32 s7, 0xf000 924; VI-NEXT: s_mov_b32 s6, -1 925; VI-NEXT: s_waitcnt lgkmcnt(0) 926; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 927; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 928; VI-NEXT: s_mov_b32 s4, s0 929; VI-NEXT: s_mov_b32 s5, s1 930; VI-NEXT: s_waitcnt lgkmcnt(0) 931; VI-NEXT: v_max_f16_e64 v0, s9, s9 932; VI-NEXT: v_max_f16_e64 v1, s3, s3 933; VI-NEXT: s_lshr_b32 s0, s9, 16 934; VI-NEXT: v_min_f16_e32 v0, v1, v0 935; VI-NEXT: v_max_f16_e64 v1, s0, s0 936; VI-NEXT: s_lshr_b32 s0, s3, 16 937; VI-NEXT: v_max_f16_e64 v2, s0, s0 938; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 939; VI-NEXT: v_or_b32_e32 v1, v0, v1 940; VI-NEXT: v_max_f16_e64 v0, s8, s8 941; VI-NEXT: v_max_f16_e64 v2, s2, s2 942; VI-NEXT: s_lshr_b32 s0, s8, 16 943; VI-NEXT: 
v_min_f16_e32 v0, v2, v0 944; VI-NEXT: v_max_f16_e64 v2, s0, s0 945; VI-NEXT: s_lshr_b32 s0, s2, 16 946; VI-NEXT: v_max_f16_e64 v3, s0, s0 947; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 948; VI-NEXT: v_or_b32_e32 v0, v0, v2 949; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 950; VI-NEXT: s_endpgm 951; 952; GFX9-LABEL: minnum_v4f16: 953; GFX9: ; %bb.0: ; %entry 954; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 955; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 956; GFX9-NEXT: s_mov_b32 s3, 0xf000 957; GFX9-NEXT: s_mov_b32 s2, -1 958; GFX9-NEXT: s_waitcnt lgkmcnt(0) 959; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 960; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 961; GFX9-NEXT: s_mov_b32 s0, s4 962; GFX9-NEXT: s_mov_b32 s1, s5 963; GFX9-NEXT: s_waitcnt lgkmcnt(0) 964; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 965; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 966; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 967; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 968; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 969; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 970; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 971; GFX9-NEXT: s_endpgm 972; 973; GFX10-LABEL: minnum_v4f16: 974; GFX10: ; %bb.0: ; %entry 975; GFX10-NEXT: s_clause 0x1 976; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 977; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 978; GFX10-NEXT: s_waitcnt lgkmcnt(0) 979; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 980; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 981; GFX10-NEXT: s_mov_b32 s7, 0x31016000 982; GFX10-NEXT: s_mov_b32 s6, -1 983; GFX10-NEXT: s_waitcnt lgkmcnt(0) 984; GFX10-NEXT: v_pk_max_f16 v0, s1, s1 985; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 986; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 987; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 988; GFX10-NEXT: v_pk_min_f16 v1, v1, v0 989; GFX10-NEXT: v_pk_min_f16 v0, v3, v2 990; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 991; GFX10-NEXT: s_endpgm 992; 993; GFX11-LABEL: 
minnum_v4f16: 994; GFX11: ; %bb.0: ; %entry 995; GFX11-NEXT: s_clause 0x1 996; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 997; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 998; GFX11-NEXT: s_waitcnt lgkmcnt(0) 999; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 1000; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 1001; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1002; GFX11-NEXT: v_pk_max_f16 v0, s5, s5 1003; GFX11-NEXT: v_pk_max_f16 v1, s3, s3 1004; GFX11-NEXT: v_pk_max_f16 v2, s4, s4 1005; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 1006; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1007; GFX11-NEXT: s_mov_b32 s2, -1 1008; GFX11-NEXT: v_pk_min_f16 v1, v1, v0 1009; GFX11-NEXT: v_pk_min_f16 v0, v3, v2 1010; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1011; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1012; GFX11-NEXT: s_endpgm 1013 <4 x half> addrspace(1)* %r, 1014 <4 x half> addrspace(1)* %a, 1015 <4 x half> addrspace(1)* %b) #0 { 1016entry: 1017 %a.val = load <4 x half>, <4 x half> addrspace(1)* %a 1018 %b.val = load <4 x half>, <4 x half> addrspace(1)* %b 1019 %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a.val, <4 x half> %b.val) 1020 store <4 x half> %r.val, <4 x half> addrspace(1)* %r 1021 ret void 1022} 1023 1024define amdgpu_kernel void @fmin_v4f16_imm_a( 1025; SI-LABEL: fmin_v4f16_imm_a: 1026; SI: ; %bb.0: ; %entry 1027; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1028; SI-NEXT: s_waitcnt lgkmcnt(0) 1029; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 1030; SI-NEXT: s_mov_b32 s3, 0xf000 1031; SI-NEXT: s_mov_b32 s2, -1 1032; SI-NEXT: s_waitcnt lgkmcnt(0) 1033; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 1034; SI-NEXT: s_lshr_b32 s5, s5, 16 1035; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 1036; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 1037; SI-NEXT: s_lshr_b32 s4, s4, 16 1038; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 1039; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 1040; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 1041; SI-NEXT: v_min_f32_e32 v2, 4.0, v2 1042; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 1043; SI-NEXT: 
v_min_f32_e32 v1, 0x40400000, v1 1044; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1045; SI-NEXT: v_min_f32_e32 v3, 2.0, v3 1046; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 1047; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1048; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 1049; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0 1050; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1051; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1052; SI-NEXT: v_or_b32_e32 v1, v1, v2 1053; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 1054; SI-NEXT: v_or_b32_e32 v0, v0, v2 1055; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1056; SI-NEXT: s_endpgm 1057; 1058; VI-LABEL: fmin_v4f16_imm_a: 1059; VI: ; %bb.0: ; %entry 1060; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1061; VI-NEXT: v_mov_b32_e32 v0, 0x4400 1062; VI-NEXT: s_mov_b32 s7, 0xf000 1063; VI-NEXT: s_mov_b32 s6, -1 1064; VI-NEXT: s_waitcnt lgkmcnt(0) 1065; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1066; VI-NEXT: s_mov_b32 s4, s0 1067; VI-NEXT: s_mov_b32 s5, s1 1068; VI-NEXT: s_waitcnt lgkmcnt(0) 1069; VI-NEXT: s_lshr_b32 s0, s3, 16 1070; VI-NEXT: v_max_f16_e64 v1, s3, s3 1071; VI-NEXT: v_max_f16_e64 v3, s0, s0 1072; VI-NEXT: v_max_f16_e64 v2, s2, s2 1073; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1 1074; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1075; VI-NEXT: s_lshr_b32 s0, s2, 16 1076; VI-NEXT: v_or_b32_e32 v1, v1, v0 1077; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2 1078; VI-NEXT: v_max_f16_e64 v2, s0, s0 1079; VI-NEXT: v_mov_b32_e32 v3, 0x4000 1080; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1081; VI-NEXT: v_or_b32_e32 v0, v0, v2 1082; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1083; VI-NEXT: s_endpgm 1084; 1085; GFX9-LABEL: fmin_v4f16_imm_a: 1086; GFX9: ; %bb.0: ; %entry 1087; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1088; GFX9-NEXT: s_mov_b32 s8, 0x44004200 1089; GFX9-NEXT: s_mov_b32 s9, 0x40004800 1090; GFX9-NEXT: s_mov_b32 s7, 0xf000 1091; GFX9-NEXT: s_mov_b32 
s6, -1 1092; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1093; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1094; GFX9-NEXT: s_mov_b32 s4, s0 1095; GFX9-NEXT: s_mov_b32 s5, s1 1096; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1097; GFX9-NEXT: v_pk_max_f16 v0, s3, s3 1098; GFX9-NEXT: v_pk_max_f16 v2, s2, s2 1099; GFX9-NEXT: v_pk_min_f16 v1, v0, s8 1100; GFX9-NEXT: v_pk_min_f16 v0, v2, s9 1101; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1102; GFX9-NEXT: s_endpgm 1103; 1104; GFX10-LABEL: fmin_v4f16_imm_a: 1105; GFX10: ; %bb.0: ; %entry 1106; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1107; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1108; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1109; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1110; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 1111; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 1112; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1113; GFX10-NEXT: s_mov_b32 s2, -1 1114; GFX10-NEXT: v_pk_min_f16 v1, 0x44004200, v0 1115; GFX10-NEXT: v_pk_min_f16 v0, 0x40004800, v2 1116; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1117; GFX10-NEXT: s_endpgm 1118; 1119; GFX11-LABEL: fmin_v4f16_imm_a: 1120; GFX11: ; %bb.0: ; %entry 1121; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1122; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1123; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 1124; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1125; GFX11-NEXT: v_pk_max_f16 v0, s3, s3 1126; GFX11-NEXT: v_pk_max_f16 v2, s2, s2 1127; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1128; GFX11-NEXT: s_mov_b32 s2, -1 1129; GFX11-NEXT: v_pk_min_f16 v1, 0x44004200, v0 1130; GFX11-NEXT: v_pk_min_f16 v0, 0x40004800, v2 1131; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1132; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1133; GFX11-NEXT: s_endpgm 1134 <4 x half> addrspace(1)* %r, 1135 <4 x half> addrspace(1)* %b) #0 { 1136entry: 1137 %b.val = load <4 x half>, <4 x half> addrspace(1)* %b 1138 %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val) 1139 store <4 x 
half> %r.val, <4 x half> addrspace(1)* %r 1140 ret void 1141} 1142 1143attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 1144