; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s

; Code generation tests for the llvm.minnum intrinsic on half, <2 x half>,
; <3 x half> and <4 x half> operands. Every function is compiled for four
; -mcpu values (tahiti, fiji, gfx900, gfx1010) and its assembly is matched by
; the autogenerated check lines placed inside the function signature.
; Regenerate those lines with utils/update_llc_test_checks.py instead of
; editing them by hand.

declare half @llvm.minnum.f16(half %a, half %b)
declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b)
declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)

; Scalar half minnum inside an amdgpu_kernel: both operands are read with
; volatile loads and the result is stored to %r. The expected assembly wraps
; each operand in "v_max_f16 x, x" before v_min_f16 (fiji, gfx900, gfx1010);
; on tahiti the values are converted to f32, multiplied by 1.0 and combined
; with v_min_f32.
; NOTE(review): the extra max/mul is presumably operand canonicalization for
; IEEE mode, as the function name suggests - confirm against the backend.
define amdgpu_kernel void @minnum_f16_ieee(
; SI-LABEL: minnum_f16_ieee:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s6
; SI-NEXT:    s_mov_b32 s13, s7
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_min_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_f16_ieee:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s6
; VI-NEXT:    s_mov_b32 s13, s7
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_max_f16_e32 v1, v1, v1
; VI-NEXT:    v_min_f16_e32 v0, v0, v1
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_f16_ieee:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_mov_b32 s14, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s12, s6
; GFX9-NEXT:    s_mov_b32 s13, s7
; GFX9-NEXT:    s_mov_b32 s15, s3
; GFX9-NEXT:    s_mov_b32 s10, s2
; GFX9-NEXT:    s_mov_b32 s11, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_f16_ieee:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s14, s2
; GFX10-NEXT:    s_mov_b32 s15, s3
; GFX10-NEXT:    s_mov_b32 s10, s2
; GFX10-NEXT:    s_mov_b32 s11, s3
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s12, s6
; GFX10-NEXT:    s_mov_b32 s13, s7
; GFX10-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, s4
; GFX10-NEXT:    s_mov_b32 s1, s5
; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) #0 {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Same operation in an amdgpu_ps function that takes both operands as
; arguments and returns the result. The expected assembly is a single
; v_min_f16; on tahiti each operand is converted f32 -> f16 -> f32 and then
; combined with v_min_f32. No v_max/v_mul is applied to the operands.
define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 {
; SI-LABEL: minnum_f16_no_ieee:
; SI:       ; %bb.0:
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_min_f32_e32 v0, v0, v1
; SI-NEXT:    ; return to shader part epilog
;
; VI-LABEL: minnum_f16_no_ieee:
; VI:       ; %bb.0:
; VI-NEXT:    v_min_f16_e32 v0, v0, v1
; VI-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: minnum_f16_no_ieee:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: minnum_f16_no_ieee:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
; GFX10-NEXT:    ; return to shader part epilog
  %r.val = call half @llvm.minnum.f16(half %a, half %b)
  ret half %r.val
}

; Constant 3.0 as the first operand. Only the loaded operand gets the
; v_max_f16 / "v_mul_f32 1.0" treatment; the constant shows up in the min
; instruction as the literal 0x4200 (0x40400000 for the f32 min on tahiti).
define amdgpu_kernel void @minnum_f16_imm_a(
; SI-LABEL: minnum_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    v_min_f16_e32 v0, 0x4200, v0
; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_f16_imm_a:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s10, s6
; GFX10-NEXT:    s_mov_b32 s11, s7
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s8, s2
; GFX10-NEXT:    s_mov_b32 s9, s3
; GFX10-NEXT:    s_mov_b32 s4, s0
; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX10-NEXT:    s_mov_b32 s5, s1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX10-NEXT:    v_min_f16_e32 v0, 0x4200, v0
; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %b) #0 {
entry:
  %b.val = load half, half addrspace(1)* %b
  %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Constant 4.0 as the second operand. It appears as the operand 4.0 of the
; expected min instruction on every target.
define amdgpu_kernel void @minnum_f16_imm_b(
; SI-LABEL: minnum_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_f16_imm_b:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    v_min_f16_e32 v0, 4.0, v0
; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_f16_imm_b:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s10, s6
; GFX10-NEXT:    s_mov_b32 s11, s7
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s8, s2
; GFX10-NEXT:    s_mov_b32 s9, s3
; GFX10-NEXT:    s_mov_b32 s4, s0
; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX10-NEXT:    s_mov_b32 s5, s1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX10-NEXT:    v_min_f16_e32 v0, 4.0, v0
; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; <2 x half> minnum in an amdgpu_kernel. gfx900 and gfx1010 are expected to
; use packed v_pk_max_f16 / v_pk_min_f16. fiji handles the low half with
; v_min_f16 and the high half with an SDWA v_min_f16 that writes WORD_1, then
; ORs the two results. tahiti computes each element in f32 and repacks the
; halves with a shift and an OR.
define amdgpu_kernel void @minnum_v2f16_ieee(
; SI-LABEL: minnum_v2f16_ieee:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[6:7], 0x0
; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s2, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; SI-NEXT:    s_lshr_b32 s0, s0, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v2, v3, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_min_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_v2f16_ieee:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s8, s[4:5], 0x0
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s8, s8
; VI-NEXT:    v_max_f16_e64 v1, s2, s2
; VI-NEXT:    s_lshr_b32 s0, s8, 16
; VI-NEXT:    v_min_f16_e32 v0, v1, v0
; VI-NEXT:    v_max_f16_e64 v1, s0, s0
; VI-NEXT:    s_lshr_b32 s0, s2, 16
; VI-NEXT:    v_max_f16_e64 v2, s0, s0
; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_v2f16_ieee:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s11, s[6:7], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
; GFX9-NEXT:    v_pk_max_f16 v1, s11, s11
; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_v2f16_ieee:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT:    s_load_dword s1, s[6:7], 0x0
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
; GFX10-NEXT:    v_pk_max_f16 v1, s1, s1
; GFX10-NEXT:    v_pk_min_f16 v0, v1, v0
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> minnum in an amdgpu_ps function with both operands passed as
; arguments. No v_max/v_mul is applied to the operands in the expected
; assembly.
define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 {
; SI-LABEL: minnum_v2f16_no_ieee:
; SI:       ; %bb.0:
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_min_f32_e32 v0, v0, v2
; SI-NEXT:    v_min_f32_e32 v1, v1, v3
; SI-NEXT:    ; return to shader part epilog
;
; VI-LABEL: minnum_v2f16_no_ieee:
; VI:       ; %bb.0:
; VI-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_min_f16_e32 v0, v0, v1
; VI-NEXT:    v_or_b32_e32 v0, v0, v2
; VI-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: minnum_v2f16_no_ieee:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: minnum_v2f16_no_ieee:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_pk_min_f16 v0, v0, v1
; GFX10-NEXT:    ; return to shader part epilog
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
  ret <2 x half> %r.val
}

; Constant vector <3.0, 4.0> as the first operand. On gfx900 and gfx1010 the
; expected packed min takes the single 32-bit literal 0x44004200.
define amdgpu_kernel void @minnum_v2f16_imm_a(
; SI-LABEL: minnum_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    s_lshr_b32 s2, s2, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_min_f32_e32 v1, 4.0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s4, s4
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    v_max_f16_e64 v1, s4, s4
; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_v2f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_v2f16_imm_a:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_min_f16 v0, 0x44004200, v0
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b) #0 {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Constant vector <4.0, 3.0> as the second operand. On gfx900 and gfx1010 the
; expected packed literal is 0x42004400.
define amdgpu_kernel void @minnum_v2f16_imm_b(
; SI-LABEL: minnum_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    s_lshr_b32 s2, s2, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s4, s4
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    v_max_f16_e64 v1, s4, s4
; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_v2f16_imm_b:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_v2f16_imm_b:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_min_f16 v0, 0x42004400, v0
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <3 x half> minnum in an amdgpu_kernel. The expected assembly writes the
; result with a dword store at offset 0 plus a short store at offset 4.
; FIXME: Scalarize with undef half
define amdgpu_kernel void @minnum_v3f16(
; SI-LABEL: minnum_v3f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; SI-NEXT:    s_lshr_b32 s2, s2, 16
; SI-NEXT:    s_lshr_b32 s3, s0, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s3
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s2
; SI-NEXT:    v_cvt_f32_f16_e32 v5, s0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, s1
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_min_f32_e32 v2, v3, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_min_f32_e32 v1, v1, v3
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_min_f32_e32 v0, v0, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_v3f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s8, s8
; VI-NEXT:    v_max_f16_e64 v1, s2, s2
; VI-NEXT:    s_lshr_b32 s0, s8, 16
; VI-NEXT:    v_min_f16_e32 v0, v1, v0
; VI-NEXT:    v_max_f16_e64 v1, s0, s0
; VI-NEXT:    s_lshr_b32 s0, s2, 16
; VI-NEXT:    v_max_f16_e64 v2, s0, s0
; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    v_max_f16_e64 v1, s9, s9
; VI-NEXT:    v_max_f16_e64 v2, s3, s3
; VI-NEXT:    v_min_f16_e32 v1, v2, v1
; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_v3f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[6:7], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
; GFX9-NEXT:    v_pk_max_f16 v1, s12, s12
; GFX9-NEXT:    v_pk_max_f16 v2, s11, s11
; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2
; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_v3f16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v1, s1, s1
; GFX10-NEXT:    v_pk_max_f16 v2, s9, s9
; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
; GFX10-NEXT:    v_pk_min_f16 v1, v2, v1
; GFX10-NEXT:    v_pk_min_f16 v0, v3, v0
; GFX10-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
    <3 x half> addrspace(1)* %r,
    <3 x half> addrspace(1)* %a,
    <3 x half> addrspace(1)* %b) #0 {
entry:
  %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
  %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
  %r.val = call <3 x half> @llvm.minnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
  store <3 x half> %r.val, <3 x half> addrspace(1)* %r
  ret void
}

; <4 x half> minnum in an amdgpu_kernel. The expected assembly writes the
; result with a single dwordx2 store; gfx900 and gfx1010 use two packed
; v_pk_min_f16 instructions.
define amdgpu_kernel void @minnum_v4f16(
; SI-LABEL: minnum_v4f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
; SI-NEXT:    s_lshr_b32 s6, s6, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
; SI-NEXT:    s_lshr_b32 s6, s7, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s6
; SI-NEXT:    s_lshr_b32 s6, s5, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
; SI-NEXT:    s_lshr_b32 s4, s4, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_min_f32_e32 v3, v3, v5
; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_min_f32_e32 v1, v1, v5
; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_min_f32_e32 v2, v2, v5
; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_min_f32_e32 v0, v0, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v1, v1, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_v4f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s9, s9
; VI-NEXT:    v_max_f16_e64 v1, s3, s3
; VI-NEXT:    s_lshr_b32 s0, s9, 16
; VI-NEXT:    v_min_f16_e32 v0, v1, v0
; VI-NEXT:    v_max_f16_e64 v1, s0, s0
; VI-NEXT:    s_lshr_b32 s0, s3, 16
; VI-NEXT:    v_max_f16_e64 v2, s0, s0
; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v1, v0, v1
; VI-NEXT:    v_max_f16_e64 v0, s8, s8
; VI-NEXT:    v_max_f16_e64 v2, s2, s2
; VI-NEXT:    s_lshr_b32 s0, s8, 16
; VI-NEXT:    v_min_f16_e32 v0, v2, v0
; VI-NEXT:    v_max_f16_e64 v2, s0, s0
; VI-NEXT:    s_lshr_b32 s0, s2, 16
; VI-NEXT:    v_max_f16_e64 v3, s0, s0
; VI-NEXT:    v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v2
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_v4f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[6:7], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s11, s11
; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
; GFX9-NEXT:    v_pk_max_f16 v2, s10, s10
; GFX9-NEXT:    v_pk_min_f16 v1, v1, v0
; GFX9-NEXT:    v_pk_max_f16 v0, s12, s12
; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: minnum_v4f16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v0, s1, s1
; GFX10-NEXT:    v_pk_max_f16 v1, s9, s9
; GFX10-NEXT:    v_pk_max_f16 v2, s0, s0
; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
; GFX10-NEXT:    v_pk_min_f16 v1, v1, v0
; GFX10-NEXT:    v_pk_min_f16 v0, v3, v2
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %a,
    <4 x half> addrspace(1)* %b) #0 {
entry:
  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @fmin_v4f16_imm_a(
; SI-LABEL: fmin_v4f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
; SI-NEXT:    s_lshr_b32 s5, s5, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
; SI-NEXT:    s_lshr_b32 s4, s4, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_min_f32_e32 v2, 4.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_min_f32_e32 v3, 2.0, v3
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fmin_v4f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s0, s3, 16
; VI-NEXT:    v_max_f16_e64 v1, s3, s3
; VI-NEXT:    v_max_f16_e64 v3, s0, s0
; VI-NEXT:    v_max_f16_e64 v2, s2, s2
; VI-NEXT:    v_min_f16_e32 v1, 0x4200, v1
; VI-NEXT:    v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    s_lshr_b32 s0, s2, 16
; VI-NEXT:    v_or_b32_e32 v1, v1, v0
; VI-NEXT:    v_min_f16_e32 v0, 0x4800, v2
; VI-NEXT:    v_max_f16_e64 v2, s0, s0
; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
; VI-NEXT:    v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v2
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fmin_v4f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s3, s3
; GFX9-NEXT:    v_pk_max_f16 v2, s2, s2
; GFX9-NEXT:    v_pk_min_f16 v1, v0, s8
; GFX9-NEXT:    v_pk_min_f16 v0, v2, s9
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
945; 946; GFX10-LABEL: fmin_v4f16_imm_a: 947; GFX10: ; %bb.0: ; %entry 948; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 949; GFX10-NEXT: s_waitcnt lgkmcnt(0) 950; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 951; GFX10-NEXT: s_waitcnt lgkmcnt(0) 952; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 953; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 954; GFX10-NEXT: s_mov_b32 s3, 0x31016000 955; GFX10-NEXT: s_mov_b32 s2, -1 956; GFX10-NEXT: v_pk_min_f16 v1, 0x44004200, v0 957; GFX10-NEXT: v_pk_min_f16 v0, 0x40004800, v2 958; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 959; GFX10-NEXT: s_endpgm 960 <4 x half> addrspace(1)* %r, 961 <4 x half> addrspace(1)* %b) #0 { 962entry: 963 %b.val = load <4 x half>, <4 x half> addrspace(1)* %b 964 %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val) 965 store <4 x half> %r.val, <4 x half> addrspace(1)* %r 966 ret void 967} 968 969attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 970