; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=SI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s

; Codegen test for the llvm.maxnum intrinsic on half and on 2-, 3- and
; 4-element half vectors. Every kernel below is compiled once per -mcpu value
; listed above (tahiti, fiji, gfx900, gfx1010) and its complete instruction
; stream is checked.
;
; The check blocks are generated output. Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

declare half @llvm.maxnum.f16(half %a, half %b)
declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b)
declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)

; Scalar half maxnum of two values that are both read with volatile loads.
define amdgpu_kernel void @maxnum_f16(
; SI-LABEL: maxnum_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s6
; SI-NEXT:    s_mov_b32 s13, s7
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_max_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s6
; VI-NEXT:    s_mov_b32 s13, s7
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_max_f16_e32 v1, v1, v1
; VI-NEXT:    v_max_f16_e32 v0, v0, v1
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_mov_b32 s14, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s12, s6
; GFX9-NEXT:    s_mov_b32 s13, s7
; GFX9-NEXT:    s_mov_b32 s15, s3
; GFX9-NEXT:    s_mov_b32 s10, s2
; GFX9-NEXT:    s_mov_b32 s11, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: maxnum_f16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s14, s2
; GFX10-NEXT:    s_mov_b32 s15, s3
; GFX10-NEXT:    s_mov_b32 s10, s2
; GFX10-NEXT:    s_mov_b32 s11, s3
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s12, s6
; GFX10-NEXT:    s_mov_b32 s13, s7
; GFX10-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, s4
; GFX10-NEXT:    s_mov_b32 s1, s5
; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) #0 {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar half maxnum with the constant 3.0 as the first operand.
define amdgpu_kernel void @maxnum_f16_imm_a(
; SI-LABEL: maxnum_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    v_max_f16_e32 v0, 0x4200, v0
; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: maxnum_f16_imm_a:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s10, s6
; GFX10-NEXT:    s_mov_b32 s11, s7
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s8, s2
; GFX10-NEXT:    s_mov_b32 s9, s3
; GFX10-NEXT:    s_mov_b32 s4, s0
; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX10-NEXT:    s_mov_b32 s5, s1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX10-NEXT:    v_max_f16_e32 v0, 0x4200, v0
; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %b) #0 {
entry:
  %b.val = load half, half addrspace(1)* %b
  %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar half maxnum with the constant 4.0 as the second operand.
define amdgpu_kernel void @maxnum_f16_imm_b(
; SI-LABEL: maxnum_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_f16_imm_b:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    v_max_f16_e32 v0, 4.0, v0
; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: maxnum_f16_imm_b:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s10, s6
; GFX10-NEXT:    s_mov_b32 s11, s7
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s8, s2
; GFX10-NEXT:    s_mov_b32 s9, s3
; GFX10-NEXT:    s_mov_b32 s4, s0
; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX10-NEXT:    s_mov_b32 s5, s1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX10-NEXT:    v_max_f16_e32 v0, 4.0, v0
; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; maxnum of two <2 x half> vectors loaded from memory.
define amdgpu_kernel void @maxnum_v2f16(
; SI-LABEL: maxnum_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[6:7], 0x0
; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s2, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; SI-NEXT:    s_lshr_b32 s0, s0, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_max_f32_e32 v2, v3, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_max_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s8, s[4:5], 0x0
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s8, s8
; VI-NEXT:    v_max_f16_e64 v1, s2, s2
; VI-NEXT:    s_lshr_b32 s0, s8, 16
; VI-NEXT:    v_max_f16_e32 v0, v1, v0
; VI-NEXT:    v_max_f16_e64 v1, s0, s0
; VI-NEXT:    s_lshr_b32 s0, s2, 16
; VI-NEXT:    v_max_f16_e64 v2, s0, s0
; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_v2f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s11, s[6:7], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
; GFX9-NEXT:    v_pk_max_f16 v1, s11, s11
; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: maxnum_v2f16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT:    s_load_dword s1, s[6:7], 0x0
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
; GFX10-NEXT:    v_pk_max_f16 v1, s1, s1
; GFX10-NEXT:    v_pk_max_f16 v0, v1, v0
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> maxnum with the constant vector <3.0, 4.0> as the first operand.
define amdgpu_kernel void @maxnum_v2f16_imm_a(
; SI-LABEL: maxnum_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    s_lshr_b32 s2, s2, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_max_f32_e32 v1, 4.0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s4, s4
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    v_max_f16_e64 v1, s4, s4
; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_v2f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: maxnum_v2f16_imm_a:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_max_f16 v0, 0x44004200, v0
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b) #0 {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> maxnum with the constant vector <4.0, 3.0> as the second operand.
define amdgpu_kernel void @maxnum_v2f16_imm_b(
; SI-LABEL: maxnum_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    s_lshr_b32 s2, s2, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s4, s4
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    v_max_f16_e64 v1, s4, s4
; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_v2f16_imm_b:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: maxnum_v2f16_imm_b:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_max_f16 v0, 0x42004400, v0
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; maxnum of two <3 x half> vectors loaded from memory.
; FIXME: Scalarize with undef half
define amdgpu_kernel void @maxnum_v3f16(
; SI-LABEL: maxnum_v3f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; SI-NEXT:    s_lshr_b32 s2, s2, 16
; SI-NEXT:    s_lshr_b32 s3, s0, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s3
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s2
; SI-NEXT:    v_cvt_f32_f16_e32 v5, s0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, s1
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_max_f32_e32 v2, v3, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_max_f32_e32 v1, v1, v3
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_max_f32_e32 v0, v0, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_v3f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s8, s8
; VI-NEXT:    v_max_f16_e64 v1, s2, s2
; VI-NEXT:    s_lshr_b32 s0, s8, 16
; VI-NEXT:    v_max_f16_e32 v0, v1, v0
; VI-NEXT:    v_max_f16_e64 v1, s0, s0
; VI-NEXT:    s_lshr_b32 s0, s2, 16
; VI-NEXT:    v_max_f16_e64 v2, s0, s0
; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    v_max_f16_e64 v1, s9, s9
; VI-NEXT:    v_max_f16_e64 v2, s3, s3
; VI-NEXT:    v_max_f16_e32 v1, v2, v1
; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_v3f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[6:7], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
; GFX9-NEXT:    v_pk_max_f16 v1, s12, s12
; GFX9-NEXT:    v_pk_max_f16 v2, s11, s11
; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
; GFX9-NEXT:    v_pk_max_f16 v1, v1, v2
; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: maxnum_v3f16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v1, s1, s1
; GFX10-NEXT:    v_pk_max_f16 v2, s9, s9
; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
; GFX10-NEXT:    v_pk_max_f16 v1, v2, v1
; GFX10-NEXT:    v_pk_max_f16 v0, v3, v0
; GFX10-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
    <3 x half> addrspace(1)* %r,
    <3 x half> addrspace(1)* %a,
    <3 x half> addrspace(1)* %b) #0 {
entry:
  %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
  %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
  %r.val = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
  store <3 x half> %r.val, <3 x half> addrspace(1)* %r
  ret void
}

; maxnum of two <4 x half> vectors loaded from memory.
define amdgpu_kernel void @maxnum_v4f16(
; SI-LABEL: maxnum_v4f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
; SI-NEXT:    s_lshr_b32 s6, s6, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
; SI-NEXT:    s_lshr_b32 s6, s7, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s6
; SI-NEXT:    s_lshr_b32 s6, s5, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
; SI-NEXT:    s_lshr_b32 s4, s4, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_max_f32_e32 v3, v3, v5
; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_max_f32_e32 v1, v1, v5
; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_max_f32_e32 v2, v2, v5
; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_max_f32_e32 v0, v0, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v1, v1, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_v4f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s9, s9
; VI-NEXT:    v_max_f16_e64 v1, s3, s3
; VI-NEXT:    s_lshr_b32 s0, s9, 16
; VI-NEXT:    v_max_f16_e32 v0, v1, v0
; VI-NEXT:    v_max_f16_e64 v1, s0, s0
; VI-NEXT:    s_lshr_b32 s0, s3, 16
; VI-NEXT:    v_max_f16_e64 v2, s0, s0
; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v1, v0, v1
; VI-NEXT:    v_max_f16_e64 v0, s8, s8
; VI-NEXT:    v_max_f16_e64 v2, s2, s2
; VI-NEXT:    s_lshr_b32 s0, s8, 16
; VI-NEXT:    v_max_f16_e32 v0, v2, v0
; VI-NEXT:    v_max_f16_e64 v2, s0, s0
; VI-NEXT:    s_lshr_b32 s0, s2, 16
; VI-NEXT:    v_max_f16_e64 v3, s0, s0
; VI-NEXT:    v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v2
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_v4f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[6:7], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s11, s11
; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
; GFX9-NEXT:    v_pk_max_f16 v2, s10, s10
; GFX9-NEXT:    v_pk_max_f16 v1, v1, v0
; GFX9-NEXT:    v_pk_max_f16 v0, s12, s12
; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: maxnum_v4f16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v0, s1, s1
; GFX10-NEXT:    v_pk_max_f16 v1, s9, s9
; GFX10-NEXT:    v_pk_max_f16 v2, s0, s0
; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
; GFX10-NEXT:    v_pk_max_f16 v1, v1, v0
; GFX10-NEXT:    v_pk_max_f16 v0, v3, v2
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %a,
    <4 x half> addrspace(1)* %b) #0 {
entry:
  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}

; <4 x half> maxnum with the constant vector <8.0, 2.0, 3.0, 4.0> as the first
; operand.
define amdgpu_kernel void @fmax_v4f16_imm_a(
; SI-LABEL: fmax_v4f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
; SI-NEXT:    s_lshr_b32 s5, s5, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
; SI-NEXT:    s_lshr_b32 s4, s4, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_max_f32_e32 v2, 4.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_max_f32_e32 v3, 2.0, v3
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_max_f32_e32 v0, 0x41000000, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fmax_v4f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s0, s3, 16
; VI-NEXT:    v_max_f16_e64 v1, s3, s3
; VI-NEXT:    v_max_f16_e64 v3, s0, s0
; VI-NEXT:    v_max_f16_e64 v2, s2, s2
; VI-NEXT:    v_max_f16_e32 v1, 0x4200, v1
; VI-NEXT:    v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    s_lshr_b32 s0, s2, 16
; VI-NEXT:    v_or_b32_e32 v1, v1, v0
; VI-NEXT:    v_max_f16_e32 v0, 0x4800, v2
; VI-NEXT:    v_max_f16_e64 v2, s0, s0
; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
; VI-NEXT:    v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v2
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fmax_v4f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s3, s3
; GFX9-NEXT:    v_pk_max_f16 v2, s2, s2
; GFX9-NEXT:    v_pk_max_f16 v1, v0, s8
; GFX9-NEXT:    v_pk_max_f16 v0, v2, s9
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: fmax_v4f16_imm_a:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_max_f16 v0, s3, s3
; GFX10-NEXT:    v_pk_max_f16 v2, s2, s2
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_max_f16 v1, 0x44004200, v0
; GFX10-NEXT:    v_pk_max_f16 v0, 0x40004800, v2
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %b) #0 {
entry:
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}

; Attribute group shared by every kernel above: sets the f32 denormal mode
; attribute to preserve-sign.
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }