; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; FIXME: promotion not handled without f16 insts

; Prefix sharing across the four llc invocations above: the GCN prefix is
; common to the gfx900 and fiji runs, and the GFX10PLUS prefix is common to the
; gfx1010 and gfx1100 runs. GFX9, GFX8, GFX10 and GFX11 are each specific to a
; single run and are used where the expected output differs per target.

; Scalar half constrained fadd with round.tonearest and fpexcept.strict.
; Every run is expected to produce a single v_add_f16_e32 on v0 and v1.
define half @v_constained_fadd_f16_fpexcept_strict(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fadd_f16_fpexcept_strict:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_strict:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_add_f16_e32 v0, v0, v1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %val
}

; Same operation as the fpexcept.strict case, but with fpexcept.ignore.
; The expected instruction sequence is the same as for fpexcept.strict.
define half @v_constained_fadd_f16_fpexcept_ignore(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fadd_f16_fpexcept_ignore:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_ignore:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_add_f16_e32 v0, v0, v1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
  ret half %val
}

; Scalar half constrained fadd with round.tonearest and fpexcept.maytrap.
; Every run is expected to produce a single v_add_f16_e32 on v0 and v1.
define half @v_constained_fadd_f16_fpexcept_maytrap(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fadd_f16_fpexcept_maytrap:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_maytrap:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_add_f16_e32 v0, v0, v1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
  ret half %val
}

; <2 x half> constrained fadd with fpexcept.strict.
; The gfx900, gfx1010 and gfx1100 runs expect one v_pk_add_f16. The fiji run
; expects an SDWA v_add_f16 on the WORD_1 halves, a v_add_f16_e32 on v0 and v1,
; and a v_or_b32 that combines the two results.
define <2 x half> @v_constained_fadd_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-LABEL: v_constained_fadd_v2f16_fpexcept_strict:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fadd_v2f16_fpexcept_strict:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fadd_v2f16_fpexcept_strict:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_pk_add_f16 v0, v0, v1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <2 x half> %val
}

; <2 x half> constrained fadd with fpexcept.ignore.
; The expected sequences are the same as in the v2f16 fpexcept.strict test.
define <2 x half> @v_constained_fadd_v2f16_fpexcept_ignore(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-LABEL: v_constained_fadd_v2f16_fpexcept_ignore:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fadd_v2f16_fpexcept_ignore:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fadd_v2f16_fpexcept_ignore:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_pk_add_f16 v0, v0, v1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
  ret <2 x half> %val
}

; <2 x half> constrained fadd with fpexcept.maytrap.
; The expected sequences are the same as in the v2f16 fpexcept.strict test.
define <2 x half> @v_constained_fadd_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-LABEL: v_constained_fadd_v2f16_fpexcept_maytrap:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fadd_v2f16_fpexcept_maytrap:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fadd_v2f16_fpexcept_maytrap:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_pk_add_f16 v0, v0, v1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
  ret <2 x half> %val
}

; <3 x half> constrained fadd with fpexcept.strict.
; The gfx900, gfx1010 and gfx1100 runs expect a v_pk_add_f16 on v0 and v2
; followed by a v_add_f16_e32 on v1 and v3. The fiji run expects the
; SDWA/e32/or sequence on v0 and v2, then a v_add_f16_e32 on v1 and v3.
define <3 x half> @v_constained_fadd_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y) #0 {
; GFX9-LABEL: v_constained_fadd_v3f16_fpexcept_strict:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v0, v0, v2
; GFX9-NEXT: v_add_f16_e32 v1, v1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fadd_v3f16_fpexcept_strict:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: v_add_f16_e32 v1, v1, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fadd_v3f16_fpexcept_strict:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_pk_add_f16 v0, v0, v2
; GFX10PLUS-NEXT: v_add_f16_e32 v1, v1, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %val = call <3 x half> @llvm.experimental.constrained.fadd.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <3 x half> %val
}

; <4 x half> constrained fadd with fpexcept.strict.
; None of the expected sequences below use v_pk_add_f16. Each target performs
; four separate f16 adds and then repacks the halves into v0 and v1, which is
; the behaviour the FIXME below refers to.
; FIXME: Scalarized
define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y) #0 {
; GFX9-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_add_f16_e32 v1, v1, v3
; GFX9-NEXT: v_add_f16_e32 v0, v0, v2
; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v1, v1, v3
; GFX8-NEXT: v_add_f16_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_f16_e32 v4, v0, v2
; GFX10-NEXT: v_add_f16_e32 v5, v1, v3
; GFX10-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GFX11-NEXT: v_add_f16_e32 v0, v0, v2
; GFX11-NEXT: v_add_f16_e32 v1, v1, v3
; GFX11-NEXT: v_add_f16_e32 v2, v5, v4
; GFX11-NEXT: v_add_f16_e32 v3, v7, v6
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %val = call <4 x half> @llvm.experimental.constrained.fadd.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <4 x half> %val
}

; amdgpu_ps variant with both half operands marked inreg. The checks read the
; operands from s2 and s3. The gfx900 and fiji runs expect s3 to be copied into
; v0 before the v_add_f16_e32. The gfx1010 and gfx1100 runs expect a single
; v_add_f16_e64 that takes s2 and s3 directly.
define amdgpu_ps half @s_constained_fadd_f16_fpexcept_strict(half inreg %x, half inreg %y) #0 {
; GCN-LABEL: s_constained_fadd_f16_fpexcept_strict:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: v_add_f16_e32 v0, s2, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_constained_fadd_f16_fpexcept_strict:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: v_add_f16_e64 v0, s2, s3
; GFX10PLUS-NEXT: ; return to shader part epilog
  %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %val
}

; amdgpu_ps variant with both <2 x half> operands marked inreg (s2 and s3 in
; the checks). The gfx900 run expects s3 to be copied into v0 ahead of a
; v_pk_add_f16. The fiji run expects the high halves to be extracted with
; s_lshr_b32, added with an SDWA v_add_f16, and combined with a v_add_f16_e32
; result via v_or_b32. The gfx1010 and gfx1100 runs expect a single
; v_pk_add_f16 that takes s2 and s3 directly.
define amdgpu_ps <2 x half> @s_constained_fadd_v2f16_fpexcept_strict(<2 x half> inreg %x, <2 x half> inreg %y) #0 {
; GFX9-LABEL: s_constained_fadd_v2f16_fpexcept_strict:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_pk_add_f16 v0, s2, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_constained_fadd_v2f16_fpexcept_strict:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s0, s3, 16
; GFX8-NEXT: s_lshr_b32 s1, s2, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_f16_e32 v1, s2, v1
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_constained_fadd_v2f16_fpexcept_strict:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: v_pk_add_f16 v0, s2, s3
; GFX10PLUS-NEXT: ; return to shader part epilog
  %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <2 x half> %val
}

; Declarations of the constrained fadd intrinsics called by the tests above,
; one per tested element count (scalar, 2, 3 and 4 x half).
declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata) #1
declare <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half>, <2 x half>, metadata, metadata) #1
declare <3 x half> @llvm.experimental.constrained.fadd.v3f16(<3 x half>, <3 x half>, metadata, metadata) #1
declare <4 x half> @llvm.experimental.constrained.fadd.v4f16(<4 x half>, <4 x half>, metadata, metadata) #1

; Attribute group #0 is attached to every test function in this file.
; Attribute group #1 is attached to the intrinsic declarations.
attributes #0 = { strictfp }
attributes #1 = { inaccessiblememonly nounwind willreturn }