; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
; FIXME: promotion not handled without f16 insts

; Code generation checks for llvm.experimental.constrained.fsub on half,
; <2 x half>, <3 x half> and <4 x half> operands. Every call in this file uses
; the "round.tonearest" rounding mode; the exception-behavior metadata
; (strict, ignore or maytrap) is the part that varies between functions.
;
; The gfx900 and fiji invocations share the GCN prefix and additionally use
; GFX9 and GFX8 respectively; the gfx1010 invocation uses only GFX10.
; The v_* functions use the default calling convention; the s_* functions are
; amdgpu_ps with inreg arguments.

; Scalar half, fpexcept.strict. All targets are expected to emit one v_sub_f16.
define half @v_constained_fsub_f16_fpexcept_strict(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fsub_f16_fpexcept_strict:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_f16_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fsub_f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %val
}

; Scalar half, fpexcept.ignore. Expected output matches the strict variant.
define half @v_constained_fsub_f16_fpexcept_ignore(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fsub_f16_fpexcept_ignore:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_f16_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fsub_f16_fpexcept_ignore:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
  ret half %val
}

; Scalar half, fpexcept.maytrap. Expected output matches the strict variant.
define half @v_constained_fsub_f16_fpexcept_maytrap(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fsub_f16_fpexcept_maytrap:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_f16_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fsub_f16_fpexcept_maytrap:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
  ret half %val
}

; <2 x half>, fpexcept.strict. The expected code subtracts the low and high
; halves with two separate v_sub_f16 instructions (the high half via SDWA)
; and then repacks the result into v0.
define <2 x half> @v_constained_fsub_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v2
; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <2 x half> %val
}

; <2 x half>, fpexcept.ignore. Expected output matches the strict variant.
define <2 x half> @v_constained_fsub_v2f16_fpexcept_ignore(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v2
; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
  ret <2 x half> %val
}

; <2 x half>, fpexcept.maytrap. Expected output matches the strict variant.
define <2 x half> @v_constained_fsub_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v2
; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
  ret <2 x half> %val
}

; <3 x half>, fpexcept.strict. The checks expect three v_sub_f16 operations:
; two for the pair packed in v0 and one for the element held in v1.
define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y) #0 {
; GFX9-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v2
; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
; GFX9-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_f16_e32 v0, v0, v2
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
; GFX8-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_sub_f16_e32 v4, v0, v2
; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v4
; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call <3 x half> @llvm.experimental.constrained.fsub.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <3 x half> %val
}

; FIXME: Scalarized
define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y) #0 {
; GFX9-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT:    v_sub_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v2
; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX8-NEXT:    v_sub_f16_e32 v0, v0, v2
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_sub_f16_e32 v4, v0, v2
; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
; GFX10-NEXT:    v_sub_f16_e32 v6, v1, v3
; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT:    v_and_b32_e32 v2, v5, v4
; GFX10-NEXT:    v_and_b32_e32 v3, v5, v6
; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call <4 x half> @llvm.experimental.constrained.fsub.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <4 x half> %val
}

; amdgpu_ps variant with inreg scalar half operands (s2 and s3 in the checks).
; The gfx900 and fiji checks copy s3 into a VGPR before the subtract; the
; gfx1010 checks use the e64 encoding with both SGPRs as direct operands.
define amdgpu_ps half @s_constained_fsub_f16_fpexcept_strict(half inreg %x, half inreg %y) #0 {
; GCN-LABEL: s_constained_fsub_f16_fpexcept_strict:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v0, s3
; GCN-NEXT:    v_sub_f16_e32 v0, s2, v0
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_constained_fsub_f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_sub_f16_e64 v0, s2, s3
; GFX10-NEXT:    ; return to shader part epilog
  %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %val
}

; amdgpu_ps variant with inreg <2 x half> operands. The checks expect the high
; halves to be extracted with s_lshr_b32 by 16 before the per-element subtracts.
define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> inreg %x, <2 x half> inreg %y) #0 {
; GFX9-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_lshr_b32 s0, s3, 16
; GFX9-NEXT:    s_lshr_b32 s1, s2, 16
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_sub_f16_e32 v0, s1, v0
; GFX9-NEXT:    v_sub_f16_e32 v1, s2, v1
; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_lshr_b32 s0, s3, 16
; GFX8-NEXT:    s_lshr_b32 s1, s2, 16
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_sub_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_sub_f16_e32 v1, s2, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_sub_f16_e64 v0, s2, s3
; GFX10-NEXT:    s_lshr_b32 s0, s3, 16
; GFX10-NEXT:    s_lshr_b32 s1, s2, 16
; GFX10-NEXT:    v_sub_f16_e64 v1, s1, s0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT:    ; return to shader part epilog
  %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <2 x half> %val
}

; Constrained fsub intrinsic declarations, one per operand type used above.
declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata) #1
declare <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half>, <2 x half>, metadata, metadata) #1
declare <3 x half> @llvm.experimental.constrained.fsub.v3f16(<3 x half>, <3 x half>, metadata, metadata) #1
declare <4 x half> @llvm.experimental.constrained.fsub.v4f16(<4 x half>, <4 x half>, metadata, metadata) #1

; #0 is applied to every test function; #1 to the intrinsic declarations.
attributes #0 = { strictfp }
attributes #1 = { inaccessiblememonly nounwind willreturn }