; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10 %s

; Code generation checks for llvm.experimental.constrained.fma on half and
; half-vector operands. Every call uses "round.tonearest" rounding and
; "fpexcept.strict" exception behavior, and every test function is strictfp.

; Scalar f16: each configuration is expected to emit a single v_fma_f16.
define half @v_constained_fma_f16_fpexcept_strict(half %x, half %y, half %z) #0 {
; GCN-LABEL: v_constained_fma_f16_fpexcept_strict:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_fma_f16 v0, v0, v1, v2
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_fma_f16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %val
}

; <2 x half>: the gfx900 and GFX10-prefixed runs expect one v_pk_fma_f16; the
; fiji run expects two scalar v_fma_f16 whose results are repacked.
define <2 x half> @v_constained_fma_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
; GFX9-LABEL: v_constained_fma_v2f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; GFX8-NEXT:    v_fma_f16 v3, v5, v4, v3
; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT:    v_fma_f16 v0, v0, v1, v2
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fma_v2f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <2 x half> %val
}

; <3 x half>: the gfx900 and GFX10-prefixed runs expect a v_pk_fma_f16 plus one
; v_fma_f16 for the third element; the fiji run expects three v_fma_f16.
define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y, <3 x half> %z) #0 {
; GFX9-LABEL: v_constained_fma_v3f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
; GFX9-NEXT:    v_fma_f16 v1, v1, v3, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fma_v3f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
; GFX8-NEXT:    v_fma_f16 v6, v8, v7, v6
; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fma_v3f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
; GFX10-NEXT:    v_fma_f16 v1, v1, v3, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <3 x half> %val
}

; <4 x half>: every configuration is expected to use four scalar f16 FMA
; instructions (v_fma_f16, or v_fmac_f16 under the GFX10 prefix) and then
; repack the halves into two 32-bit registers.
define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y, <4 x half> %z) #0 {
; GFX9-LABEL: v_constained_fma_v4f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
; GFX9-NEXT:    v_fma_f16 v6, v8, v7, v6
; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
; GFX9-NEXT:    v_fma_f16 v1, v1, v3, v5
; GFX9-NEXT:    v_fma_f16 v0, v0, v2, v4
; GFX9-NEXT:    v_fma_f16 v7, v9, v8, v7
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fma_v4f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
; GFX8-NEXT:    v_fma_f16 v6, v8, v7, v6
; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
; GFX8-NEXT:    v_fma_f16 v7, v9, v8, v7
; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fma_v4f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
; GFX10-NEXT:    v_fmac_f16_e32 v4, v0, v2
; GFX10-NEXT:    v_fmac_f16_e32 v5, v1, v3
; GFX10-NEXT:    v_fmac_f16_e32 v6, v8, v7
; GFX10-NEXT:    v_fmac_f16_e32 v9, v11, v10
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v5
; GFX10-NEXT:    v_lshl_or_b32 v0, v9, 16, v0
; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <4 x half> %val
}

; fneg of the addend is expected to fold into the v_fma_f16 source modifier.
define half @v_constained_fma_f16_fpexcept_strict_fneg(half %x, half %y, half %z) #0 {
; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fneg:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_fma_f16 v0, v0, v1, -v2
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict_fneg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_fma_f16 v0, v0, v1, -v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.z = fneg half %z
  %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %val
}

; fneg of both multiplicands is expected to fold into source modifiers.
define half @v_constained_fma_f16_fpexcept_strict_fneg_fneg(half %x, half %y, half %z) #0 {
; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_fma_f16 v0, -v0, -v1, v2
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_fma_f16 v0, -v0, -v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.x = fneg half %x
  %neg.y = fneg half %y
  %val = call half @llvm.experimental.constrained.fma.f16(half %neg.x, half %neg.y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %val
}

; fabs of both multiplicands is expected to fold into |src| source modifiers.
define half @v_constained_fma_f16_fpexcept_strict_fabs_fabs(half %x, half %y, half %z) #0 {
; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_fma_f16 v0, |v0|, |v1|, v2
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_fma_f16 v0, |v0|, |v1|, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %abs.x = call half @llvm.fabs.f16(half %x)
  %abs.y = call half @llvm.fabs.f16(half %y)
  %val = call half @llvm.experimental.constrained.fma.f16(half %abs.x, half %abs.y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %val
}

; Vector fneg of both multiplicands: the packed form is expected to use
; neg_lo/neg_hi, and the scalarized fiji form per-operand negate modifiers.
define <2 x half> @v_constained_fma_v2f16_fpexcept_strict_fneg_fneg(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
; GFX9-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; GFX8-NEXT:    v_fma_f16 v3, -v5, -v4, v3
; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT:    v_fma_f16 v0, -v0, -v1, v2
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.x = fneg <2 x half> %x
  %neg.y = fneg <2 x half> %y
  %val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %neg.x, <2 x half> %neg.y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <2 x half> %val
}

; Intrinsic declarations used by the tests above.
declare half @llvm.fabs.f16(half) #1
declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata) #1
declare <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half>, <2 x half>, <2 x half>, metadata, metadata) #1
declare <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half>, <3 x half>, <3 x half>, metadata, metadata) #1
declare <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half>, <4 x half>, <4 x half>, metadata, metadata) #1

; #0 is attached to every test function; #1 to the intrinsic declarations.
attributes #0 = { strictfp }
attributes #1 = { inaccessiblememonly nounwind willreturn }