; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI -check-prefix=FUNC %s

; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
; for correctness.

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #0
declare float @llvm.fma.f32(float, float, float) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fadd double %mul, %c
  %fma1 = fadd double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %mul, %c
  %fma1 = fsub double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %c, %mul
  %fma1 = fsub double %d, %mul
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma = fsub double %mul.neg, %c

  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul.neg, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
  %tmp2 = fsub double %tmp1, %z

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
; -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
  %tmp2 = fsub double %x, %tmp1

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}

;
; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
;

; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
                                      float addrspace(1)* %in1,
                                      float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
                                      float addrspace(1)* %in1,
                                      float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
                                      float addrspace(1)* %in1,
                                      float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
                                      float addrspace(1)* %in1,
                                      float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
                                      float addrspace(1)* %in1,
                                      float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
                                      float addrspace(1)* %in1,
                                      float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

;
; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
;

; FUNC-LABEL: {{^}}test_f32_interp:
; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VT1]], [[VY:v[0-9]]]
; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VT]], [[VX:v[0-9]]]
;
; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
define void @test_f32_interp(float addrspace(1)* %out,
                             float addrspace(1)* %in1,
                             float addrspace(1)* %in2,
                             float addrspace(1)* %in3) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %t = load float, float addrspace(1)* %in3
  %t1 = fsub float 1.0, %t
  %tx = fmul float %x, %t
  %ty = fmul float %y, %t1
  %r = fadd float %tx, %ty
  store float %r, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f64_interp:
; SI-NOFMA: v_add_f64 [[VT1:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], 1.0
; SI-NOFMA: v_mul_f64 [[VTY:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VT1]]
; SI-NOFMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VTY]]
;
; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
define void @test_f64_interp(double addrspace(1)* %out,
                             double addrspace(1)* %in1,
                             double addrspace(1)* %in2,
                             double addrspace(1)* %in3) {
  %x = load double, double addrspace(1)* %in1
  %y = load double, double addrspace(1)* %in2
  %t = load double, double addrspace(1)* %in3
  %t1 = fsub double 1.0, %t
  %tx = fmul double %x, %t
  %ty = fmul double %y, %t1
  %r = fadd double %tx, %ty
  store double %r, double addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }