; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s

; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s

; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
target triple = "amdgcn--"

; Intrinsics used by the test functions below.
; NOTE(review): @llvm.fmuladd.f16 is declared but not called by any function
; in this file as far as visible here.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fmuladd.f32(float, float, float) #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare float @llvm.fabs.f32(float) #1

; NOTE(review): check lines using a VI-based prefix only take effect when a
; run of FileCheck is given a matching prefix; confirm against the llc/FileCheck
; invocations at the top of the file.

; llvm.fmuladd.f32 on three values read with ordinary (non-volatile) loads.
; GCN-LABEL: {{^}}fmuladd_f32:
; GCN-FLUSH: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                         float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load float, float addrspace(1)* %in1
  %r1 = load float, float addrspace(1)* %in2
  %r2 = load float, float addrspace(1)* %in3
  %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %out
  ret void
}

; Separate fmul followed by fadd (no intrinsic), operands read with volatile loads.
; GCN-LABEL: {{^}}fmul_fadd_f32:
; GCN-FLUSH: v_mac_f32

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32

; GCN-DENORM-STRICT: v_mul_f32_e32
; GCN-DENORM-STRICT: v_add_f32_e32
define void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                           float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load volatile float, float addrspace(1)* %in1
  %r1 = load volatile float, float addrspace(1)* %in2
  %r2 = load volatile float, float addrspace(1)* %in3
  %mul = fmul float %r0, %r1
  %add = fadd float %mul, %r2
  store float %add, float addrspace(1)* %out
  ret void
}

; fmuladd(2.0, a, b): constant 2.0 as the first multiplicand.
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(a, 2.0, b): constant 2.0 as the second multiplicand.
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; (a + a) + b written as two fadd instructions.
; GCN-LABEL: {{^}}fadd_a_a_b_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @fadd_a_a_b_f32(float addrspace(1)* %out,
                            float addrspace(1)* %in1,
                            float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %add.0, %r1
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; b + (a + a): like fadd_a_a_b_f32 with the operands of the outer fadd swapped.
; GCN-LABEL: {{^}}fadd_b_a_a_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @fadd_b_a_a_f32(float addrspace(1)* %out,
                            float addrspace(1)* %in1,
                            float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %r1, %add.0
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(-2.0, a, b).
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_mul_f32_e32 [[TMP:v[0-9]+]], -2.0, [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(-2.0, -a, b); the negation of a is written as fsub -0.0, a.
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(2.0, -a, b); the negation of a is written as fsub -0.0, a.
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_mul_f32_e32 [[TMP:v[0-9]+]], -2.0, [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(2.0, a, -b); the negation of b is written as fsub -0.0, b.
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; SI-FLUSH: buffer_store_dword [[RESULT]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r2.fneg = fsub float -0.000000e+00, %r2

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; (a * b) - c using a separate fmul and fsub.
; GCN-LABEL: {{^}}mad_sub_f32:
; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; c - (a * b) using a separate fmul and fsub.
; GCN-LABEL: {{^}}mad_sub_inv_f32:
; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %c, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; (a * b) - |c|, with |c| produced by llvm.fabs.f32.
; GCN-LABEL: {{^}}mad_sub_fabs_f32:
; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c.abs
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; |c| - (a * b), with |c| produced by llvm.fabs.f32.
; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32:
; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %c.abs, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; (-a * -b) + c; both negations are written as fsub -0.0, x.
; GCN-LABEL: {{^}}neg_neg_mad_f32:
; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGB]], [[REGA]]
; SI-FLUSH: buffer_store_dword [[REGC]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %nega = fsub float -0.000000e+00, %a
  %negb = fsub float -0.000000e+00, %b
  %mul = fmul float %nega, %negb
  %sub = fadd float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; (a * |b|) - c, with |b| produced by llvm.fabs.f32.
; GCN-LABEL: {{^}}mad_fabs_sub_f32:
; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %b.abs = call float @llvm.fabs.f32(float %b) #0
  %mul = fmul float %a, %b.abs
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; c - (a + a) written as an fadd followed by an fsub.
; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %r2, %add

  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; (a + a) - c written as an fadd followed by an fsub.
; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %add, %r2

  store float %r3, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }