; Test selection of f16 fused multiply-add (v_mac/v_mad/v_fma/v_fmac) under all
; combinations of: f16 denormal mode (flush vs IEEE), -fp-contract (on vs fast),
; and target (VI/gfx8 vs gfx1010). When f16 denormals are flushed, mac/mad are
; legal; when denormals are preserved, only fma/fmac may be formed, and a plain
; fmul+fadd pair may only be fused when contraction is allowed (fast / contract).
; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI-FLUSH,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI-FLUSH,VI %s

; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s

; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare half @llvm.fabs.f16(half) #1

; llvm.fmuladd may always be fused, independent of -fp-contract.
; GCN-LABEL: {{^}}fmuladd_f16:
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX10-FLUSH: v_mul_f16_e32
; GFX10-FLUSH: v_add_f16_e32
; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                                       half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %out
  ret void
}

; A separate fmul+fadd may only be fused when denormals are flushed or
; contraction is enabled (-fp-contract=fast selects the *-CONTRACT prefixes).
; GCN-LABEL: {{^}}fmul_fadd_f16:
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; VI-DENORM-CONTRACT: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX10-FLUSH: v_mul_f16_e32
; GFX10-FLUSH: v_add_f16_e32
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

define amdgpu_kernel void @fmul_fadd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %mul = fmul half %r0, %r1
  %add = fadd half %mul, %r2
  store half %add, half addrspace(1)* %out
  ret void
}

; The 'contract' fast-math flag permits fusion regardless of -fp-contract.
; GCN-LABEL: {{^}}fmul_fadd_contract_f16:
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX10-FLUSH: v_mul_f16_e32
; GFX10-FLUSH: v_add_f16_e32
; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

define amdgpu_kernel void @fmul_fadd_contract_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                                                  half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %mul = fmul contract half %r0, %r1
  %add = fadd contract half %mul, %r2
  store half %add, half addrspace(1)* %out
  ret void
}

; A multiply by 2.0 folds to the inline constant; without mac/fma it instead
; becomes an add of the operand with itself.
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2:v[0-9]+]], 2.0, [[R1]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}

define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; Same as above with the constant as the second multiplicand (commuted).
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]

define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; (a + a) + b may fold to fma(a, 2.0, b) only when contraction is allowed.
; GCN-LABEL: {{^}}fadd_a_a_b_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]

define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
                                          half addrspace(1)* %in1,
                                          half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %add.0, %r1
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}

; b + (a + a), commuted form of the previous test.
; GCN-LABEL: {{^}}fadd_b_a_a_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]

define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
                                          half addrspace(1)* %in1,
                                          half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %r1, %add.0
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}

; A -2.0 multiplicand also folds to an inline constant.
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; fneg of the operand cancels the negative constant: -2.0 * -a == 2.0 * a.
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]

; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fneg half %r1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; fneg of the operand folds into the constant: 2.0 * -a == -2.0 * a.
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]

; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fneg half %r1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; fneg of the addend becomes a source-modifier on the third operand.
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r2.fneg = fneg half %r2

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; a * b - c fuses into mad/fma with a negated addend when contraction is legal.
; GCN-LABEL: {{^}}mad_sub_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; c - a * b fuses with the product negated instead of the addend.
; GCN-LABEL: {{^}}mad_sub_inv_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %c, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; fabs on the subtracted addend becomes the -| | source modifier.
; GCN-LABEL: {{^}}mad_sub_fabs_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c.abs
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; |c| - a * b: fabs addend with the product negated.
; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %c.abs, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; The two fnegs cancel: (-a) * (-b) + c == a * b + c, no source modifiers.
; GCN-LABEL: {{^}}neg_neg_mad_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[REGC]], [[REGA]], [[REGB]]

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]]
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %nega = fneg half %a
  %negb = fneg half %b
  %mul = fmul half %nega, %negb
  %sub = fadd half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; fabs on a multiplicand becomes the | | source modifier on that operand.
; GCN-LABEL: {{^}}mad_fabs_sub_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %b.abs = call half @llvm.fabs.f16(half %b) #0
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b.abs
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; c - (a + a) folds to fma(a, -2.0, c) when contraction is legal.
; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %r2, %add

  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; (a + a) - c folds to mad/fma(a, 2.0, -c) when contraction is legal.
; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %add, %r2

  store half %r3, half addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }