; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,GFX10-MAD %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA,GFX10-FMA %s
; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8_9_10,FMA,GFX940-FMA %s

; Checks when an fmul followed by an fadd of a floating-point constant is
; selected as v_madak_f32 / v_fmaak_f32 (the constant appears as a trailing
; 32-bit literal), and when another mad/mac/fma form is expected instead.
; The literal 0x41200000 is the bit pattern of 10.0 and 0x42280000 of 42.0.

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone

; Basic case, both multiplicands are loaded per-workitem values and the
; addend is the non-inline constant 10.0.
; GCN-LABEL: {{^}}madak_f32:
; GFX6: buffer_load_dword [[VA:v[0-9]+]]
; GFX6: buffer_load_dword [[VB:v[0-9]+]]
; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4
  %b = load float, float addrspace(1)* %in.b.gep, align 4

  %mul = fmul float %a, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Make sure this is only folded with one use. This is a code size
; optimization and if we fold the immediate multiple times, we'll undo
; it.

; GCN-LABEL: {{^}}madak_2_use_f32:
; GFX9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]],
; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]],
; GFX8_9_10: {{flat|global}}_load_dword [[VC:v[0-9]+]],
; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX8-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; GFX10-MAD-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; MAD-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
; GFX10-FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VC]], 0x41200000
; GFX940-FMA-DAG: v_fmac_f32_e32 [[VK]], [[VA]], [[VC]]
; GCN: s_endpgm
define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
  %in.gep.2 = getelementptr float, float addrspace(1)* %in.gep.0, i32 2

  %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  ; NOTE(review): this second output pointer is derived from %in.gep.0 rather
  ; than %out, which looks unintentional. The checks above were written against
  ; this IR, so confirm with llc before changing it.
  %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1

  ; Volatile accesses keep all three loads and both stores in the output.
  %a = load volatile float, float addrspace(1)* %in.gep.0, align 4
  %b = load volatile float, float addrspace(1)* %in.gep.1, align 4
  %c = load volatile float, float addrspace(1)* %in.gep.2, align 4

  ; The same constant 10.0 is the addend of two separate multiply-adds.
  %mul0 = fmul float %a, %b
  %mul1 = fmul float %a, %c
  %madak0 = fadd float %mul0, 10.0
  %madak1 = fadd float %mul1, 10.0

  store volatile float %madak0, float addrspace(1)* %out.gep.0, align 4
  store volatile float %madak1, float addrspace(1)* %out.gep.1, align 4
  ret void
}

; One multiplicand is the inline immediate 4.0; the addend is still the
; literal 10.0.
; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
; GCN: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]]
; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4

  %mul = fmul float 4.0, %a
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Make sure nothing weird happens with a value that is also allowed as
; an inline immediate.

; GCN-LABEL: {{^}}madak_inline_imm_f32:
; GFX6: buffer_load_dword [[VA:v[0-9]+]]
; GFX6: buffer_load_dword [[VB:v[0-9]+]]
; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
; FMA: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4
  %b = load float, float addrspace(1)* %in.b.gep, align 4

  %mul = fmul float %a, %b
  %madak = fadd float %mul, 4.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Before GFX10 we can't use an SGPR when forming madak; a v_mac with the
; constant materialized in a VGPR is expected instead. The GFX10 checks
; expect the SGPR operand to be accepted.
; GCN-LABEL: {{^}}s_v_madak_f32:
; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
; GFX10-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
; GFX940-FMA: v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4

  %mul = fmul float %a, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Same as s_v_madak_f32 but with the kernel-argument (scalar) value as the
; first fmul operand and the loaded value as the second.
; GCN-LABEL: {{^}}v_s_madak_f32:
; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
; GFX6_8_9-NOT: v_madak_f32
; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
; GFX10-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
; GFX940-FMA: v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %b = load float, float addrspace(1)* %in.b.gep, align 4

  %mul = fmul float %a, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Both multiplicands are kernel arguments; no madak is expected on any
; target, only a mac/fmac form.
; GCN-LABEL: {{^}}s_s_madak_f32:
; GCN-NOT: v_madak_f32
; GFX8_9: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GFX10-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; GFX10-FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; GFX940-FMA: v_fmac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) #0 {
  %mul = fmul float %a, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out, align 4
  ret void
}

; fabs on the first multiplicand: the checks expect the three-operand
; v_mad_f32 / v_fma_f32 with an |abs| source modifier rather than madak.
; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
; GFX6: buffer_load_dword [[VA:v[0-9]+]]
; GFX6: buffer_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
; GFX10-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
; GFX940-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{s[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4
  %b = load float, float addrspace(1)* %in.b.gep, align 4

  %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone

  %mul = fmul float %a.fabs, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; fabs on the second multiplicand: as above, a v_mad_f32 / v_fma_f32 with an
; |abs| source modifier is expected rather than madak.
; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
; GFX6: buffer_load_dword [[VA:v[0-9]+]]
; GFX6: buffer_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
; GFX10-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
; GFX940-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4
  %b = load float, float addrspace(1)* %in.b.gep, align 4

  %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone

  %mul = fmul float %a, %b.fabs
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; SIFoldOperands should not fold the SGPR copy into the instruction before GFX10
; because the implicit immediate already uses the constant bus.
; On GFX10+ we can use two scalar operands.
; GCN-LABEL: {{^}}madak_constant_bus_violation:
; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}

; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
; MAD: v_mov_b32_e32 [[MADAK:v[0-9]+]], 0x42280000
; MAD: v_mac_f32_e64 [[MADAK]], [[SGPR0]], 0.5
; GFX10: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
; GFX10-MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
; GFX10-FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
; GFX940-FMA: v_fmac_f32_e64 [[MADAK:v[0-9]+]], [[SGPR0]], 0.5
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
; GFX6: buffer_store_dword [[MUL]]
; GFX8_9_10: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
bb:
  %tmp = icmp eq i32 %arg1, 0
  br i1 %tmp, label %bb3, label %bb4

bb3:
  store volatile float 0.0, float addrspace(1)* undef
  br label %bb4

bb4:
  ; sgpr0 * 0.5 + 42.0 is the multiply-add under test; its result then feeds
  ; a multiply by a volatile-loaded value.
  %vgpr = load volatile float, float addrspace(1)* undef
  %tmp0 = fmul float %sgpr0, 0.5
  %tmp1 = fadd float %tmp0, 42.0
  %tmp2 = fmul float %tmp1, %vgpr
  store volatile float %tmp2, float addrspace(1)* undef, align 4
  ret void
}

attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }