; RUN: llc -march=amdgcn -mcpu=hawaii -start-before=amdgpu-unify-divergent-exit-nodes -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,SI %s
; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -start-before=amdgpu-unify-divergent-exit-nodes -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,SI %s

; RUN: llc -march=amdgcn -mcpu=fiji -start-before=amdgpu-unify-divergent-exit-nodes -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,VI %s
; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -start-before=amdgpu-unify-divergent-exit-nodes -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,VI %s

; --------------------------------------------------------------------------------
; fadd tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %add, float addrspace(1)* %out
  ret void
}
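
; The next case has a second user of the add. A rough IR-level sketch of the
; nsz fold being checked (illustrative only; the %neg.* names are made up and
; are not part of the test):
;   %add  = fadd float %a, %b
;   %fneg = fneg float %add
;   %use1 = fmul float %add, 4.0
; is expected to become, roughly:
;   %neg.a   = fneg float %a
;   %neg.add = fsub float %neg.a, %b
;   %use1    = fmul float %neg.add, -4.0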

; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  %use1 = fmul float %add, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32
; GCN-SAFE: v_xor_b32_e32 [[ADD:v[0-9]+]], 0x80000000,

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fneg float %b
  %add = fadd float %a, %fneg.b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}
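
; Here both add operands are already negated, so under nsz the negations all
; cancel and a plain add remains. Sketch (illustrative only):
;   %fneg.a = fneg float %a
;   %fneg.b = fneg float %b
;   %add    = fadd float %fneg.a, %fneg.b
;   %fneg   = fneg float %add
; simplifies to:
;   %add = fadd float %a, %b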

; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %add = fadd float %fneg.a, %fneg.b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}
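
; In the next case %fneg.a has an additional user of its own, so the negated
; input must stay available. Sketch (illustrative only):
;   %fneg.a = fneg float %a
;   %add    = fadd float %fneg.a, %b
;   %fneg   = fneg float %add        ; combines to a - b
;   %use1   = fmul float %fneg.a, %c ; keeps -a via a source modifier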

; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; This one asserted with -enable-no-signed-zeros-fp-math
; GCN-LABEL: {{^}}fneg_fadd_0:
; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]],
; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]]
; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]]
define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 {
.entry:
  %tmp7 = fdiv float 1.000000e+00, %tmp6
  %tmp8 = fmul float 0.000000e+00, %tmp7
  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
  %.i188 = fadd float %tmp9, 0.000000e+00
  %tmp10 = fcmp uge float %.i188, %tmp2
  %tmp11 = fneg float %.i188
  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
  %tmp12 = fcmp ule float %.i092, 0.000000e+00
  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
  ret float %.i198
}
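
; The interesting pattern in fneg_fadd_0 above is the negated select operand:
;   %tmp11 = fneg float %.i188
;   %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
; which should fold into the v_cndmask source modifier (-[[A]] in the checks)
; rather than being materialized with a separate xor.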

; This is a workaround because -enable-no-signed-zeros-fp-math does not set up
; function attribute unsafe-fp-math automatically. Combine with the previous test
; when that is done.
; GCN-LABEL: {{^}}fneg_fadd_0_nsz:
; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]],
; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]],
; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]],
; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]],
; GCN-NSZ-DAG: v_cmp_nlt_f32_e64 {{.*}}, -[[D]]
define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 {
.entry:
  %tmp7 = fdiv afn float 1.000000e+00, %tmp6
  %tmp8 = fmul float 0.000000e+00, %tmp7
  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
  %.i188 = fadd float %tmp9, 0.000000e+00
  %tmp10 = fcmp uge float %.i188, %tmp2
  %tmp11 = fneg float %.i188
  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
  %tmp12 = fcmp ule float %.i092, 0.000000e+00
  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
  ret float %.i198
}

; --------------------------------------------------------------------------------
; fmul tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %mul, float addrspace(1)* %out
  ret void
}
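
; Unlike fadd, -(a * b) == a * (-b) holds exactly in IEEE arithmetic, so the
; mul folds need no nsz. Sketch of the multi-use case below (names
; illustrative, not part of the test):
;   %mul  = fmul float %a, %b
;   %fneg = fneg float %mul
;   %use1 = fmul float %mul, 4.0
; is expected to become, roughly:
;   %neg.b = fneg float %b
;   %mul   = fmul float %a, %neg.b
;   %use1  = fmul float %mul, -4.0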

; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %use1 = fmul float %mul, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fneg float %b
  %mul = fmul float %a, %fneg.b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}
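
; With both mul operands negated, the inner negations cancel and the outer
; fneg folds into one operand. Sketch (illustrative only):
;   %fneg.a = fneg float %a
;   %fneg.b = fneg float %b
;   %mul    = fmul float %fneg.a, %fneg.b
;   %fneg   = fneg float %mul
; is expected to emit a single mul of a and -b.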

; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %mul = fmul float %fneg.a, %fneg.b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fminnum tests
; --------------------------------------------------------------------------------
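
; fneg(minnum(a, b)) is rewritten as maxnum(-a, -b). In IEEE mode the inputs
; must first be quieted/canonicalized; the backend does this with a multiply
; by -1.0, folding the negation in at the same time. Sketch (illustrative):
;   %min  = call float @llvm.minnum.f32(float %a, float %b)
;   %fneg = fneg float %min
; becomes, roughly:
;   %neg.a = fneg float %a
;   %neg.b = fneg float %b
;   %fneg  = call float @llvm.maxnum.f32(float %neg.a, float %neg.b)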

; GCN-LABEL: {{^}}v_fneg_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float %a, float %a)
  %min.fneg = fneg float %min
  store float %min.fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, -v0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_self_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %a)
  %min.fneg = fneg float %min
  ret float %min.fneg
}

; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 4.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, -4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_posk_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float 4.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float -4.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, 4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float -4.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-NOT: [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
; GCN: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MIN]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call nnan float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float -0.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}
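
; 1/(2*pi) is an inline constant on VI but only a 32-bit literal on SI, so
; the profitability differs between the targets: SI folds the negation into
; the literal (0xbe22f983 is -(0x3e22f983), i.e. -1/(2*pi)) and uses max,
; while VI keeps the 0.15915494 inline constant with min and negates the
; result with an explicit sign-bit xor.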

; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]

; SI-DAG: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]

; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]

; SI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0x3e22f983, [[NEG_QUIET]]

; VI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 0xBFC45F3060000000, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}
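
; For f16 the two targets diverge further: SI has no f16 min/max, so the
; value is promoted and the fneg rides along for free as a source modifier
; on the conversion (v_cvt_f32_f16_e64 with -[[A]]); VI operates on f16
; directly and quiets with a self max (max x, x) instead of a multiply.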

; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f16:
; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0xbe230000, [[CVT]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]

; VI: v_max_f16_e32 [[QUIET:v[0-9]+]], [[A]], [[A]]
; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x8000, [[MAX]]

; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %min = call half @llvm.minnum.f16(half 0xH3118, half %a)
  %fneg = fsub half -0.000000e+00, %min
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f16:
; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0x3e230000, [[CVT]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]

; VI: v_max_f16_e64 [[NEG_QUIET:v[0-9]+]], -[[A]], -[[A]]
; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]

; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %min = call half @llvm.minnum.f16(half 0xHB118, half %a)
  %fneg = fsub half -0.000000e+00, %min
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]

; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfc45f30
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; SI: v_max_f64 v[[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]], [[NEG_QUIET]], s[[[K_LO]]:[[K_HI]]]

; VI: v_min_f64 v[[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]], [[A]], 0.15915494
; VI: v_xor_b32_e32 v[[RESULT_HI]], 0x80000000, v[[RESULT_HI]]

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[RESULT_LO]]:[[RESULT_HI]]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %min = call double @llvm.minnum.f64(double 0x3fc45f306dc9c882, double %a)
  %fneg = fsub double -0.000000e+00, %min
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]

; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fc45f30
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s[[[K_LO]]:[[K_HI]]]

; VI: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], 0.15915494

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %min = call double @llvm.minnum.f64(double 0xbfc45f306dc9c882, double %a)
  %fneg = fsub double -0.000000e+00, %min
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, 0{{$}}
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_neg0_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float -0.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}
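
; If the only user of the negated min can take a source modifier, it is
; cheaper to leave the min alone and fold the fneg into that user. Sketch
; (illustrative only):
;   %min  = call float @llvm.minnum.f32(float 0.0, float %a)
;   %fneg = fneg float %min
;   %mul  = fmul float %fneg, %b
; keeps the min and emits the mul as v_mul_f32_e64 with a negated operand.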

; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]

; SI: v_max_f32_e32 [[MIN:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[B]]

; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, v0
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  ret float %mul
}
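
; With a second user of the min itself, the combine mirrors the fadd
; multi-use case: swap to max of the negated inputs and negate the constant
; in the other use. Sketch (names illustrative, not part of the test):
;   %min  = call float @llvm.minnum.f32(float %a, float %b)
;   %fneg = fneg float %min
;   %use1 = fmul float %min, 4.0
; becomes, roughly:
;   %neg.a = fneg float %a
;   %neg.b = fneg float %b
;   %max   = call float @llvm.maxnum.f32(float %neg.a, float %neg.b)
;   %use1  = fmul float %max, -4.0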

; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  %use1 = fmul float %min, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
; GCN-NEXT: ; return
define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  %use1 = fmul float %min, 4.0
  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
  ret <2 x float> %ins1
}

; --------------------------------------------------------------------------------
; fmaxnum tests
; --------------------------------------------------------------------------------


; GCN-LABEL: {{^}}v_fneg_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_min_f32_e64 v0, -v0, -v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 {
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float %a, float %a)
  %max.fneg = fneg float %max
  store float %max.fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, -v0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_self_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float %a, float %a)
  %max.fneg = fneg float %max
  ret float %max.fneg
}

; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, -4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, 4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
  %fneg = fneg float %max
  ret float %fneg
}
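
; The nnan call below needs no quieting, and the +0.0 operand is left alone
; (folding the negation into the operands would have to turn it into -0.0),
; so the fneg survives as a plain sign-bit xor of the max result:
;   %max  = call nnan float @llvm.maxnum.f32(float 0.0, float %a)
;   %fneg = fneg float %max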

; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-NOT: [[A]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
; GCN: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call nnan float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, 0{{$}}
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fneg float %max
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, v0
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fneg float %max
  %mul = fmul float %fneg, %b
  ret float %mul
}

; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_min_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  %use1 = fmul float %max, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_min_f32_e64 v0, -v0, -v1
; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
; GCN-NEXT: ; return
define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 {
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  %use1 = fmul float %max, 4.0
  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
  ret <2 x float> %ins1
}

; --------------------------------------------------------------------------------
; fma tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[RESULT]]

; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}
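
; For fma the nsz fold negates one multiplicand and the addend, since
; -(a*b + c) == a*(-b) + (-c). Sketch (names illustrative):
;   %fma  = call float @llvm.fma.f32(float %a, float %b, float %c)
;   %fneg = fneg float %fma
; becomes, roughly:
;   %neg.b = fneg float %b
;   %neg.c = fneg float %c
;   %fneg  = call float @llvm.fma.f32(float %a, float %neg.b, float %neg.c)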

; GCN-LABEL: {{^}}v_fneg_fma_store_use_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fma, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]

; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  %use1 = fmul float %fma, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}
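
; When one fma operand is already negated, pushing the outer fneg inside
; cancels the negation on the product and leaves only the addend negated.
; Sketch (illustrative only):
;   %fneg.a = fneg float %a
;   %fma    = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
;   %fneg   = fneg float %fma
; folds (nsz) to, roughly:
;   %neg.c = fneg float %c
;   %fneg  = call float @llvm.fma.f32(float %a, float %b, float %neg.c)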
v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] 1194; GCN-NSZ-NOT: [[FMA]] 1195; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1196define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1197 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1198 %tid.ext = sext i32 %tid to i64 1199 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1200 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1201 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1202 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1203 %a = load volatile float, float addrspace(1)* %a.gep 1204 %b = load volatile float, float addrspace(1)* %b.gep 1205 %c = load volatile float, float addrspace(1)* %c.gep 1206 %fneg.a = fneg float %a 1207 %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) 1208 %fneg = fneg float %fma 1209 store volatile float %fneg, float addrspace(1)* %out 1210 ret void 1211} 1212 1213; GCN-LABEL: {{^}}v_fneg_fma_x_fneg_y_f32: 1214; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1215; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1216; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1217 1218; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]] 1219; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1220 1221; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] 1222; GCN-NSZ-NOT: [[FMA]] 1223; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1224define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1225 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1226 %tid.ext = sext i32 %tid to i64 1227 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1228 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1229 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1230 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1231 %a = load volatile float, float addrspace(1)* %a.gep 1232 %b = load volatile float, float addrspace(1)* %b.gep 1233 %c = load volatile float, float addrspace(1)* %c.gep 1234 %fneg.b = fneg float %b 1235 %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c) 1236 %fneg = fneg float %fma 1237 store volatile float %fneg, float addrspace(1)* %out 1238 ret void 1239} 1240 1241; GCN-LABEL: {{^}}v_fneg_fma_fneg_fneg_y_f32: 1242; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1243; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1244; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1245 1246; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] 1247; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1248 1249; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]] 1250; GCN-NSZ-NOT: [[FMA]] 1251; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1252define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1253 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1254 %tid.ext = sext i32 %tid to i64 1255 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1256 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1257 %c.gep = 
getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1258 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1259 %a = load volatile float, float addrspace(1)* %a.gep 1260 %b = load volatile float, float addrspace(1)* %b.gep 1261 %c = load volatile float, float addrspace(1)* %c.gep 1262 %fneg.a = fneg float %a 1263 %fneg.b = fneg float %b 1264 %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c) 1265 %fneg = fneg float %fma 1266 store volatile float %fneg, float addrspace(1)* %out 1267 ret void 1268} 1269 1270; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_fneg_f32: 1271; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1272; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1273; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1274 1275; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], -[[C]] 1276; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1277 1278; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] 1279; GCN-NSZ-NOT: [[FMA]] 1280; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1281define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1282 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1283 %tid.ext = sext i32 %tid to i64 1284 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1285 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1286 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1287 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1288 %a = load volatile float, float addrspace(1)* %a.gep 1289 %b = load volatile float, float addrspace(1)* %b.gep 1290 %c = load volatile float, float addrspace(1)* %c.gep 1291 %fneg.a = fneg float %a 1292 %fneg.c = fneg float %c 1293 %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c) 1294 %fneg = fneg float %fma 1295 store volatile float %fneg, float addrspace(1)* %out 1296 ret void 1297} 1298 1299; GCN-LABEL: {{^}}v_fneg_fma_x_y_fneg_f32: 1300; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1301; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1302; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1303 1304; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] 1305; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1306 1307; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]] 1308; GCN-NSZ-NOT: [[FMA]] 1309; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1310define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1311 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1312 %tid.ext = sext i32 %tid to i64 1313 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1314 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1315 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1316 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1317 %a = load volatile float, float addrspace(1)* %a.gep 1318 %b = load volatile float, float addrspace(1)* %b.gep 1319 %c = load volatile float, float addrspace(1)* %c.gep 1320 %fneg.c = fneg float %c 1321 %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c) 1322 %fneg = fneg float %fma 1323 store volatile float %fneg, float addrspace(1)*
%out 1324 ret void 1325} 1326 1327; GCN-LABEL: {{^}}v_fneg_fma_store_use_fneg_x_y_f32: 1328; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1329; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1330; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1331 1332; GCN-SAFE: v_xor_b32 1333; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], 1334; GCN-SAFE: v_xor_b32 1335 1336; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] 1337; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] 1338 1339; GCN-NSZ-NOT: [[FMA]] 1340; GCN-NSZ-NOT: [[NEG_A]] 1341; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1342; GCN-NSZ-NOT: [[NEG_A]] 1343; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]] 1344define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1345 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1346 %tid.ext = sext i32 %tid to i64 1347 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1348 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1349 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1350 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1351 %a = load volatile float, float addrspace(1)* %a.gep 1352 %b = load volatile float, float addrspace(1)* %b.gep 1353 %c = load volatile float, float addrspace(1)* %c.gep 1354 %fneg.a = fneg float %a 1355 %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) 1356 %fneg = fneg float %fma 1357 store volatile float %fneg, float addrspace(1)* %out 1358 store volatile float %fneg.a, float addrspace(1)* %out 1359 ret void 1360} 1361 1362; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fneg_x_y_f32: 1363; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1364; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1365; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1366 1367; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} 1368; GCN-SAFE-DAG: v_fma_f32 [[FMA:v[0-9]+]] 1369; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1370 1371; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]] 1372; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]] 1373; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) 1374; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 1375; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) 1376define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 { 1377 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1378 %tid.ext = sext i32 %tid to i64 1379 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1380 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1381 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1382 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1383 %a = load volatile float, float addrspace(1)* %a.gep 1384 %b = load volatile float, float addrspace(1)* %b.gep 1385 %c = load volatile float, float addrspace(1)* %c.gep 1386 %fneg.a = fneg float %a 1387 %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) 1388 %fneg = fneg float %fma 1389 %use1 = fmul float %fneg.a, %d 1390 store volatile float %fneg, float addrspace(1)* %out 1391 store volatile float %use1, float addrspace(1)* %out 1392 ret void 1393} 
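; The fma combines above all rely on the same sign algebra: with no signed
; zeros, -(fma(a, b, c)) = fma(a, -b, -c), and fnegs on the sources cancel
; into the operand modifiers first, e.g. -(fma(-a, b, c)) = fma(a, b, -c).
; That is why the GCN-NSZ checks expect a single v_fma_f32 with source
; modifiers, while the GCN-SAFE checks keep an explicit v_xor_b32 of the
; sign bit 0x80000000 for the outer fneg.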
1394 1395; -------------------------------------------------------------------------------- 1396; fmad tests 1397; -------------------------------------------------------------------------------- 1398 1399; GCN-LABEL: {{^}}v_fneg_fmad_f32: 1400; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1401; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1402; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1403 1404; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]] 1405; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]] 1406 1407; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]] 1408; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1409define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1410 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1411 %tid.ext = sext i32 %tid to i64 1412 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1413 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1414 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1415 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1416 %a = load volatile float, float addrspace(1)* %a.gep 1417 %b = load volatile float, float addrspace(1)* %b.gep 1418 %c = load volatile float, float addrspace(1)* %c.gep 1419 %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c) 1420 %fneg = fneg float %fma 1421 store float %fneg, float addrspace(1)* %out.gep 1422 ret void 1423} 1424 1425; GCN-LABEL: {{^}}v_fneg_fmad_v4f32: 1426 1427; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} 1428; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} 1429; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} 1430; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} 1431define amdgpu_kernel void @v_fneg_fmad_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %a.ptr, <4 x float> addrspace(1)* %b.ptr, <4 x float> addrspace(1)* %c.ptr) #0 { 1432 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1433 %tid.ext = sext i32 %tid to i64 1434 %a.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a.ptr, i64 %tid.ext 1435 %b.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %b.ptr, i64 %tid.ext 1436 %c.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %c.ptr, i64 %tid.ext 1437 %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext 1438 %a = load volatile <4 x float>, <4 x float> addrspace(1)* %a.gep 1439 %b = load volatile <4 x float>, <4 x float> addrspace(1)* %b.gep 1440 %c = load volatile <4 x float>, <4 x float> addrspace(1)* %c.gep 1441 %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) 1442 %fneg = fneg <4 x float> %fma 1443 store <4 x float> %fneg, <4 x float> addrspace(1)* %out.gep 1444 ret void 1445} 1446 1447; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32: 1448; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1449; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1450; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1451 1452; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]] 1453; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]] 1454; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]] 1455 1456; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], [[A]], -[[B]], -[[C]] 1457; GCN-NSZ-NEXT: v_mul_f32_e32 
[[MUL:v[0-9]+]], -4.0, [[NEG_MAD]] 1458 1459; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]] 1460; GCN-NEXT: s_waitcnt vmcnt(0) 1461; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 1462; GCN-NEXT: s_waitcnt vmcnt(0) 1463define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1464 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1465 %tid.ext = sext i32 %tid to i64 1466 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1467 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1468 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1469 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1470 %a = load volatile float, float addrspace(1)* %a.gep 1471 %b = load volatile float, float addrspace(1)* %b.gep 1472 %c = load volatile float, float addrspace(1)* %c.gep 1473 %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c) 1474 %fneg = fneg float %fma 1475 %use1 = fmul float %fma, 4.0 1476 store volatile float %fneg, float addrspace(1)* %out 1477 store volatile float %use1, float addrspace(1)* %out 1478 ret void 1479} 1480 1481; -------------------------------------------------------------------------------- 1482; fp_extend tests 1483; -------------------------------------------------------------------------------- 1484 1485; GCN-LABEL: {{^}}v_fneg_fp_extend_f32_to_f64: 1486; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1487; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]] 1488; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1489define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1490 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1491 %tid.ext = sext i32 %tid to i64 1492 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1493 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext 1494 %a = load volatile float, float addrspace(1)* %a.gep 1495 %fpext = fpext float %a to double 1496 %fneg = fsub double -0.000000e+00, %fpext 1497 store double %fneg, double addrspace(1)* %out.gep 1498 ret void 1499} 1500 1501; GCN-LABEL: {{^}}v_fneg_fp_extend_fneg_f32_to_f64: 1502; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1503; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]] 1504; GCN: {{buffer|flat}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1505define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1506 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1507 %tid.ext = sext i32 %tid to i64 1508 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1509 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext 1510 %a = load volatile float, float addrspace(1)* %a.gep 1511 %fneg.a = fneg float %a 1512 %fpext = fpext float %fneg.a to double 1513 %fneg = fsub double -0.000000e+00, %fpext 1514 store double %fneg, double addrspace(1)* %out.gep 1515 ret void 1516} 1517 1518; GCN-LABEL: {{^}}v_fneg_fp_extend_store_use_fneg_f32_to_f64: 1519; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1520; GCN-DAG: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]] 1521; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]] 1522; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1523; GCN: 
flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FNEG_A]] 1524define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1525 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1526 %tid.ext = sext i32 %tid to i64 1527 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1528 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext 1529 %a = load volatile float, float addrspace(1)* %a.gep 1530 %fneg.a = fneg float %a 1531 %fpext = fpext float %fneg.a to double 1532 %fneg = fsub double -0.000000e+00, %fpext 1533 store volatile double %fneg, double addrspace(1)* %out.gep 1534 store volatile float %fneg.a, float addrspace(1)* undef 1535 ret void 1536} 1537 1538; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64: 1539; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1540; GCN-DAG: v_cvt_f64_f32_e32 v[[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]], [[A]] 1541; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]] 1542; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]] 1543; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[CVT_LO]]:[[CVT_HI]]] 1544define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1545 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1546 %tid.ext = sext i32 %tid to i64 1547 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1548 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext 1549 %a = load volatile float, float addrspace(1)* %a.gep 1550 %fpext = fpext float %a to double 1551 %fneg = fsub double -0.000000e+00, %fpext 1552 store volatile double %fneg, double addrspace(1)* %out.gep 1553 store volatile double %fpext, double addrspace(1)* undef 1554 ret void 1555} 1556 1557; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64: 1558; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1559; GCN-DAG: v_cvt_f64_f32_e32 v[[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]], [[A]] 1560; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]] 1561; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v[[[CVT_LO]]:[[CVT_HI]]], 4.0 1562; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]] 1563; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 1564define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1565 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1566 %tid.ext = sext i32 %tid to i64 1567 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1568 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext 1569 %a = load volatile float, float addrspace(1)* %a.gep 1570 %fpext = fpext float %a to double 1571 %fneg = fsub double -0.000000e+00, %fpext 1572 %mul = fmul double %fpext, 4.0 1573 store volatile double %fneg, double addrspace(1)* %out.gep 1574 store volatile double %mul, double addrspace(1)* %out.gep 1575 ret void 1576} 1577 1578; FIXME: Source modifiers not folded for f16->f32 1579; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32: 1580define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { 1581 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1582 %tid.ext = sext i32 %tid to i64 1583 %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext 1584 
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1585 %a = load volatile half, half addrspace(1)* %a.gep 1586 %fpext = fpext half %a to float 1587 %fneg = fneg float %fpext 1588 store volatile float %fneg, float addrspace(1)* %out.gep 1589 store volatile float %fpext, float addrspace(1)* %out.gep 1590 ret void 1591} 1592 1593; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32: 1594define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { 1595 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1596 %tid.ext = sext i32 %tid to i64 1597 %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext 1598 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1599 %a = load volatile half, half addrspace(1)* %a.gep 1600 %fpext = fpext half %a to float 1601 %fneg = fneg float %fpext 1602 %mul = fmul float %fpext, 4.0 1603 store volatile float %fneg, float addrspace(1)* %out.gep 1604 store volatile float %mul, float addrspace(1)* %out.gep 1605 ret void 1606} 1607 1608; -------------------------------------------------------------------------------- 1609; fp_round tests 1610; -------------------------------------------------------------------------------- 1611 1612; GCN-LABEL: {{^}}v_fneg_fp_round_f64_to_f32: 1613; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] 1614; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]] 1615; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1616define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { 1617 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1618 %tid.ext = sext i32 %tid to i64 1619 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext 1620 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1621 %a = load volatile double, double addrspace(1)* %a.gep 1622 %fpround = fptrunc double %a to float 1623 %fneg = fneg float %fpround 1624 store float %fneg, float addrspace(1)* %out.gep 1625 ret void 1626} 1627 1628; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f64_to_f32: 1629; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] 1630; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]] 1631; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1632define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { 1633 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1634 %tid.ext = sext i32 %tid to i64 1635 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext 1636 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1637 %a = load volatile double, double addrspace(1)* %a.gep 1638 %fneg.a = fsub double -0.000000e+00, %a 1639 %fpround = fptrunc double %fneg.a to float 1640 %fneg = fneg float %fpround 1641 store float %fneg, float addrspace(1)* %out.gep 1642 ret void 1643} 1644 1645; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32: 1646; GCN: {{buffer|flat}}_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]] 1647; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v[[[A_LO]]:[[A_HI]]] 1648; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]] 1649; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1650; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[A_LO]]:[[NEG_A_HI]]] 1651define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float 
addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { 1652 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1653 %tid.ext = sext i32 %tid to i64 1654 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext 1655 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1656 %a = load volatile double, double addrspace(1)* %a.gep 1657 %fneg.a = fsub double -0.000000e+00, %a 1658 %fpround = fptrunc double %fneg.a to float 1659 %fneg = fneg float %fpround 1660 store volatile float %fneg, float addrspace(1)* %out.gep 1661 store volatile double %fneg.a, double addrspace(1)* undef 1662 ret void 1663} 1664 1665; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32: 1666; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] 1667; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]] 1668; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s[ 1669 1670; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1671; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[USE1]] 1672define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 { 1673 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1674 %tid.ext = sext i32 %tid to i64 1675 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext 1676 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1677 %a = load volatile double, double addrspace(1)* %a.gep 1678 %fneg.a = fsub double -0.000000e+00, %a 1679 %fpround = fptrunc double %fneg.a to float 1680 %fneg = fneg float %fpround 1681 %use1 = fmul double %fneg.a, %c 1682 store volatile float %fneg, float addrspace(1)* %out.gep 1683 store volatile double %use1, double addrspace(1)* undef 1684 ret void 1685} 1686 1687; GCN-LABEL: {{^}}v_fneg_fp_round_f32_to_f16: 1688; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1689; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 1690; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1691define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1692 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1693 %tid.ext = sext i32 %tid to i64 1694 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1695 %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext 1696 %a = load volatile float, float addrspace(1)* %a.gep 1697 %fpround = fptrunc float %a to half 1698 %fneg = fsub half -0.000000e+00, %fpround 1699 store half %fneg, half addrspace(1)* %out.gep 1700 ret void 1701} 1702 1703; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f32_to_f16: 1704; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1705; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]] 1706; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1707define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1708 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1709 %tid.ext = sext i32 %tid to i64 1710 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1711 %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext 1712 %a = load volatile float, float addrspace(1)* %a.gep 1713 %fneg.a = fneg float %a 1714 %fpround = fptrunc float %fneg.a to half 1715 %fneg = fsub half -0.000000e+00, %fpround 1716 store half %fneg, half addrspace(1)* %out.gep 1717 ret void 1718} 1719 1720; GCN-LABEL: {{^}}v_fneg_multi_use_fp_round_fneg_f64_to_f32: 1721; GCN: 
{{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] 1722; GCN-DAG: v_cvt_f32_f64_e32 [[CVT:v[0-9]+]], [[A]] 1723; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]] 1724; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG]] 1725; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[CVT]] 1726define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { 1727 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1728 %tid.ext = sext i32 %tid to i64 1729 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext 1730 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1731 %a = load volatile double, double addrspace(1)* %a.gep 1732 %fpround = fptrunc double %a to float 1733 %fneg = fneg float %fpround 1734 store volatile float %fneg, float addrspace(1)* %out.gep 1735 store volatile float %fpround, float addrspace(1)* %out.gep 1736 ret void 1737} 1738 1739; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f32_to_f16: 1740; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1741; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]] 1742; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] 1743; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1744; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]] 1745define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1746 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1747 %tid.ext = sext i32 %tid to i64 1748 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1749 %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext 1750 %a = load volatile float, float addrspace(1)* %a.gep 1751 %fneg.a = fneg float %a 1752 %fpround = fptrunc float %fneg.a to half 1753 %fneg = fsub half -0.000000e+00, %fpround 1754 store volatile half %fneg, half addrspace(1)* %out.gep 1755 store volatile float %fneg.a, float addrspace(1)* undef 1756 ret void 1757} 1758 1759; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f32_to_f16: 1760; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1761; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]] 1762; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s 1763; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1764; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[USE1]] 1765define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { 1766 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1767 %tid.ext = sext i32 %tid to i64 1768 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1769 %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext 1770 %a = load volatile float, float addrspace(1)* %a.gep 1771 %fneg.a = fneg float %a 1772 %fpround = fptrunc float %fneg.a to half 1773 %fneg = fsub half -0.000000e+00, %fpround 1774 %use1 = fmul float %fneg.a, %c 1775 store volatile half %fneg, half addrspace(1)* %out.gep 1776 store volatile float %use1, float addrspace(1)* undef 1777 ret void 1778} 1779 1780; -------------------------------------------------------------------------------- 1781; rcp tests 1782; -------------------------------------------------------------------------------- 1783 1784; GCN-LABEL: {{^}}v_fneg_rcp_f32: 1785; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1786; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 1787; GCN: flat_store_dword 
v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1788define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1789 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1790 %tid.ext = sext i32 %tid to i64 1791 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1792 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1793 %a = load volatile float, float addrspace(1)* %a.gep 1794 %rcp = call float @llvm.amdgcn.rcp.f32(float %a) 1795 %fneg = fneg float %rcp 1796 store float %fneg, float addrspace(1)* %out.gep 1797 ret void 1798} 1799 1800; GCN-LABEL: {{^}}v_fneg_rcp_fneg_f32: 1801; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1802; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]] 1803; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1804define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1805 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1806 %tid.ext = sext i32 %tid to i64 1807 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1808 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1809 %a = load volatile float, float addrspace(1)* %a.gep 1810 %fneg.a = fneg float %a 1811 %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a) 1812 %fneg = fneg float %rcp 1813 store float %fneg, float addrspace(1)* %out.gep 1814 ret void 1815} 1816 1817; GCN-LABEL: {{^}}v_fneg_rcp_store_use_fneg_f32: 1818; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1819; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]] 1820; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] 1821; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1822; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]] 1823define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1824 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1825 %tid.ext = sext i32 %tid to i64 1826 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1827 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1828 %a = load volatile float, float addrspace(1)* %a.gep 1829 %fneg.a = fneg float %a 1830 %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a) 1831 %fneg = fneg float %rcp 1832 store volatile float %fneg, float addrspace(1)* %out.gep 1833 store volatile float %fneg.a, float addrspace(1)* undef 1834 ret void 1835} 1836 1837; GCN-LABEL: {{^}}v_fneg_rcp_multi_use_fneg_f32: 1838; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1839; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]] 1840; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} 1841; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1842; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 1843define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { 1844 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1845 %tid.ext = sext i32 %tid to i64 1846 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1847 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1848 %a = load volatile float, float addrspace(1)* %a.gep 1849 %fneg.a = fneg float %a 1850 %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a) 1851 %fneg = fneg float %rcp 1852 %use1 = fmul float %fneg.a, %c 1853 store volatile float %fneg, float addrspace(1)* %out.gep 1854 store volatile float %use1, float 
addrspace(1)* undef 1855 ret void 1856} 1857 1858; -------------------------------------------------------------------------------- 1859; fmul_legacy tests 1860; -------------------------------------------------------------------------------- 1861 1862; GCN-LABEL: {{^}}v_fneg_mul_legacy_f32: 1863; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1864; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1865; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]] 1866; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1867define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 1868 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1869 %tid.ext = sext i32 %tid to i64 1870 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1871 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1872 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1873 %a = load volatile float, float addrspace(1)* %a.gep 1874 %b = load volatile float, float addrspace(1)* %b.gep 1875 %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) 1876 %fneg = fneg float %mul 1877 store float %fneg, float addrspace(1)* %out.gep 1878 ret void 1879} 1880 1881; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32: 1882; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1883; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1884; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] 1885; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]] 1886; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]] 1887; GCN-NEXT: s_waitcnt vmcnt(0) 1888; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] 1889; GCN-NEXT: s_waitcnt vmcnt(0) 1890define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 1891 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1892 %tid.ext = sext i32 %tid to i64 1893 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1894 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1895 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1896 %a = load volatile float, float addrspace(1)* %a.gep 1897 %b = load volatile float, float addrspace(1)* %b.gep 1898 %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) 1899 %fneg = fneg float %mul 1900 store volatile float %fneg, float addrspace(1)* %out 1901 store volatile float %mul, float addrspace(1)* %out 1902 ret void 1903} 1904 1905; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32: 1906; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1907; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1908; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] 1909; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0 1910; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] 1911; GCN-NEXT: s_waitcnt vmcnt(0) 1912; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 1913; GCN-NEXT: s_waitcnt vmcnt(0) 1914define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 1915 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1916 %tid.ext = sext i32 %tid to i64 1917 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1918 %b.gep = getelementptr inbounds 
float, float addrspace(1)* %b.ptr, i64 %tid.ext 1919 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1920 %a = load volatile float, float addrspace(1)* %a.gep 1921 %b = load volatile float, float addrspace(1)* %b.gep 1922 %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) 1923 %fneg = fneg float %mul 1924 %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0) 1925 store volatile float %fneg, float addrspace(1)* %out 1926 store volatile float %use1, float addrspace(1)* %out 1927 ret void 1928} 1929 1930; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32: 1931; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1932; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1933; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] 1934; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] 1935define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 1936 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1937 %tid.ext = sext i32 %tid to i64 1938 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1939 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1940 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1941 %a = load volatile float, float addrspace(1)* %a.gep 1942 %b = load volatile float, float addrspace(1)* %b.gep 1943 %fneg.a = fneg float %a 1944 %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b) 1945 %fneg = fneg float %mul 1946 store volatile float %fneg, float addrspace(1)* %out 1947 ret void 1948} 1949 1950; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32: 1951; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1952; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1953; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] 1954; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] 1955define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 1956 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1957 %tid.ext = sext i32 %tid to i64 1958 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1959 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1960 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1961 %a = load volatile float, float addrspace(1)* %a.gep 1962 %b = load volatile float, float addrspace(1)* %b.gep 1963 %fneg.b = fneg float %b 1964 %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b) 1965 %fneg = fneg float %mul 1966 store volatile float %fneg, float addrspace(1)* %out 1967 ret void 1968} 1969 1970; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_fneg_f32: 1971; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1972; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1973; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] 1974; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] 1975define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 1976 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1977 %tid.ext = sext i32 %tid to i64 1978 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1979 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1980 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1981 %a = load volatile 
float, float addrspace(1)* %a.gep 1982 %b = load volatile float, float addrspace(1)* %b.gep 1983 %fneg.a = fneg float %a 1984 %fneg.b = fneg float %b 1985 %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b) 1986 %fneg = fneg float %mul 1987 store volatile float %fneg, float addrspace(1)* %out 1988 ret void 1989} 1990 1991; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_fneg_x_f32: 1992; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1993; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1994; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] 1995; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]] 1996; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]] 1997; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]] 1998define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 1999 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2000 %tid.ext = sext i32 %tid to i64 2001 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2002 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2003 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2004 %a = load volatile float, float addrspace(1)* %a.gep 2005 %b = load volatile float, float addrspace(1)* %b.gep 2006 %fneg.a = fneg float %a 2007 %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b) 2008 %fneg = fneg float %mul 2009 store volatile float %fneg, float addrspace(1)* %out 2010 store volatile float %fneg.a, float addrspace(1)* %out 2011 ret void 2012} 2013 2014; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32: 2015; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2016; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2017; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]] 2018; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} 2019; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]] 2020; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 2021define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { 2022 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2023 %tid.ext = sext i32 %tid to i64 2024 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2025 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2026 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2027 %a = load volatile float, float addrspace(1)* %a.gep 2028 %b = load volatile float, float addrspace(1)* %b.gep 2029 %fneg.a = fneg float %a 2030 %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b) 2031 %fneg = fneg float %mul 2032 %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c) 2033 store volatile float %fneg, float addrspace(1)* %out 2034 store volatile float %use1, float addrspace(1)* %out 2035 ret void 2036} 2037 2038; -------------------------------------------------------------------------------- 2039; sin tests 2040; -------------------------------------------------------------------------------- 2041 2042; GCN-LABEL: {{^}}v_fneg_sin_f32: 2043; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2044; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbe22f983, [[A]] 2045; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]] 2046; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]] 
2047; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2048define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 2049 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2050 %tid.ext = sext i32 %tid to i64 2051 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2052 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2053 %a = load volatile float, float addrspace(1)* %a.gep 2054 %sin = call float @llvm.sin.f32(float %a) 2055 %fneg = fneg float %sin 2056 store float %fneg, float addrspace(1)* %out.gep 2057 ret void 2058} 2059 2060; GCN-LABEL: {{^}}v_fneg_amdgcn_sin_f32: 2061; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2062; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 2063; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2064define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 2065 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2066 %tid.ext = sext i32 %tid to i64 2067 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2068 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2069 %a = load volatile float, float addrspace(1)* %a.gep 2070 %sin = call float @llvm.amdgcn.sin.f32(float %a) 2071 %fneg = fneg float %sin 2072 store float %fneg, float addrspace(1)* %out.gep 2073 ret void 2074} 2075 2076; -------------------------------------------------------------------------------- 2077; ftrunc tests 2078; -------------------------------------------------------------------------------- 2079 2080; GCN-LABEL: {{^}}v_fneg_trunc_f32: 2081; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2082; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 2083; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2084define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 2085 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2086 %tid.ext = sext i32 %tid to i64 2087 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2088 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2089 %a = load volatile float, float addrspace(1)* %a.gep 2090 %trunc = call float @llvm.trunc.f32(float %a) 2091 %fneg = fneg float %trunc 2092 store float %fneg, float addrspace(1)* %out.gep 2093 ret void 2094} 2095 2096; -------------------------------------------------------------------------------- 2097; fround tests 2098; -------------------------------------------------------------------------------- 2099 2100; GCN-LABEL: {{^}}v_fneg_round_f32: 2101; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2102; GCN: v_trunc_f32_e32 2103; GCN: v_sub_f32_e32 2104; GCN: v_cndmask_b32 2105 2106; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} 2107; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]] 2108 2109; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}} 2110; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2111define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 2112 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2113 %tid.ext = sext i32 %tid to i64 2114 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2115 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2116 %a = load volatile float, float addrspace(1)* %a.gep 2117 %round = call float @llvm.round.f32(float 
%a) 2118 %fneg = fneg float %round 2119 store float %fneg, float addrspace(1)* %out.gep 2120 ret void 2121} 2122 2123; -------------------------------------------------------------------------------- 2124; rint tests 2125; -------------------------------------------------------------------------------- 2126 2127; GCN-LABEL: {{^}}v_fneg_rint_f32: 2128; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2129; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 2130; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2131define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 2132 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2133 %tid.ext = sext i32 %tid to i64 2134 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2135 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2136 %a = load volatile float, float addrspace(1)* %a.gep 2137 %rint = call float @llvm.rint.f32(float %a) 2138 %fneg = fneg float %rint 2139 store float %fneg, float addrspace(1)* %out.gep 2140 ret void 2141} 2142 2143; -------------------------------------------------------------------------------- 2144; nearbyint tests 2145; -------------------------------------------------------------------------------- 2146 2147; GCN-LABEL: {{^}}v_fneg_nearbyint_f32: 2148; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2149; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 2150; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2151define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 2152 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2153 %tid.ext = sext i32 %tid to i64 2154 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2155 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2156 %a = load volatile float, float addrspace(1)* %a.gep 2157 %nearbyint = call float @llvm.nearbyint.f32(float %a) 2158 %fneg = fneg float %nearbyint 2159 store float %fneg, float addrspace(1)* %out.gep 2160 ret void 2161} 2162 2163; -------------------------------------------------------------------------------- 2164; fcanonicalize tests 2165; -------------------------------------------------------------------------------- 2166 2167; GCN-LABEL: {{^}}v_fneg_canonicalize_f32: 2168; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2169; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], -1.0, [[A]] 2170; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2171define amdgpu_kernel void @v_fneg_canonicalize_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 2172 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2173 %tid.ext = sext i32 %tid to i64 2174 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2175 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2176 %a = load volatile float, float addrspace(1)* %a.gep 2177 %canon = call float @llvm.canonicalize.f32(float %a) 2178 %fneg = fneg float %canon 2179 store float %fneg, float addrspace(1)* %out.gep 2180 ret void 2181} 2182 2183; -------------------------------------------------------------------------------- 2184; vintrp tests 2185; -------------------------------------------------------------------------------- 2186 2187; GCN-LABEL: {{^}}v_fneg_interp_p1_f32: 2188; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2189; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2190; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] 2191; GCN:
v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]] 2192; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]] 2193define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 2194 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2195 %tid.ext = sext i32 %tid to i64 2196 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2197 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2198 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2199 %a = load volatile float, float addrspace(1)* %a.gep 2200 %b = load volatile float, float addrspace(1)* %b.gep 2201 %mul = fmul float %a, %b 2202 %fneg = fneg float %mul 2203 %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0) 2204 %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0) 2205 store volatile float %intrp0, float addrspace(1)* %out.gep 2206 store volatile float %intrp1, float addrspace(1)* %out.gep 2207 ret void 2208} 2209 2210; GCN-LABEL: {{^}}v_fneg_interp_p2_f32: 2211; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2212; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2213; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] 2214; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]] 2215; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]] 2216define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 2217 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2218 %tid.ext = sext i32 %tid to i64 2219 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2220 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2221 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2222 %a = load volatile float, float addrspace(1)* %a.gep 2223 %b = load volatile float, float addrspace(1)* %b.gep 2224 %mul = fmul float %a, %b 2225 %fneg = fneg float %mul 2226 %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0) 2227 %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0) 2228 store volatile float %intrp0, float addrspace(1)* %out.gep 2229 store volatile float %intrp1, float addrspace(1)* %out.gep 2230 ret void 2231} 2232 2233; -------------------------------------------------------------------------------- 2234; CopyToReg tests 2235; -------------------------------------------------------------------------------- 2236 2237; GCN-LABEL: {{^}}v_fneg_copytoreg_f32: 2238; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2239; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2240; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2241; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]] 2242; GCN: s_cbranch_scc0 2243 2244; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]] 2245; GCN: s_endpgm 2246 2247; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]] 2248; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]] 2249; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 2250 2251define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { 2252 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2253 %tid.ext = sext i32 %tid to i64 2254 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2255 %b.gep = getelementptr inbounds float, 
float addrspace(1)* %b.ptr, i64 %tid.ext 2256 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 2257 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2258 %a = load volatile float, float addrspace(1)* %a.gep 2259 %b = load volatile float, float addrspace(1)* %b.gep 2260 %c = load volatile float, float addrspace(1)* %c.gep 2261 %mul = fmul float %a, %b 2262 %fneg = fneg float %mul 2263 %cmp0 = icmp eq i32 %d, 0 2264 br i1 %cmp0, label %if, label %endif 2265 2266if: 2267 %mul1 = fmul float %fneg, %c 2268 store volatile float %mul1, float addrspace(1)* %out.gep 2269 br label %endif 2270 2271endif: 2272 store volatile float %mul, float addrspace(1)* %out.gep 2273 ret void 2274} 2275 2276; -------------------------------------------------------------------------------- 2277; inlineasm tests 2278; -------------------------------------------------------------------------------- 2279 2280; Can't fold into the use, so the fneg should fold into the source 2281; GCN-LABEL: {{^}}v_fneg_inlineasm_f32: 2282; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2283; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2284; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] 2285; GCN: ; use [[MUL]] 2286; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 2287define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { 2288 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2289 %tid.ext = sext i32 %tid to i64 2290 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2291 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2292 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 2293 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2294 %a = load volatile float, float addrspace(1)* %a.gep 2295 %b = load volatile float, float addrspace(1)* %b.gep 2296 %c = load volatile float, float addrspace(1)* %c.gep 2297 %mul = fmul float %a, %b 2298 %fneg = fneg float %mul 2299 call void asm sideeffect "; use $0", "v"(float %fneg) #0 2300 store volatile float %fneg, float addrspace(1)* %out.gep 2301 ret void 2302} 2303 2308; Can't fold into the use, and the source has another user, so the fneg must be materialized with a v_xor_b32 2309; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32: 2310; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2311; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2312; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]] 2313; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]] 2314; GCN: ; use [[NEG]] 2315; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 2316define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { 2317 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2318 %tid.ext = sext i32 %tid to i64 2319 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2320 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2321 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 2322 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2323 %a = load volatile float, float
; GCN-LABEL: {{^}}v_fneg_inlineasm_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: ; use [[MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  call void asm sideeffect "; use $0", "v"(float %fneg) #0
  store volatile float %fneg, float addrspace(1)* %out.gep
  ret void
}

; Can't fold into use, and %mul has another use, so the fneg can't be
; folded into the source either; it is materialized with a separate
; v_xor_b32 instead.
; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
; GCN: ; use [[NEG]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  call void asm sideeffect "; use $0", "v"(float %fneg) #0
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; code size regression tests
; --------------------------------------------------------------------------------

; There are multiple users of the fneg that must use a VOP3
; instruction, so there is no penalty.
; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]]
; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fneg float %a
  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0)

  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %fma1, float addrspace(1)* %out
  ret void
}

; There are multiple users, but both require using a larger encoding
; for the modifier.
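; (VOP2 encodings such as v_mul_f32_e32 are 32 bits wide and have no
; source-modifier bits; applying the neg modifier forces the 64-bit
; VOP3 _e64 form, growing each use by 4 bytes.)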

; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fneg float %a
  %mul0 = fmul float %fneg.a, %b
  %mul1 = fmul float %fneg.a, %c

  store volatile float %mul0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; One user is VOP3, so folding the modifier there has no cost; the
; other user is VOP2 and pays the encoding cost.
; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fneg float %a
  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0)
  %mul1 = fmul float %fneg.a, %c

  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; The use of the fneg requires a code size increase, but folding into
; the source does not.
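; (Under nsz the fneg distributes into the fma's operands for free:
; -(a * b + 2.0) = a * (-b) + (-2.0), so both VOP2 multiplies can use
; [[FMA0]] unmodified.)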
; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]]
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]

; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]]
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL2]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0)
  %fneg.fma0 = fneg float %fma0
  %mul1 = fmul float %fneg.fma0, %c
  %mul2 = fmul float %fneg.fma0, %d

  store volatile float %mul1, float addrspace(1)* %out
  store volatile float %mul2, float addrspace(1)* %out
  ret void
}

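; For f64 every user is already VOP3, so the fneg folds into each
; multiply for free and the fma is left untouched; unlike the f32 case
; there is no encoding penalty to avoid.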
; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]

; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]]
; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %b = load volatile double, double addrspace(1)* %b.gep
  %c = load volatile double, double addrspace(1)* %c.gep
  %d = load volatile double, double addrspace(1)* %d.gep

  %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0)
  %fneg.fma0 = fneg double %fma0
  %mul1 = fmul double %fneg.fma0, %c
  %mul2 = fmul double %fneg.fma0, %d

  store volatile double %mul1, double addrspace(1)* %out
  store volatile double %mul2, double addrspace(1)* %out
  ret void
}

; %trunc.a has only the fneg use, but folding the fneg into the
; v_trunc_f32 source would require the larger VOP3 encoding; the fneg
; can instead be folded for free into the fma use.
; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %trunc.a = call float @llvm.trunc.f32(float %a)
  %trunc.fneg.a = fneg float %trunc.a
  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  store volatile float %fma0, float addrspace(1)* %out
  ret void
}

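; Here %trunc.a also has a use that is not negated, so the fneg is
; again folded into the fma operand for free and the multiply reads the
; unmodified v_trunc_f32 result.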
; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %trunc.a = call float @llvm.trunc.f32(float %a)
  %trunc.fneg.a = fneg float %trunc.a
  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  %mul1 = fmul float %trunc.a, %d
  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; The AMDGPU combine to pull fneg into the FMA operands was being
; undone by the generic combine to pull the fneg out of the fma if
; !isFNegFree. We were reporting false for v2f32 even though it will
; be split into f32 where it will be free.
; GCN-LABEL: {{^}}fneg_fma_fneg_dagcombine_loop:
; GCN: s_brev_b32 [[NEGZERO:s[0-9]+]], 1{{$}}
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], v2, -v4, [[NEGZERO]]
; GCN-DAG: v_fma_f32 [[FMA1:v[0-9]+]], v3, -v5, [[NEGZERO]]
; GCN-DAG: v_sub_f32_e32 [[SUB0:v[0-9]+]], [[FMA0]], v0
; GCN-DAG: v_sub_f32_e32 [[SUB1:v[0-9]+]], [[FMA1]], v1
; GCN-DAG: v_mul_f32_e32 v0, [[SUB0]], v4
; GCN-DAG: v_mul_f32_e32 v1, [[SUB1]], v5
; GCN: s_setpc_b64
define <2 x float> @fneg_fma_fneg_dagcombine_loop(<2 x float> %arg, <2 x float> %arg1, <2 x float> %arg2) #0 {
bb:
  %i3 = call fast <2 x float> @llvm.fma.v2f32(<2 x float> %arg1, <2 x float> %arg2, <2 x float> zeroinitializer)
  %i4 = fadd fast <2 x float> %i3, %arg
  %i5 = fneg <2 x float> %i4
  %i6 = fmul fast <2 x float> %i5, %arg2
  ret <2 x float> %i6
}
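
; A minimal sketch of the shape of that fix, assuming the usual
; TargetLowering hook (the in-tree AMDGPU override may differ in
; detail):
;
;   bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
;     // Report this based on the legalized type: v2f32 is split into
;     // f32, where flipping the sign bit is free.
;     VT = VT.getScalarType();
;     return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
;   }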

; This expects denormal flushing, so this fmul can't be turned into an
; fneg.
; TODO: Keeping this as fmul saves encoding size
; GCN-LABEL: {{^}}nnan_fmul_neg1_to_fneg:
; GCN: v_sub_f32_e32 [[TMP:v[0-9]+]], 0x80000000, v0
; GCN-NEXT: v_mul_f32_e32 v0, [[TMP]], v1
define float @nnan_fmul_neg1_to_fneg(float %x, float %y) #0 {
  %mul = fmul float %x, -1.0
  %add = fmul nnan float %mul, %y
  ret float %add
}

; It's legal to turn this fmul into an fneg since denormals are
; preserved and we know an snan can't happen from the flag.
; GCN-LABEL: {{^}}denormal_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e64 v0, -v0, v1
; GCN-NEXT: s_setpc_b64
define float @denormal_fmul_neg1_to_fneg(float %x, float %y) {
  %mul = fmul nnan float %x, -1.0
  %add = fmul float %mul, %y
  ret float %add
}

; The canonicalizing multiply lets us know the source can't be an snan.
; GCN-LABEL: {{^}}denorm_snan_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e64 [[TMP:v[0-9]+]], v0, -v0
; GCN: v_mul_f32_e32 v0, [[TMP]], v1
; GCN-NEXT: s_setpc_b64
define float @denorm_snan_fmul_neg1_to_fneg(float %x, float %y) {
  %canonical = fmul float %x, %x
  %mul = fmul float %canonical, -1.0
  %add = fmul float %mul, %y
  ret float %add
}

; GCN-LABEL: {{^}}flush_snan_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], 1.0, v0
; GCN: v_sub_f32_e32 [[TMP1:v[0-9]+]], 0x80000000, [[TMP0]]
; GCN-NEXT: v_mul_f32_e32 v0, [[TMP1]], v1
define float @flush_snan_fmul_neg1_to_fneg(float %x, float %y) #0 {
  %quiet = call float @llvm.canonicalize.f32(float %x)
  %mul = fmul float %quiet, -1.0
  %add = fmul float %mul, %y
  ret float %add
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare float @llvm.fmuladd.f32(float, float, float) #1
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
declare float @llvm.sin.f32(float) #1
declare float @llvm.trunc.f32(float) #1
declare float @llvm.round.f32(float) #1
declare float @llvm.rint.f32(float) #1
declare float @llvm.nearbyint.f32(float) #1
declare float @llvm.canonicalize.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1
declare half @llvm.minnum.f16(half, half) #1
declare double @llvm.minnum.f64(double, double) #1
declare double @llvm.fma.f64(double, double, double) #1

declare float @llvm.amdgcn.sin.f32(float) #1
declare float @llvm.amdgcn.rcp.f32(float) #1
declare float @llvm.amdgcn.rcp.legacy(float) #1
declare float @llvm.amdgcn.fmul.legacy(float, float) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0

attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }