; RUN: llc -march=amdgcn -mcpu=hawaii -start-after=sink -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,SI %s
; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,SI %s

; RUN: llc -march=amdgcn -mcpu=fiji -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,VI %s
; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,VI %s

; Each test negates the result of a floating-point operation. The checks pin
; the instructions llc emits for it under the four configurations above
; (hawaii and fiji, each with and without -enable-no-signed-zeros-fp-math).

; --------------------------------------------------------------------------------
; fadd tests
; --------------------------------------------------------------------------------

; fneg of (fadd a, b); the fadd has no other user and the result is stored
; per work-item.
; GCN-LABEL: {{^}}v_fneg_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; fneg of (fadd a, b) where the un-negated fadd result is also stored.
; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %add, float addrspace(1)* %out
  ret void
}

; fneg of (fadd a, b) where the fadd result also feeds an fmul by 4.0.
; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  %use1 = fmul float %add, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; fneg of (fadd (fneg a), b).
; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32
; GCN-SAFE: v_xor_b32_e32 [[ADD:v[0-9]+]], 0x80000000,

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; fneg of (fadd a, (fneg b)).
; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fneg float %b
  %add = fadd float %a, %fneg.b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; fneg of (fadd (fneg a), (fneg b)).
; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %add = fadd float %fneg.a, %fneg.b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; fneg of (fadd (fneg a), b) where (fneg a) is also stored.
; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
; GCN-SAFE-DAG: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1{{$}}
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[SIGNBIT]], [[A]]
; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[SIGNBIT]], [[ADD]]

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; fneg of (fadd (fneg a), b) where (fneg a) also feeds an fmul by the kernel
; argument c.
; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; This one asserted with -enable-no-signed-zeros-fp-math
; GCN-LABEL: {{^}}fneg_fadd_0:
; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]],
; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]]
; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]]
define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 {
.entry:
  %tmp7 = fdiv float 1.000000e+00, %tmp6
  %tmp8 = fmul float 0.000000e+00, %tmp7
  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
  %.i188 = fadd float %tmp9, 0.000000e+00
  %tmp10 = fcmp uge float %.i188, %tmp2
  %tmp11 = fneg float %.i188
  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
  %tmp12 = fcmp ule float %.i092, 0.000000e+00
  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
  ret float %.i198
}

; This is a workaround because -enable-no-signed-zeros-fp-math does not set up
; function attribute unsafe-fp-math automatically. Combine with the previous test
; when that is done.
; GCN-LABEL: {{^}}fneg_fadd_0_nsz:
; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]],
; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]],
; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]],
; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]],
; GCN-NSZ-DAG: v_cmp_nlt_f32_e64 {{.*}}, -[[D]]
define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 {
.entry:
  %tmp7 = fdiv afn float 1.000000e+00, %tmp6
  %tmp8 = fmul float 0.000000e+00, %tmp7
  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
  %.i188 = fadd float %tmp9, 0.000000e+00
  %tmp10 = fcmp uge float %.i188, %tmp2
  %tmp11 = fneg float %.i188
  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
  %tmp12 = fcmp ule float %.i092, 0.000000e+00
  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
  ret float %.i198
}

; --------------------------------------------------------------------------------
; fmul tests
; --------------------------------------------------------------------------------

; fneg of (fmul a, b); the fmul has no other user and the result is stored
; per work-item.
; GCN-LABEL: {{^}}v_fneg_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; fneg of (fmul a, b) where the un-negated fmul result is also stored.
; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %mul, float addrspace(1)* %out
  ret void
}

; fneg of (fmul a, b) where the fmul result also feeds an fmul by 4.0.
; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %use1 = fmul float %mul, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; fneg of (fmul (fneg a), b).
; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; fneg of (fmul a, (fneg b)).
; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fneg float %b
  %mul = fmul float %a, %fneg.b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; fneg of (fmul (fneg a), (fneg b)).
; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %mul = fmul float %fneg.a, %fneg.b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; fneg of (fmul (fneg a), b) where (fneg a) is also stored.
; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; fneg of (fmul (fneg a), b) where (fneg a) also feeds an fmul by the kernel
; argument c.
; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fminnum tests
; --------------------------------------------------------------------------------

; fneg of minnum(a, b) in an amdgpu_kernel function; both operands are loaded.
; GCN-LABEL: {{^}}v_fneg_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; fneg of minnum(a, b) in an amdgpu_ps function taking a and b as arguments.
; GCN-LABEL: {{^}}v_fneg_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  ret float %fneg
}

; fneg of minnum(a, a) in an amdgpu_kernel function.
; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float %a, float %a)
  %min.fneg = fneg float %min
  store float %min.fneg, float addrspace(1)* %out.gep
  ret void
}

; fneg of minnum(a, a) in an amdgpu_ps function.
; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, -v0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_self_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %a)
  %min.fneg = fneg float %min
  ret float %min.fneg
}

; fneg of minnum(4.0, a) in an amdgpu_kernel function.
; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 4.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; fneg of minnum(4.0, a) in an amdgpu_ps function.
; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, -4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_posk_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float 4.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; fneg of minnum(-4.0, a) in an amdgpu_kernel function.
; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float -4.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; fneg of minnum(-4.0, a) in an amdgpu_ps function.
; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, 4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float -4.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; fneg of an nnan minnum(0.0, a). The negative check asserts that nothing
; between the load and the v_min mentions the loaded register.
; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-NOT: [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
; GCN: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MIN]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call nnan float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; fneg of minnum(-0.0, a) in an amdgpu_kernel function.
; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float -0.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; fneg of minnum(K, a) with the float constant K = 0x3FC45F3060000000.
; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]

; SI-DAG: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]

; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; fneg of minnum(K, a) with the float constant K = 0xBFC45F3060000000.
; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]

; SI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0x3e22f983, [[NEG_QUIET]]

; VI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 0xBFC45F3060000000, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; Negation (written as fsub -0.0, x) of minnum(K, a) on half, with
; K = 0xH3118.
; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f16:
; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0xbe230000, [[CVT]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]

; VI: v_max_f16_e32 [[QUIET:v[0-9]+]], [[A]], [[A]]
; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x8000, [[MAX]]

; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %min = call half @llvm.minnum.f16(half 0xH3118, half %a)
  %fneg = fsub half -0.000000e+00, %min
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; Negation (written as fsub -0.0, x) of minnum(K, a) on half, with
; K = 0xHB118.
; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f16:
; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0x3e230000, [[CVT]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]

; VI: v_max_f16_e64 [[NEG_QUIET:v[0-9]+]], -[[A]], -[[A]]
; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]

; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %min = call half @llvm.minnum.f16(half 0xHB118, half %a)
  %fneg = fsub half -0.000000e+00, %min
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; Negation (written as fsub -0.0, x) of minnum(K, a) on double, with
; K = 0x3fc45f306dc9c882.
; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]

; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfc45f30
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; SI: v_max_f64 v[[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]], [[NEG_QUIET]], s[[[K_LO]]:[[K_HI]]]

; VI: v_min_f64 v[[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]], [[A]], 0.15915494
; VI: v_xor_b32_e32 v[[RESULT_HI]], 0x80000000, v[[RESULT_HI]]

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[RESULT_LO]]:[[RESULT_HI]]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %min = call double @llvm.minnum.f64(double 0x3fc45f306dc9c882, double %a)
  %fneg = fsub double -0.000000e+00, %min
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; Negation (written as fsub -0.0, x) of minnum(K, a) on double, with
; K = 0xbfc45f306dc9c882.
; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]

; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fc45f30
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s[[[K_LO]]:[[K_HI]]]

; VI: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], 0.15915494

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %min = call double @llvm.minnum.f64(double 0xbfc45f306dc9c882, double %a)
  %fneg = fsub double -0.000000e+00, %min
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; fneg of minnum(-0.0, a) in an amdgpu_ps function.
; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, 0{{$}}
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_neg0_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float -0.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg,
%b 767 store float %mul, float addrspace(1)* %out.gep 768 ret void 769} 770 771; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32: 772; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 773; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 774 775; SI: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]] 776 777; SI: v_max_f32_e32 [[MIN:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]] 778; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[B]] 779 780; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]] 781; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]] 782; VI: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]] 783 784; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 785define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 786 %tid = call i32 @llvm.amdgcn.workitem.id.x() 787 %tid.ext = sext i32 %tid to i64 788 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 789 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 790 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 791 %a = load volatile float, float addrspace(1)* %a.gep 792 %b = load volatile float, float addrspace(1)* %b.gep 793 %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a) 794 %fneg = fneg float %min 795 %mul = fmul float %fneg, %b 796 store float %mul, float addrspace(1)* %out.gep 797 ret void 798} 799 800; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_no_ieee: 801; GCN-NOT: v0 802; GCN-NOT: v1 803; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, v0 804; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], v1 805; GCN-NEXT: ; return 806define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 { 807 %min = call float @llvm.minnum.f32(float 0.0, float %a) 808 %fneg = fneg float %min 809 %mul = fmul float %fneg, %b 810 ret float %mul 811} 812 813; GCN-LABEL: 
{{^}}v_fneg_minnum_multi_use_minnum_f32_ieee: 814; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 815; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 816; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]] 817; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]] 818; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]] 819; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]] 820; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]] 821; GCN-NEXT: s_waitcnt vmcnt(0) 822; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 823; GCN-NEXT: s_waitcnt vmcnt(0) 824define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 825 %tid = call i32 @llvm.amdgcn.workitem.id.x() 826 %tid.ext = sext i32 %tid to i64 827 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 828 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 829 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 830 %a = load volatile float, float addrspace(1)* %a.gep 831 %b = load volatile float, float addrspace(1)* %b.gep 832 %min = call float @llvm.minnum.f32(float %a, float %b) 833 %fneg = fneg float %min 834 %use1 = fmul float %min, 4.0 835 store volatile float %fneg, float addrspace(1)* %out 836 store volatile float %use1, float addrspace(1)* %out 837 ret void 838} 839 840; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_no_ieee: 841; GCN-NOT: v0 842; GCN-NOT: v1 843; GCN: v_max_f32_e64 v0, -v0, -v1 844; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0 845; GCN-NEXT: ; return 846define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 { 847 %min = call float @llvm.minnum.f32(float %a, float %b) 848 %fneg = fneg float %min 849 %use1 = fmul float %min, 4.0 850 %ins0 = insertelement <2 x float> undef, float %fneg, i32 0 851 %ins1 = insertelement <2 x 
float> %ins0, float %use1, i32 1 852 ret <2 x float> %ins1 853} 854 855; -------------------------------------------------------------------------------- 856; fmaxnum tests 857; -------------------------------------------------------------------------------- 858 859 860; GCN-LABEL: {{^}}v_fneg_maxnum_f32_ieee: 861; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 862; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 863; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]] 864; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]] 865; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]] 866; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 867define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 868 %tid = call i32 @llvm.amdgcn.workitem.id.x() 869 %tid.ext = sext i32 %tid to i64 870 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 871 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 872 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 873 %a = load volatile float, float addrspace(1)* %a.gep 874 %b = load volatile float, float addrspace(1)* %b.gep 875 %max = call float @llvm.maxnum.f32(float %a, float %b) 876 %fneg = fneg float %max 877 store float %fneg, float addrspace(1)* %out.gep 878 ret void 879} 880 881; GCN-LABEL: {{^}}v_fneg_maxnum_f32_no_ieee: 882; GCN-NOT: v0 883; GCN-NOT: v1 884; GCN: v_min_f32_e64 v0, -v0, -v1 885; GCN-NEXT: ; return 886define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 { 887 %max = call float @llvm.maxnum.f32(float %a, float %b) 888 %fneg = fneg float %max 889 ret float %fneg 890} 891 892; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_ieee: 893; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 894; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]] 895; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], 
[[NEG_QUIET_A]] 896; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 897define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 898 %tid = call i32 @llvm.amdgcn.workitem.id.x() 899 %tid.ext = sext i32 %tid to i64 900 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 901 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 902 %a = load volatile float, float addrspace(1)* %a.gep 903 %max = call float @llvm.maxnum.f32(float %a, float %a) 904 %max.fneg = fneg float %max 905 store float %max.fneg, float addrspace(1)* %out.gep 906 ret void 907} 908 909; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_no_ieee: 910; GCN-NOT: v0 911; GCN: v_min_f32_e64 v0, -v0, -v0 912; GCN-NEXT: ; return 913define amdgpu_ps float @v_fneg_self_maxnum_f32_no_ieee(float %a) #0 { 914 %max = call float @llvm.maxnum.f32(float %a, float %a) 915 %max.fneg = fneg float %max 916 ret float %max.fneg 917} 918 919; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_ieee: 920; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 921; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]] 922; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]] 923; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 924define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 925 %tid = call i32 @llvm.amdgcn.workitem.id.x() 926 %tid.ext = sext i32 %tid to i64 927 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 928 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 929 %a = load volatile float, float addrspace(1)* %a.gep 930 %max = call float @llvm.maxnum.f32(float 4.0, float %a) 931 %fneg = fneg float %max 932 store float %fneg, float addrspace(1)* %out.gep 933 ret void 934} 935 936; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_no_ieee: 937; GCN-NOT: v0 938; GCN: v_min_f32_e64 v0, -v0, -4.0 939; 
GCN-NEXT: ; return 940define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 { 941 %max = call float @llvm.maxnum.f32(float 4.0, float %a) 942 %fneg = fneg float %max 943 ret float %fneg 944} 945 946; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_ieee: 947; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 948; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]] 949; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]] 950; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 951define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 952 %tid = call i32 @llvm.amdgcn.workitem.id.x() 953 %tid.ext = sext i32 %tid to i64 954 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 955 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 956 %a = load volatile float, float addrspace(1)* %a.gep 957 %max = call float @llvm.maxnum.f32(float -4.0, float %a) 958 %fneg = fneg float %max 959 store float %fneg, float addrspace(1)* %out.gep 960 ret void 961} 962 963; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_no_ieee: 964; GCN-NOT: v0 965; GCN: v_min_f32_e64 v0, -v0, 4.0 966; GCN-NEXT: ; return 967define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 { 968 %max = call float @llvm.maxnum.f32(float -4.0, float %a) 969 %fneg = fneg float %max 970 ret float %fneg 971} 972 973; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32: 974; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 975; GCN-NOT: [[A]] 976; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]] 977; GCN: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]] 978; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 979define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 980 %tid = call i32 @llvm.amdgcn.workitem.id.x() 981 %tid.ext = sext i32 %tid to i64 982 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 983 %out.gep = 
getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 984 %a = load volatile float, float addrspace(1)* %a.gep 985 %max = call nnan float @llvm.maxnum.f32(float 0.0, float %a) 986 %fneg = fneg float %max 987 store float %fneg, float addrspace(1)* %out.gep 988 ret void 989} 990 991; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_ieee: 992; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 993; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]] 994; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]] 995; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 996define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 997 %tid = call i32 @llvm.amdgcn.workitem.id.x() 998 %tid.ext = sext i32 %tid to i64 999 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1000 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1001 %a = load volatile float, float addrspace(1)* %a.gep 1002 %max = call float @llvm.maxnum.f32(float -0.0, float %a) 1003 %fneg = fneg float %max 1004 store float %fneg, float addrspace(1)* %out.gep 1005 ret void 1006} 1007 1008; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_no_ieee: 1009; GCN-NOT: v0 1010; GCN: v_min_f32_e64 v0, -v0, 0{{$}} 1011; GCN-NEXT: ; return 1012define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 { 1013 %max = call float @llvm.maxnum.f32(float -0.0, float %a) 1014 %fneg = fneg float %max 1015 ret float %fneg 1016} 1017 1018; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_ieee: 1019; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1020; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1021; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]] 1022; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]] 1023; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]] 1024; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1025define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(float 
addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 1026 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1027 %tid.ext = sext i32 %tid to i64 1028 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1029 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1030 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1031 %a = load volatile float, float addrspace(1)* %a.gep 1032 %b = load volatile float, float addrspace(1)* %b.gep 1033 %max = call float @llvm.maxnum.f32(float 0.0, float %a) 1034 %fneg = fneg float %max 1035 %mul = fmul float %fneg, %b 1036 store float %mul, float addrspace(1)* %out.gep 1037 ret void 1038} 1039 1040; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_no_ieee: 1041; GCN-NOT: v0 1042; GCN-NOT: v1 1043; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, v0 1044; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], v1 1045; GCN-NEXT: ; return 1046define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 { 1047 %max = call float @llvm.maxnum.f32(float 0.0, float %a) 1048 %fneg = fneg float %max 1049 %mul = fmul float %fneg, %b 1050 ret float %mul 1051} 1052 1053; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_ieee: 1054; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1055; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1056; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]] 1057; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]] 1058; GCN: v_min_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]] 1059; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]] 1060; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]] 1061; GCN-NEXT: s_waitcnt vmcnt(0) 1062; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 1063; GCN-NEXT: s_waitcnt vmcnt(0) 1064define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float 
addrspace(1)* %b.ptr) #0 { 1065 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1066 %tid.ext = sext i32 %tid to i64 1067 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1068 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1069 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1070 %a = load volatile float, float addrspace(1)* %a.gep 1071 %b = load volatile float, float addrspace(1)* %b.gep 1072 %max = call float @llvm.maxnum.f32(float %a, float %b) 1073 %fneg = fneg float %max 1074 %use1 = fmul float %max, 4.0 1075 store volatile float %fneg, float addrspace(1)* %out 1076 store volatile float %use1, float addrspace(1)* %out 1077 ret void 1078} 1079 1080; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_no_ieee: 1081; GCN-NOT: v0 1082; GCN-NOT: v1 1083; GCN: v_min_f32_e64 v0, -v0, -v1 1084; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0 1085; GCN-NEXT: ; return 1086define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 { 1087 %max = call float @llvm.maxnum.f32(float %a, float %b) 1088 %fneg = fneg float %max 1089 %use1 = fmul float %max, 4.0 1090 %ins0 = insertelement <2 x float> undef, float %fneg, i32 0 1091 %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1 1092 ret <2 x float> %ins1 1093} 1094 1095; -------------------------------------------------------------------------------- 1096; fma tests 1097; -------------------------------------------------------------------------------- 1098 1099; GCN-LABEL: {{^}}v_fneg_fma_f32: 1100; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1101; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1102; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1103 1104; GCN-SAFE: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] 1105; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[RESULT]] 1106 1107; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]] 1108; GCN-NSZ-NEXT: flat_store_dword 
v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1109define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1110 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1111 %tid.ext = sext i32 %tid to i64 1112 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1113 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1114 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1115 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1116 %a = load volatile float, float addrspace(1)* %a.gep 1117 %b = load volatile float, float addrspace(1)* %b.gep 1118 %c = load volatile float, float addrspace(1)* %c.gep 1119 %fma = call float @llvm.fma.f32(float %a, float %b, float %c) 1120 %fneg = fneg float %fma 1121 store float %fneg, float addrspace(1)* %out.gep 1122 ret void 1123} 1124 1125; GCN-LABEL: {{^}}v_fneg_fma_store_use_fma_f32: 1126; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1127; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1128; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1129; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] 1130; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]] 1131; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]] 1132; GCN-NEXT: s_waitcnt vmcnt(0) 1133; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1134; GCN-NEXT: s_waitcnt vmcnt(0) 1135define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1136 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1137 %tid.ext = sext i32 %tid to i64 1138 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1139 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1140 %c.gep = getelementptr inbounds float, float 
addrspace(1)* %c.ptr, i64 %tid.ext 1141 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1142 %a = load volatile float, float addrspace(1)* %a.gep 1143 %b = load volatile float, float addrspace(1)* %b.gep 1144 %c = load volatile float, float addrspace(1)* %c.gep 1145 %fma = call float @llvm.fma.f32(float %a, float %b, float %c) 1146 %fneg = fneg float %fma 1147 store volatile float %fneg, float addrspace(1)* %out 1148 store volatile float %fma, float addrspace(1)* %out 1149 ret void 1150} 1151 1152; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fma_f32: 1153; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1154; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1155; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1156 1157; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] 1158; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]] 1159; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]] 1160 1161; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]] 1162; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]] 1163 1164; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]] 1165; GCN-NEXT: s_waitcnt vmcnt(0) 1166; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 1167; GCN-NEXT: s_waitcnt vmcnt(0) 1168define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1169 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1170 %tid.ext = sext i32 %tid to i64 1171 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1172 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1173 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1174 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1175 %a = load volatile float, float addrspace(1)* %a.gep 1176 %b = load volatile float, float addrspace(1)* 
%b.gep 1177 %c = load volatile float, float addrspace(1)* %c.gep 1178 %fma = call float @llvm.fma.f32(float %a, float %b, float %c) 1179 %fneg = fneg float %fma 1180 %use1 = fmul float %fma, 4.0 1181 store volatile float %fneg, float addrspace(1)* %out 1182 store volatile float %use1, float addrspace(1)* %out 1183 ret void 1184} 1185 1186; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_y_f32: 1187; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1188; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1189; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1190 1191; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], [[C]] 1192; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1193 1194; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] 1195; GCN-NSZ-NOT: [[FMA]] 1196; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1197define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1198 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1199 %tid.ext = sext i32 %tid to i64 1200 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1201 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1202 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1203 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1204 %a = load volatile float, float addrspace(1)* %a.gep 1205 %b = load volatile float, float addrspace(1)* %b.gep 1206 %c = load volatile float, float addrspace(1)* %c.gep 1207 %fneg.a = fneg float %a 1208 %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) 1209 %fneg = fneg float %fma 1210 store volatile float %fneg, float addrspace(1)* %out 1211 ret void 1212} 1213 1214; GCN-LABEL: {{^}}v_fneg_fma_x_fneg_y_f32: 1215; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1216; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1217; GCN: 
{{buffer|flat}}_load_dword [[C:v[0-9]+]] 1218 1219; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]] 1220; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1221 1222; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] 1223; GCN-NSZ-NOT: [[FMA]] 1224; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1225define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1226 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1227 %tid.ext = sext i32 %tid to i64 1228 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1229 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1230 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1231 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1232 %a = load volatile float, float addrspace(1)* %a.gep 1233 %b = load volatile float, float addrspace(1)* %b.gep 1234 %c = load volatile float, float addrspace(1)* %c.gep 1235 %fneg.b = fneg float %b 1236 %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c) 1237 %fneg = fneg float %fma 1238 store volatile float %fneg, float addrspace(1)* %out 1239 ret void 1240} 1241 1242; GCN-LABEL: {{^}}v_fneg_fma_fneg_fneg_y_f32: 1243; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1244; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1245; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1246 1247; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] 1248; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1249 1250; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]] 1251; GCN-NSZ-NOT: [[FMA]] 1252; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1253define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1254 %tid = 
call i32 @llvm.amdgcn.workitem.id.x() 1255 %tid.ext = sext i32 %tid to i64 1256 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1257 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1258 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1259 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1260 %a = load volatile float, float addrspace(1)* %a.gep 1261 %b = load volatile float, float addrspace(1)* %b.gep 1262 %c = load volatile float, float addrspace(1)* %c.gep 1263 %fneg.a = fneg float %a 1264 %fneg.b = fneg float %b 1265 %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c) 1266 %fneg = fneg float %fma 1267 store volatile float %fneg, float addrspace(1)* %out 1268 ret void 1269} 1270 1271; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_fneg_f32: 1272; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1273; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1274; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1275 1276; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], -[[C]] 1277; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1278 1279; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] 1280; GCN-NSZ-NOT: [[FMA]] 1281; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1282define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1283 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1284 %tid.ext = sext i32 %tid to i64 1285 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1286 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1287 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1288 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1289 %a = load volatile float, float addrspace(1)* %a.gep 1290 %b 
= load volatile float, float addrspace(1)* %b.gep 1291 %c = load volatile float, float addrspace(1)* %c.gep 1292 %fneg.a = fneg float %a 1293 %fneg.c = fneg float %c 1294 %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c) 1295 %fneg = fneg float %fma 1296 store volatile float %fneg, float addrspace(1)* %out 1297 ret void 1298} 1299 1300; GCN-LABEL: {{^}}v_fneg_fma_x_y_fneg_f32: 1301; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1302; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1303; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1304 ; Without nsz the outer fneg is expected to remain a separate sign-bit xor after the fma 1305; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] 1306; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1307 1308; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]] 1309; GCN-NSZ-NOT: [[FMA]] 1310; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1311define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1312 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1313 %tid.ext = sext i32 %tid to i64 1314 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1315 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1316 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1317 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1318 %a = load volatile float, float addrspace(1)* %a.gep 1319 %b = load volatile float, float addrspace(1)* %b.gep 1320 %c = load volatile float, float addrspace(1)* %c.gep 1321 %fneg.c = fneg float %c 1322 %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c) 1323 %fneg = fneg float %fma 1324 store volatile float %fneg, float addrspace(1)* %out 1325 ret void 1326} 1327 1328; GCN-LABEL: {{^}}v_fneg_fma_store_use_fneg_x_y_f32: 1329; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1330; GCN: {{buffer|flat}}_load_dword
[[B:v[0-9]+]] 1331; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1332 1333; GCN-SAFE: v_xor_b32 1334; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], 1335; GCN-SAFE: v_xor_b32 1336 1337; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] 1338; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] 1339 1340; GCN-NSZ-NOT: [[FMA]] 1341; GCN-NSZ-NOT: [[NEG_A]] 1342; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1343; GCN-NSZ-NOT: [[NEG_A]] 1344; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]] 1345define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1346 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1347 %tid.ext = sext i32 %tid to i64 1348 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1349 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1350 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1351 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1352 %a = load volatile float, float addrspace(1)* %a.gep 1353 %b = load volatile float, float addrspace(1)* %b.gep 1354 %c = load volatile float, float addrspace(1)* %c.gep 1355 %fneg.a = fneg float %a 1356 %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) 1357 %fneg = fneg float %fma 1358 store volatile float %fneg, float addrspace(1)* %out 1359 store volatile float %fneg.a, float addrspace(1)* %out 1360 ret void 1361} 1362 1363; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fneg_x_y_f32: 1364; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1365; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1366; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1367 1368; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} 1369; GCN-SAFE-DAG: v_fma_f32 [[FMA:v[0-9]+]] 1370; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1371 1372; 
GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]] 1373; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]] 1374; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) 1375; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 1376; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) 1377define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 { 1378 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1379 %tid.ext = sext i32 %tid to i64 1380 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1381 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1382 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1383 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1384 %a = load volatile float, float addrspace(1)* %a.gep 1385 %b = load volatile float, float addrspace(1)* %b.gep 1386 %c = load volatile float, float addrspace(1)* %c.gep 1387 %fneg.a = fneg float %a 1388 %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) 1389 %fneg = fneg float %fma 1390 %use1 = fmul float %fneg.a, %d 1391 store volatile float %fneg, float addrspace(1)* %out 1392 store volatile float %use1, float addrspace(1)* %out 1393 ret void 1394} 1395 1396; -------------------------------------------------------------------------------- 1397; fmad tests 1398; -------------------------------------------------------------------------------- 1399 1400; GCN-LABEL: {{^}}v_fneg_fmad_f32: 1401; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1402; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1403; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1404 1405; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]] 1406; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]] 1407 1408; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]] 1409; GCN-NSZ-NEXT: flat_store_dword 
v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1410define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1411 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1412 %tid.ext = sext i32 %tid to i64 1413 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1414 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1415 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1416 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1417 %a = load volatile float, float addrspace(1)* %a.gep 1418 %b = load volatile float, float addrspace(1)* %b.gep 1419 %c = load volatile float, float addrspace(1)* %c.gep 1420 %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c) 1421 %fneg = fneg float %fma 1422 store float %fneg, float addrspace(1)* %out.gep 1423 ret void 1424} 1425 1426; GCN-LABEL: {{^}}v_fneg_fmad_v4f32: 1427 1428; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} 1429; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} 1430; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} 1431; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} 1432define amdgpu_kernel void @v_fneg_fmad_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %a.ptr, <4 x float> addrspace(1)* %b.ptr, <4 x float> addrspace(1)* %c.ptr) #0 { 1433 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1434 %tid.ext = sext i32 %tid to i64 1435 %a.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a.ptr, i64 %tid.ext 1436 %b.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %b.ptr, i64 %tid.ext 1437 %c.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %c.ptr, i64 %tid.ext 1438 %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext 
1439 %a = load volatile <4 x float>, <4 x float> addrspace(1)* %a.gep 1440 %b = load volatile <4 x float>, <4 x float> addrspace(1)* %b.gep 1441 %c = load volatile <4 x float>, <4 x float> addrspace(1)* %c.gep 1442 %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) 1443 %fneg = fneg <4 x float> %fma 1444 store <4 x float> %fneg, <4 x float> addrspace(1)* %out.gep 1445 ret void 1446} 1447 1448; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32: 1449; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1450; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1451; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1452 1453; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]] 1454; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]] 1455; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]] 1456 1457; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], [[A]], -[[B]], -[[C]] 1458; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]] 1459 1460; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]] 1461; GCN-NEXT: s_waitcnt vmcnt(0) 1462; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 1463; GCN-NEXT: s_waitcnt vmcnt(0) 1464define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 1465 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1466 %tid.ext = sext i32 %tid to i64 1467 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1468 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1469 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 1470 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1471 %a = load volatile float, float addrspace(1)* %a.gep 1472 %b = load volatile float, float addrspace(1)* %b.gep 1473 %c = load volatile float, float addrspace(1)* %c.gep 1474 %fma = call float @llvm.fmuladd.f32(float 
%a, float %b, float %c) 1475 %fneg = fneg float %fma 1476 %use1 = fmul float %fma, 4.0 1477 store volatile float %fneg, float addrspace(1)* %out 1478 store volatile float %use1, float addrspace(1)* %out 1479 ret void 1480} 1481 1482; -------------------------------------------------------------------------------- 1483; fp_extend tests 1484; -------------------------------------------------------------------------------- 1485 1486; GCN-LABEL: {{^}}v_fneg_fp_extend_f32_to_f64: 1487; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1488; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]] 1489; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1490define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1491 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1492 %tid.ext = sext i32 %tid to i64 1493 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1494 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext 1495 %a = load volatile float, float addrspace(1)* %a.gep 1496 %fpext = fpext float %a to double 1497 %fneg = fsub double -0.000000e+00, %fpext 1498 store double %fneg, double addrspace(1)* %out.gep 1499 ret void 1500} 1501 1502; GCN-LABEL: {{^}}v_fneg_fp_extend_fneg_f32_to_f64: 1503; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1504; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]] 1505; GCN: {{buffer|flat}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1506define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1507 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1508 %tid.ext = sext i32 %tid to i64 1509 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1510 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext 1511 %a = load volatile float, float addrspace(1)* %a.gep 1512 %fneg.a = fneg float %a 1513 %fpext = 
fpext float %fneg.a to double 1514 %fneg = fsub double -0.000000e+00, %fpext 1515 store double %fneg, double addrspace(1)* %out.gep 1516 ret void 1517} 1518 1519; GCN-LABEL: {{^}}v_fneg_fp_extend_store_use_fneg_f32_to_f64: 1520; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1521; GCN-DAG: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]] 1522; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]] 1523; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1524; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FNEG_A]] 1525define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1526 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1527 %tid.ext = sext i32 %tid to i64 1528 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1529 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext 1530 %a = load volatile float, float addrspace(1)* %a.gep 1531 %fneg.a = fneg float %a 1532 %fpext = fpext float %fneg.a to double 1533 %fneg = fsub double -0.000000e+00, %fpext 1534 store volatile double %fneg, double addrspace(1)* %out.gep 1535 store volatile float %fneg.a, float addrspace(1)* undef 1536 ret void 1537} 1538 1539; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64: 1540; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1541; GCN-DAG: v_cvt_f64_f32_e32 v[[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]], [[A]] 1542; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]] 1543; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]] 1544; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[CVT_LO]]:[[CVT_HI]]] 1545define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1546 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1547 %tid.ext = sext i32 %tid to i64 1548 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 
1549 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext 1550 %a = load volatile float, float addrspace(1)* %a.gep 1551 %fpext = fpext float %a to double 1552 %fneg = fsub double -0.000000e+00, %fpext 1553 store volatile double %fneg, double addrspace(1)* %out.gep 1554 store volatile double %fpext, double addrspace(1)* undef 1555 ret void 1556} 1557 1558; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64: 1559; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1560; GCN-DAG: v_cvt_f64_f32_e32 v[[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]], [[A]] 1561; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]] 1562; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v[[[CVT_LO]]:[[CVT_HI]]], 4.0 1563; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]] 1564; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 1565define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1566 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1567 %tid.ext = sext i32 %tid to i64 1568 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1569 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext 1570 %a = load volatile float, float addrspace(1)* %a.gep 1571 %fpext = fpext float %a to double 1572 %fneg = fsub double -0.000000e+00, %fpext 1573 %mul = fmul double %fpext, 4.0 1574 store volatile double %fneg, double addrspace(1)* %out.gep 1575 store volatile double %mul, double addrspace(1)* %out.gep 1576 ret void 1577} 1578 1579; FIXME: Source modifiers not folded for f16->f32 1580; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32: 1581define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { 1582 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1583 %tid.ext = sext i32 %tid to i64 1584 %a.gep = getelementptr inbounds 
half, half addrspace(1)* %a.ptr, i64 %tid.ext 1585 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1586 %a = load volatile half, half addrspace(1)* %a.gep 1587 %fpext = fpext half %a to float 1588 %fneg = fneg float %fpext 1589 store volatile float %fneg, float addrspace(1)* %out.gep 1590 store volatile float %fpext, float addrspace(1)* %out.gep 1591 ret void 1592} 1593 1594; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32: 1595define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { 1596 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1597 %tid.ext = sext i32 %tid to i64 1598 %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext 1599 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1600 %a = load volatile half, half addrspace(1)* %a.gep 1601 %fpext = fpext half %a to float 1602 %fneg = fneg float %fpext 1603 %mul = fmul float %fpext, 4.0 1604 store volatile float %fneg, float addrspace(1)* %out.gep 1605 store volatile float %mul, float addrspace(1)* %out.gep 1606 ret void 1607} 1608 1609; -------------------------------------------------------------------------------- 1610; fp_round tests 1611; -------------------------------------------------------------------------------- 1612 1613; GCN-LABEL: {{^}}v_fneg_fp_round_f64_to_f32: 1614; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] 1615; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]] 1616; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1617define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { 1618 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1619 %tid.ext = sext i32 %tid to i64 1620 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext 1621 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1622 
%a = load volatile double, double addrspace(1)* %a.gep 1623 %fpround = fptrunc double %a to float 1624 %fneg = fneg float %fpround 1625 store float %fneg, float addrspace(1)* %out.gep 1626 ret void 1627} 1628 1629; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f64_to_f32: 1630; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] 1631; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]] 1632; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1633define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { 1634 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1635 %tid.ext = sext i32 %tid to i64 1636 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext 1637 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1638 %a = load volatile double, double addrspace(1)* %a.gep 1639 %fneg.a = fsub double -0.000000e+00, %a 1640 %fpround = fptrunc double %fneg.a to float 1641 %fneg = fneg float %fpround 1642 store float %fneg, float addrspace(1)* %out.gep 1643 ret void 1644} 1645 1646; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32: 1647; GCN: {{buffer|flat}}_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]] 1648; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v[[[A_LO]]:[[A_HI]]] 1649; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]] 1650; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1651; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[A_LO]]:[[NEG_A_HI]]] 1652define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { 1653 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1654 %tid.ext = sext i32 %tid to i64 1655 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext 1656 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1657 %a = load volatile double, double addrspace(1)* %a.gep 1658 %fneg.a = fsub 
double -0.000000e+00, %a 1659 %fpround = fptrunc double %fneg.a to float 1660 %fneg = fneg float %fpround 1661 store volatile float %fneg, float addrspace(1)* %out.gep 1662 store volatile double %fneg.a, double addrspace(1)* undef 1663 ret void 1664} 1665 1666; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32: 1667; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] 1668; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]] 1669; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s[ 1670 1671; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1672; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[USE1]] 1673define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 { 1674 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1675 %tid.ext = sext i32 %tid to i64 1676 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext 1677 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1678 %a = load volatile double, double addrspace(1)* %a.gep 1679 %fneg.a = fsub double -0.000000e+00, %a 1680 %fpround = fptrunc double %fneg.a to float 1681 %fneg = fneg float %fpround 1682 %use1 = fmul double %fneg.a, %c 1683 store volatile float %fneg, float addrspace(1)* %out.gep 1684 store volatile double %use1, double addrspace(1)* undef 1685 ret void 1686} 1687 1688; GCN-LABEL: {{^}}v_fneg_fp_round_f32_to_f16: 1689; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1690; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 1691; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1692define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1693 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1694 %tid.ext = sext i32 %tid to i64 1695 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1696 %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 
%tid.ext 1697 %a = load volatile float, float addrspace(1)* %a.gep 1698 %fpround = fptrunc float %a to half 1699 %fneg = fsub half -0.000000e+00, %fpround 1700 store half %fneg, half addrspace(1)* %out.gep 1701 ret void 1702} 1703 1704; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f32_to_f16: 1705; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1706; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]] 1707; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1708define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1709 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1710 %tid.ext = sext i32 %tid to i64 1711 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1712 %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext 1713 %a = load volatile float, float addrspace(1)* %a.gep 1714 %fneg.a = fneg float %a 1715 %fpround = fptrunc float %fneg.a to half 1716 %fneg = fsub half -0.000000e+00, %fpround 1717 store half %fneg, half addrspace(1)* %out.gep 1718 ret void 1719} 1720 1721; GCN-LABEL: {{^}}v_fneg_multi_use_fp_round_fneg_f64_to_f32: 1722; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] 1723; GCN-DAG: v_cvt_f32_f64_e32 [[CVT:v[0-9]+]], [[A]] 1724; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]] 1725; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG]] 1726; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[CVT]] 1727define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { 1728 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1729 %tid.ext = sext i32 %tid to i64 1730 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext 1731 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1732 %a = load volatile double, double addrspace(1)* %a.gep 1733 %fpround = fptrunc double %a to float 1734 %fneg = fneg float %fpround 1735 store 
volatile float %fneg, float addrspace(1)* %out.gep 1736 store volatile float %fpround, float addrspace(1)* %out.gep 1737 ret void 1738} 1739 1740; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f32_to_f16: 1741; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1742; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]] 1743; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] 1744; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1745; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]] 1746define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1747 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1748 %tid.ext = sext i32 %tid to i64 1749 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1750 %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext 1751 %a = load volatile float, float addrspace(1)* %a.gep 1752 %fneg.a = fneg float %a 1753 %fpround = fptrunc float %fneg.a to half 1754 %fneg = fsub half -0.000000e+00, %fpround 1755 store volatile half %fneg, half addrspace(1)* %out.gep 1756 store volatile float %fneg.a, float addrspace(1)* undef 1757 ret void 1758} 1759 1760; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f32_to_f16: 1761; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1762; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]] 1763; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s 1764; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1765; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[USE1]] 1766define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { 1767 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1768 %tid.ext = sext i32 %tid to i64 1769 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1770 %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext 1771 %a = load volatile 
float, float addrspace(1)* %a.gep 1772 %fneg.a = fneg float %a 1773 %fpround = fptrunc float %fneg.a to half 1774 %fneg = fsub half -0.000000e+00, %fpround 1775 %use1 = fmul float %fneg.a, %c 1776 store volatile half %fneg, half addrspace(1)* %out.gep 1777 store volatile float %use1, float addrspace(1)* undef 1778 ret void 1779} 1780 1781; -------------------------------------------------------------------------------- 1782; rcp tests 1783; -------------------------------------------------------------------------------- 1784 1785; GCN-LABEL: {{^}}v_fneg_rcp_f32: 1786; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1787; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 1788; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1789define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1790 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1791 %tid.ext = sext i32 %tid to i64 1792 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1793 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1794 %a = load volatile float, float addrspace(1)* %a.gep 1795 %rcp = call float @llvm.amdgcn.rcp.f32(float %a) 1796 %fneg = fneg float %rcp 1797 store float %fneg, float addrspace(1)* %out.gep 1798 ret void 1799} 1800 1801; GCN-LABEL: {{^}}v_fneg_rcp_fneg_f32: 1802; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1803; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]] 1804; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1805define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1806 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1807 %tid.ext = sext i32 %tid to i64 1808 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1809 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1810 %a = load volatile float, float addrspace(1)* %a.gep 1811 %fneg.a = fneg float %a 1812 %rcp = call 
float @llvm.amdgcn.rcp.f32(float %fneg.a) 1813 %fneg = fneg float %rcp 1814 store float %fneg, float addrspace(1)* %out.gep 1815 ret void 1816} 1817 1818; GCN-LABEL: {{^}}v_fneg_rcp_store_use_fneg_f32: 1819; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1820; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]] 1821; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] 1822; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1823; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]] 1824define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 1825 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1826 %tid.ext = sext i32 %tid to i64 1827 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1828 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1829 %a = load volatile float, float addrspace(1)* %a.gep 1830 %fneg.a = fneg float %a 1831 %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a) 1832 %fneg = fneg float %rcp 1833 store volatile float %fneg, float addrspace(1)* %out.gep 1834 store volatile float %fneg.a, float addrspace(1)* undef 1835 ret void 1836} 1837 1838; GCN-LABEL: {{^}}v_fneg_rcp_multi_use_fneg_f32: 1839; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1840; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]] 1841; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} 1842; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1843; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 1844define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { 1845 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1846 %tid.ext = sext i32 %tid to i64 1847 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1848 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1849 %a = load volatile float, float addrspace(1)* %a.gep 1850 
%fneg.a = fneg float %a 1851 %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a) 1852 %fneg = fneg float %rcp 1853 %use1 = fmul float %fneg.a, %c 1854 store volatile float %fneg, float addrspace(1)* %out.gep 1855 store volatile float %use1, float addrspace(1)* undef 1856 ret void 1857} 1858 1859; -------------------------------------------------------------------------------- 1860; fmul_legacy tests 1861; -------------------------------------------------------------------------------- 1862 1863; GCN-LABEL: {{^}}v_fneg_mul_legacy_f32: 1864; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1865; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1866; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]] 1867; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1868define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 1869 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1870 %tid.ext = sext i32 %tid to i64 1871 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1872 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1873 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1874 %a = load volatile float, float addrspace(1)* %a.gep 1875 %b = load volatile float, float addrspace(1)* %b.gep 1876 %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) 1877 %fneg = fneg float %mul 1878 store float %fneg, float addrspace(1)* %out.gep 1879 ret void 1880} 1881 1882; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32: 1883; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1884; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1885; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] 1886; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]] 1887; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]] 1888; GCN-NEXT: s_waitcnt vmcnt(0) 1889; GCN-NEXT: 
flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] 1890; GCN-NEXT: s_waitcnt vmcnt(0) 1891define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 1892 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1893 %tid.ext = sext i32 %tid to i64 1894 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1895 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1896 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1897 %a = load volatile float, float addrspace(1)* %a.gep 1898 %b = load volatile float, float addrspace(1)* %b.gep 1899 %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) 1900 %fneg = fneg float %mul 1901 store volatile float %fneg, float addrspace(1)* %out 1902 store volatile float %mul, float addrspace(1)* %out 1903 ret void 1904} 1905 1906; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32: 1907; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1908; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1909; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] 1910; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0 1911; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] 1912; GCN-NEXT: s_waitcnt vmcnt(0) 1913; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 1914; GCN-NEXT: s_waitcnt vmcnt(0) 1915define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 1916 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1917 %tid.ext = sext i32 %tid to i64 1918 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1919 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1920 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1921 %a = load volatile float, float addrspace(1)* 
%a.gep 1922 %b = load volatile float, float addrspace(1)* %b.gep 1923 %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) 1924 %fneg = fneg float %mul 1925 %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0) 1926 store volatile float %fneg, float addrspace(1)* %out 1927 store volatile float %use1, float addrspace(1)* %out 1928 ret void 1929} 1930 1931; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32: 1932; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1933; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1934; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] 1935; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] 1936define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 1937 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1938 %tid.ext = sext i32 %tid to i64 1939 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1940 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1941 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1942 %a = load volatile float, float addrspace(1)* %a.gep 1943 %b = load volatile float, float addrspace(1)* %b.gep 1944 %fneg.a = fneg float %a 1945 %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b) 1946 %fneg = fneg float %mul 1947 store volatile float %fneg, float addrspace(1)* %out 1948 ret void 1949} 1950 1951; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32: 1952; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1953; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1954; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] 1955; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] 1956define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 1957 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1958 %tid.ext = sext i32 %tid to i64 1959 %a.gep = 
getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1960 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1961 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1962 %a = load volatile float, float addrspace(1)* %a.gep 1963 %b = load volatile float, float addrspace(1)* %b.gep 1964 %fneg.b = fneg float %b 1965 %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b) 1966 %fneg = fneg float %mul 1967 store volatile float %fneg, float addrspace(1)* %out 1968 ret void 1969} 1970 1971; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_fneg_f32: 1972; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1973; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1974; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] 1975; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] 1976define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 1977 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1978 %tid.ext = sext i32 %tid to i64 1979 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 1980 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 1981 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 1982 %a = load volatile float, float addrspace(1)* %a.gep 1983 %b = load volatile float, float addrspace(1)* %b.gep 1984 %fneg.a = fneg float %a 1985 %fneg.b = fneg float %b 1986 %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b) 1987 %fneg = fneg float %mul 1988 store volatile float %fneg, float addrspace(1)* %out 1989 ret void 1990} 1991 1992; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_fneg_x_f32: 1993; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1994; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1995; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] 1996; GCN-DAG: v_mul_legacy_f32_e32 
[[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]] 1997; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]] 1998; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]] 1999define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 2000 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2001 %tid.ext = sext i32 %tid to i64 2002 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2003 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2004 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2005 %a = load volatile float, float addrspace(1)* %a.gep 2006 %b = load volatile float, float addrspace(1)* %b.gep 2007 %fneg.a = fneg float %a 2008 %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b) 2009 %fneg = fneg float %mul 2010 store volatile float %fneg, float addrspace(1)* %out 2011 store volatile float %fneg.a, float addrspace(1)* %out 2012 ret void 2013} 2014 2015; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32: 2016; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2017; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2018; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]] 2019; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} 2020; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]] 2021; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 2022define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { 2023 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2024 %tid.ext = sext i32 %tid to i64 2025 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2026 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2027 %out.gep = getelementptr inbounds float, float 
addrspace(1)* %out, i64 %tid.ext 2028 %a = load volatile float, float addrspace(1)* %a.gep 2029 %b = load volatile float, float addrspace(1)* %b.gep 2030 %fneg.a = fneg float %a 2031 %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b) 2032 %fneg = fneg float %mul 2033 %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c) 2034 store volatile float %fneg, float addrspace(1)* %out 2035 store volatile float %use1, float addrspace(1)* %out 2036 ret void 2037} 2038 2039; -------------------------------------------------------------------------------- 2040; sin tests 2041; -------------------------------------------------------------------------------- 2042 2043; GCN-LABEL: {{^}}v_fneg_sin_f32: 2044; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2045; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbe22f983, [[A]] 2046; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]] 2047; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]] 2048; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2049define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 2050 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2051 %tid.ext = sext i32 %tid to i64 2052 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2053 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2054 %a = load volatile float, float addrspace(1)* %a.gep 2055 %sin = call float @llvm.sin.f32(float %a) 2056 %fneg = fneg float %sin 2057 store float %fneg, float addrspace(1)* %out.gep 2058 ret void 2059} 2060 2061; GCN-LABEL: {{^}}v_fneg_amdgcn_sin_f32: 2062; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2063; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 2064; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2065define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 2066 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2067 %tid.ext = sext i32 %tid to i64 
2068 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2069 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2070 %a = load volatile float, float addrspace(1)* %a.gep 2071 %sin = call float @llvm.amdgcn.sin.f32(float %a) 2072 %fneg = fneg float %sin 2073 store float %fneg, float addrspace(1)* %out.gep 2074 ret void 2075} 2076 2077; -------------------------------------------------------------------------------- 2078; ftrunc tests 2079; -------------------------------------------------------------------------------- 2080 2081; GCN-LABEL: {{^}}v_fneg_trunc_f32: 2082; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2083; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 2084; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2085define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 2086 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2087 %tid.ext = sext i32 %tid to i64 2088 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2089 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2090 %a = load volatile float, float addrspace(1)* %a.gep 2091 %trunc = call float @llvm.trunc.f32(float %a) 2092 %fneg = fneg float %trunc 2093 store float %fneg, float addrspace(1)* %out.gep 2094 ret void 2095} 2096 2097; -------------------------------------------------------------------------------- 2098; fround tests 2099; -------------------------------------------------------------------------------- 2100 2101; GCN-LABEL: {{^}}v_fneg_round_f32: 2102; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2103; GCN: v_trunc_f32_e32 2104; GCN: v_sub_f32_e32 2105; GCN: v_cndmask_b32 2106 2107; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} 2108; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]] 2109 2110; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}} 2111; GCN: flat_store_dword 
v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2112define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 2113 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2114 %tid.ext = sext i32 %tid to i64 2115 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2116 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2117 %a = load volatile float, float addrspace(1)* %a.gep 2118 %round = call float @llvm.round.f32(float %a) 2119 %fneg = fneg float %round 2120 store float %fneg, float addrspace(1)* %out.gep 2121 ret void 2122} 2123 2124; -------------------------------------------------------------------------------- 2125; rint tests 2126; -------------------------------------------------------------------------------- 2127 2128; GCN-LABEL: {{^}}v_fneg_rint_f32: 2129; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2130; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 2131; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2132define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 2133 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2134 %tid.ext = sext i32 %tid to i64 2135 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2136 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2137 %a = load volatile float, float addrspace(1)* %a.gep 2138 %rint = call float @llvm.rint.f32(float %a) 2139 %fneg = fneg float %rint 2140 store float %fneg, float addrspace(1)* %out.gep 2141 ret void 2142} 2143 2144; -------------------------------------------------------------------------------- 2145; nearbyint tests 2146; -------------------------------------------------------------------------------- 2147 2148; GCN-LABEL: {{^}}v_fneg_nearbyint_f32: 2149; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2150; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 2151; GCN: flat_store_dword 
v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2152define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 2153 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2154 %tid.ext = sext i32 %tid to i64 2155 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2156 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2157 %a = load volatile float, float addrspace(1)* %a.gep 2158 %nearbyint = call float @llvm.nearbyint.f32(float %a) 2159 %fneg = fneg float %nearbyint 2160 store float %fneg, float addrspace(1)* %out.gep 2161 ret void 2162} 2163 2164; -------------------------------------------------------------------------------- 2165; fcanonicalize tests 2166; -------------------------------------------------------------------------------- 2167 2168; GCN-LABEL: {{^}}v_fneg_canonicalize_f32: 2169; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2170; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], -1.0, [[A]] 2171; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2172define amdgpu_kernel void @v_fneg_canonicalize_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { 2173 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2174 %tid.ext = sext i32 %tid to i64 2175 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2176 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2177 %a = load volatile float, float addrspace(1)* %a.gep 2178 %trunc = call float @llvm.canonicalize.f32(float %a) 2179 %fneg = fneg float %trunc 2180 store float %fneg, float addrspace(1)* %out.gep 2181 ret void 2182} 2183 2184; -------------------------------------------------------------------------------- 2185; vintrp tests 2186; -------------------------------------------------------------------------------- 2187 2188; GCN-LABEL: {{^}}v_fneg_interp_p1_f32: 2189; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2190; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 
2191; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] 2192; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]] 2193; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]] 2194define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 2195 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2196 %tid.ext = sext i32 %tid to i64 2197 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2198 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2199 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2200 %a = load volatile float, float addrspace(1)* %a.gep 2201 %b = load volatile float, float addrspace(1)* %b.gep 2202 %mul = fmul float %a, %b 2203 %fneg = fneg float %mul 2204 %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0) 2205 %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0) 2206 store volatile float %intrp0, float addrspace(1)* %out.gep 2207 store volatile float %intrp1, float addrspace(1)* %out.gep 2208 ret void 2209} 2210 2211; GCN-LABEL: {{^}}v_fneg_interp_p2_f32: 2212; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2213; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2214; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] 2215; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]] 2216; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]] 2217define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { 2218 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2219 %tid.ext = sext i32 %tid to i64 2220 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2221 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2222 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2223 %a = load volatile float, float 
addrspace(1)* %a.gep 2224 %b = load volatile float, float addrspace(1)* %b.gep 2225 %mul = fmul float %a, %b 2226 %fneg = fneg float %mul 2227 %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0) 2228 %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0) 2229 store volatile float %intrp0, float addrspace(1)* %out.gep 2230 store volatile float %intrp1, float addrspace(1)* %out.gep 2231 ret void 2232} 2233 2234; -------------------------------------------------------------------------------- 2235; CopyToReg tests 2236; -------------------------------------------------------------------------------- 2237 2238; GCN-LABEL: {{^}}v_fneg_copytoreg_f32: 2239; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2240; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2241; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2242; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]] 2243; GCN: s_cbranch_scc0 2244 2245; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]] 2246; GCN: s_endpgm 2247 2248; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]] 2249; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]] 2250; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 2251 2252define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { 2253 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2254 %tid.ext = sext i32 %tid to i64 2255 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2256 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2257 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 2258 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2259 %a = load volatile float, float addrspace(1)* %a.gep 2260 %b = load volatile float, float addrspace(1)* %b.gep 2261 %c = load volatile float, float 
addrspace(1)* %c.gep 2262 %mul = fmul float %a, %b 2263 %fneg = fneg float %mul 2264 %cmp0 = icmp eq i32 %d, 0 2265 br i1 %cmp0, label %if, label %endif 2266 2267if: 2268 %mul1 = fmul float %fneg, %c 2269 store volatile float %mul1, float addrspace(1)* %out.gep 2270 br label %endif 2271 2272endif: 2273 store volatile float %mul, float addrspace(1)* %out.gep 2274 ret void 2275} 2276 2277; -------------------------------------------------------------------------------- 2278; inlineasm tests 2279; -------------------------------------------------------------------------------- 2280 2281; Can't fold into use, so should fold into source 2282; GCN-LABEL: {{^}}v_fneg_inlineasm_f32: 2283; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2284; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2285; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] 2286; GCN: ; use [[MUL]] 2287; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 2288define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { 2289 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2290 %tid.ext = sext i32 %tid to i64 2291 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2292 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2293 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 2294 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2295 %a = load volatile float, float addrspace(1)* %a.gep 2296 %b = load volatile float, float addrspace(1)* %b.gep 2297 %c = load volatile float, float addrspace(1)* %c.gep 2298 %mul = fmul float %a, %b 2299 %fneg = fneg float %mul 2300 call void asm sideeffect "; use $0", "v"(float %fneg) #0 2301 store volatile float %fneg, float addrspace(1)* %out.gep 2302 ret void 2303} 2304 2305; -------------------------------------------------------------------------------- 
2306; inlineasm tests 2307; -------------------------------------------------------------------------------- 2308 2309; Can't fold into use, and the source mul has another use, so the fneg is not folded 2310; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32: 2311; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2312; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2313; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]] 2314; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]] 2315; GCN: ; use [[NEG]] 2316; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 2317define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { 2318 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2319 %tid.ext = sext i32 %tid to i64 2320 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2321 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2322 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 2323 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2324 %a = load volatile float, float addrspace(1)* %a.gep 2325 %b = load volatile float, float addrspace(1)* %b.gep 2326 %c = load volatile float, float addrspace(1)* %c.gep 2327 %mul = fmul float %a, %b 2328 %fneg = fneg float %mul 2329 call void asm sideeffect "; use $0", "v"(float %fneg) #0 2330 store volatile float %mul, float addrspace(1)* %out.gep 2331 ret void 2332} 2333 2334; -------------------------------------------------------------------------------- 2335; code size regression tests 2336; -------------------------------------------------------------------------------- 2337 2338; There are multiple users of the fneg that must use a VOP3 2339; instruction, so there is no penalty 2340; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32: 2341; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2342; GCN: {{buffer|flat}}_load_dword 
[[B:v[0-9]+]] 2343; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2344 2345; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]] 2346; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0 2347 2348; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]] 2349; GCN-NEXT: s_waitcnt vmcnt(0) 2350; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA1]] 2351; GCN-NEXT: s_waitcnt vmcnt(0) 2352define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 2353 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2354 %tid.ext = sext i32 %tid to i64 2355 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2356 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2357 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 2358 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2359 %a = load volatile float, float addrspace(1)* %a.gep 2360 %b = load volatile float, float addrspace(1)* %b.gep 2361 %c = load volatile float, float addrspace(1)* %c.gep 2362 2363 %fneg.a = fneg float %a 2364 %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) 2365 %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0) 2366 2367 store volatile float %fma0, float addrspace(1)* %out 2368 store volatile float %fma1, float addrspace(1)* %out 2369 ret void 2370} 2371 2372; There are multiple users, but both require using a larger encoding 2373; for the modifier. 
2374 2375; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32: 2376; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2377; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2378; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2379 2380; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]] 2381; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]] 2382; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]] 2383; GCN-NEXT: s_waitcnt vmcnt(0) 2384; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 2385; GCN-NEXT: s_waitcnt vmcnt(0) 2386define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 2387 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2388 %tid.ext = sext i32 %tid to i64 2389 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2390 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2391 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 2392 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2393 %a = load volatile float, float addrspace(1)* %a.gep 2394 %b = load volatile float, float addrspace(1)* %b.gep 2395 %c = load volatile float, float addrspace(1)* %c.gep 2396 2397 %fneg.a = fneg float %a 2398 %mul0 = fmul float %fneg.a, %b 2399 %mul1 = fmul float %fneg.a, %c 2400 2401 store volatile float %mul0, float addrspace(1)* %out 2402 store volatile float %mul1, float addrspace(1)* %out 2403 ret void 2404} 2405 2406; One user is VOP3 so has no cost to folding the modifier, the other does. 
2407; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32: 2408; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2409; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2410; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2411 2412; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0 2413; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]] 2414 2415; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]] 2416; GCN-NEXT: s_waitcnt vmcnt(0) 2417; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 2418; GCN-NEXT: s_waitcnt vmcnt(0) 2419define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { 2420 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2421 %tid.ext = sext i32 %tid to i64 2422 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2423 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2424 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 2425 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2426 %a = load volatile float, float addrspace(1)* %a.gep 2427 %b = load volatile float, float addrspace(1)* %b.gep 2428 %c = load volatile float, float addrspace(1)* %c.gep 2429 2430 %fneg.a = fneg float %a 2431 %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0) 2432 %mul1 = fmul float %fneg.a, %c 2433 2434 store volatile float %fma0, float addrspace(1)* %out 2435 store volatile float %mul1, float addrspace(1)* %out 2436 ret void 2437} 2438 2439; The use of the fneg requires a code size increase, but folding into 2440; the source does not 2441 2442; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32: 2443; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2444; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2445; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2446; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]] 2447 2448; GCN-SAFE: 
v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0 2449; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]] 2450; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]] 2451 2452; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0 2453; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]] 2454; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]] 2455 2456; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 2457; GCN-NEXT: s_waitcnt vmcnt(0) 2458; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL2]] 2459; GCN-NEXT: s_waitcnt vmcnt(0) 2460define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { 2461 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2462 %tid.ext = sext i32 %tid to i64 2463 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2464 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2465 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 2466 %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext 2467 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2468 %a = load volatile float, float addrspace(1)* %a.gep 2469 %b = load volatile float, float addrspace(1)* %b.gep 2470 %c = load volatile float, float addrspace(1)* %c.gep 2471 %d = load volatile float, float addrspace(1)* %d.gep 2472 2473 %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0) 2474 %fneg.fma0 = fneg float %fma0 2475 %mul1 = fmul float %fneg.fma0, %c 2476 %mul2 = fmul float %fneg.fma0, %d 2477 2478 store volatile float %mul1, float addrspace(1)* %out 2479 store volatile float %mul2, float addrspace(1)* %out 2480 ret void 2481} 2482 2483; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64: 2484; GCN: {{buffer|flat}}_load_dwordx2 
[[A:v\[[0-9]+:[0-9]+\]]] 2485; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] 2486; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]] 2487; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]] 2488 2489; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0 2490; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]] 2491; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]] 2492 2493; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]] 2494; GCN-NEXT: s_waitcnt vmcnt(0) 2495; GCN-NEXT: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 2496; GCN-NEXT: s_waitcnt vmcnt(0) 2497define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 { 2498 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2499 %tid.ext = sext i32 %tid to i64 2500 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext 2501 %b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext 2502 %c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext 2503 %d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext 2504 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext 2505 %a = load volatile double, double addrspace(1)* %a.gep 2506 %b = load volatile double, double addrspace(1)* %b.gep 2507 %c = load volatile double, double addrspace(1)* %c.gep 2508 %d = load volatile double, double addrspace(1)* %d.gep 2509 2510 %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0) 2511 %fneg.fma0 = fneg double %fma0 2512 %mul1 = fmul double %fneg.fma0, %c 2513 %mul2 = fmul double %fneg.fma0, %d 2514 2515 store volatile double %mul1, double addrspace(1)* %out 2516 store volatile double %mul2, double addrspace(1)* %out 2517 ret void 2518} 2519 2520; %trunc.a has one fneg 
use, but it requires a code size increase and 2521; the fneg can instead be folded for free into the fma. 2522 2523; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32: 2524; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2525; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2526; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2527; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]] 2528; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]] 2529; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]] 2530define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { 2531 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2532 %tid.ext = sext i32 %tid to i64 2533 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2534 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2535 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 2536 %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext 2537 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2538 %a = load volatile float, float addrspace(1)* %a.gep 2539 %b = load volatile float, float addrspace(1)* %b.gep 2540 %c = load volatile float, float addrspace(1)* %c.gep 2541 %d = load volatile float, float addrspace(1)* %d.gep 2542 2543 %trunc.a = call float @llvm.trunc.f32(float %a) 2544 %trunc.fneg.a = fneg float %trunc.a 2545 %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c) 2546 store volatile float %fma0, float addrspace(1)* %out 2547 ret void 2548} 2549 2550; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src: 2551; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2552; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2553; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2554; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]] 2555; GCN: 
v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]] 2556; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]] 2557; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]] 2558; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]] 2559; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 2560define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { 2561 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2562 %tid.ext = sext i32 %tid to i64 2563 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 2564 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext 2565 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext 2566 %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext 2567 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext 2568 %a = load volatile float, float addrspace(1)* %a.gep 2569 %b = load volatile float, float addrspace(1)* %b.gep 2570 %c = load volatile float, float addrspace(1)* %c.gep 2571 %d = load volatile float, float addrspace(1)* %d.gep 2572 2573 %trunc.a = call float @llvm.trunc.f32(float %a) 2574 %trunc.fneg.a = fneg float %trunc.a 2575 %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c) 2576 %mul1 = fmul float %trunc.a, %d 2577 store volatile float %fma0, float addrspace(1)* %out 2578 store volatile float %mul1, float addrspace(1)* %out 2579 ret void 2580} 2581 2582; The AMDGPU combine to pull fneg into the FMA operands was being 2583; undone by the generic combine to pull the fneg out of the fma if 2584; !isFNegFree. We were reporting false for v2f32 even though it will 2585; be split into f32 where it will be free. 
2586; GCN-LABEL: {{^}}fneg_fma_fneg_dagcombine_loop: 2587; GCN: s_brev_b32 [[NEGZERO:s[0-9]+]], 1{{$}} 2588; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], v2, -v4, [[NEGZERO]] 2589; GCN-DAG: v_fma_f32 [[FMA1:v[0-9]+]], v3, -v5, [[NEGZERO]] 2590; GCN-DAG: v_sub_f32_e32 [[SUB0:v[0-9]+]], [[FMA0]], v0 2591; GCN-DAG: v_sub_f32_e32 [[SUB1:v[0-9]+]], [[FMA1]], v1 2592; GCN-DAG: v_mul_f32_e32 v0, [[SUB0]], v4 2593; GCN-DAG: v_mul_f32_e32 v1, [[SUB1]], v5 2594; GCN: s_setpc_b64 2595define <2 x float> @fneg_fma_fneg_dagcombine_loop(<2 x float> %arg, <2 x float> %arg1, <2 x float> %arg2) #0 { 2596bb: 2597 %i3 = call fast <2 x float> @llvm.fma.v2f32(<2 x float> %arg1, <2 x float> %arg2, <2 x float> zeroinitializer) 2598 %i4 = fadd fast <2 x float> %i3, %arg 2599 %i5 = fneg <2 x float> %i4 2600 %i6 = fmul fast <2 x float> %i5, %arg2 2601 ret <2 x float> %i6 2602} 2603 2604; This expects denormal flushing, so can't turn this fmul into fneg 2605; TODO: Keeping this as fmul saves encoding size 2606; GCN-LABEL: {{^}}nnan_fmul_neg1_to_fneg: 2607; GCN: v_sub_f32_e32 [[TMP:v[0-9]+]], 0x80000000, v0 2608; GCN-NEXT: v_mul_f32_e32 v0, [[TMP]], v1 2609define float @nnan_fmul_neg1_to_fneg(float %x, float %y) #0 { 2610 %mul = fmul float %x, -1.0 2611 %add = fmul nnan float %mul, %y 2612 ret float %add 2613} 2614 2615; It's legal to turn this fmul into an fneg since denormals are 2616; preserved and we know an snan can't happen from the flag. 
; GCN-LABEL: {{^}}denormal_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e64 v0, -v0, v1
; GCN-NEXT: s_setpc_b64
define float @denormal_fmul_neg1_to_fneg(float %x, float %y) {
  ; No attribute group on this function; the multiply by -1.0 carries nnan.
  %x.neg = fmul nnan float %x, -1.0
  %prod = fmul float %x.neg, %y
  ret float %prod
}

; The value multiplied by -1.0 is itself the result of an fmul (%x * %x),
; so we know the source can't be an snan.
; GCN-LABEL: {{^}}denorm_snan_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e64 [[TMP:v[0-9]+]], v0, -v0
; GCN: v_mul_f32_e32 v0, [[TMP]], v1
; GCN-NEXT: s_setpc_b64
define float @denorm_snan_fmul_neg1_to_fneg(float %x, float %y) {
  %x.sq = fmul float %x, %x
  %x.sq.neg = fmul float %x.sq, -1.0
  %prod = fmul float %x.sq.neg, %y
  ret float %prod
}

; Same pattern, with the source passed through llvm.canonicalize in a
; function that uses attribute group #0 (f32 denormal mode preserve-sign).
; The expected output keeps a separate subtract from 0x80000000 rather than
; a negated source modifier on the final multiply.
; GCN-LABEL: {{^}}flush_snan_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], 1.0, v0
; GCN: v_sub_f32_e32 [[TMP1:v[0-9]+]], 0x80000000, [[TMP0]]
; GCN-NEXT: v_mul_f32_e32 v0, [[TMP1]], v1
define float @flush_snan_fmul_neg1_to_fneg(float %x, float %y) #0 {
  %x.quiet = call float @llvm.canonicalize.f32(float %x)
  %x.quiet.neg = fmul float %x.quiet, -1.0
  %prod = fmul float %x.quiet.neg, %y
  ret float %prod
}

; Generic LLVM intrinsics referenced by the tests in this file.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare float @llvm.fmuladd.f32(float, float, float) #1
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
declare float @llvm.sin.f32(float) #1
declare float @llvm.trunc.f32(float) #1
declare float @llvm.round.f32(float) #1
declare float @llvm.rint.f32(float) #1
declare float @llvm.nearbyint.f32(float) #1
declare float @llvm.canonicalize.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1
declare half @llvm.minnum.f16(half, half) #1
declare double @llvm.minnum.f64(double, double) #1
declare double @llvm.fma.f64(double, double, double) #1

; AMDGPU target intrinsics.
declare float @llvm.amdgcn.sin.f32(float) #1
declare float @llvm.amdgcn.rcp.f32(float) #1
declare float @llvm.amdgcn.rcp.legacy(float) #1
declare float @llvm.amdgcn.fmul.legacy(float, float) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0

; Attribute groups. #0 sets the f32 denormal mode to preserve-sign; #1 marks
; the side-effect-free intrinsic declarations.
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }