; RUN: llc -march=amdgcn -mcpu=hawaii -start-after=sink -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,SI %s
; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,SI %s

; RUN: llc -march=amdgcn -mcpu=fiji -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,VI %s
; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,VI %s
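
; The GCN-SAFE prefixes check the default codegen, where the sign of zero must
; be preserved and an fneg therefore usually survives as a separate v_xor_b32
; of the sign bit. The GCN-NSZ prefixes check codegen with
; -enable-no-signed-zeros-fp-math, where the negation may instead be folded
; into the source modifiers of the defining instruction.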

; --------------------------------------------------------------------------------
; fadd tests
; --------------------------------------------------------------------------------
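
; fneg(fadd(a, b)) can only be rewritten to fsub(-a, b) when signed zeros can
; be ignored: with a = 0.0 and b = -0.0, -(a + b) is -0.0 while (-a) - b
; is +0.0.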

; GCN-LABEL: {{^}}v_fneg_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %add, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  %use1 = fmul float %add, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32
; GCN-SAFE: v_xor_b32_e32 [[ADD:v[0-9]+]], 0x80000000,

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fneg float %b
  %add = fadd float %a, %fneg.b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %add = fadd float %fneg.a, %fneg.b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
; GCN-SAFE-DAG: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1{{$}}
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[SIGNBIT]], [[A]]
; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[SIGNBIT]], [[ADD]]

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; This one asserted with -enable-no-signed-zeros-fp-math
; GCN-LABEL: {{^}}fneg_fadd_0:
; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]],
; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]]
; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]]
define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 {
.entry:
  %tmp7 = fdiv float 1.000000e+00, %tmp6
  %tmp8 = fmul float 0.000000e+00, %tmp7
  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
  %.i188 = fadd float %tmp9, 0.000000e+00
  %tmp10 = fcmp uge float %.i188, %tmp2
  %tmp11 = fneg float %.i188
  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
  %tmp12 = fcmp ule float %.i092, 0.000000e+00
  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
  ret float %.i198
}

; This is a workaround because -enable-no-signed-zeros-fp-math does not set up
; function attribute unsafe-fp-math automatically. Combine with the previous test
; when that is done.
; GCN-LABEL: {{^}}fneg_fadd_0_nsz:
; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]],
; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]],
; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]],
; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]],
; GCN-NSZ-DAG: v_cmp_nlt_f32_e64 {{.*}}, -[[D]]
define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 {
.entry:
  %tmp7 = fdiv afn float 1.000000e+00, %tmp6
  %tmp8 = fmul float 0.000000e+00, %tmp7
  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
  %.i188 = fadd float %tmp9, 0.000000e+00
  %tmp10 = fcmp uge float %.i188, %tmp2
  %tmp11 = fneg float %.i188
  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
  %tmp12 = fcmp ule float %.i092, 0.000000e+00
  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
  ret float %.i198
}

; --------------------------------------------------------------------------------
; fmul tests
; --------------------------------------------------------------------------------
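
; A negation always folds through a multiply, with or without nsz, because
; the sign of an IEEE product is just the XOR of the operand signs:
; -(a * b) == a * (-b) for all inputs (NaN payloads aside).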

; GCN-LABEL: {{^}}v_fneg_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[MUL]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %mul, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %use1 = fmul float %mul, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fneg float %b
  %mul = fmul float %a, %fneg.b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %mul = fmul float %fneg.a, %fneg.b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fminnum tests
; --------------------------------------------------------------------------------
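
; fneg(minnum(a, b)) folds to maxnum(-a, -b). In the *_ieee variants the
; kernel executes with IEEE mode enabled, where min/max require quieted
; (canonicalized) inputs, hence the multiplies by -1.0 or 1.0 before the
; folded v_max; the amdgpu_ps *_no_ieee variants can use the bare min/max
; with source modifiers.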

; GCN-LABEL: {{^}}v_fneg_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float %a, float %a)
  %min.fneg = fneg float %min
  store float %min.fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, -v0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_self_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %a)
  %min.fneg = fneg float %min
  ret float %min.fneg
}

; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 4.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, -4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_posk_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float 4.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float -4.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, 4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float -4.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float -0.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}
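
; 1/(2*pi) is an inline constant on VI but not on SI, so the profitability of
; the combine differs: SI negates the constant and flips to a v_max, while VI
; keeps the v_min on the inline constant and negates the result afterwards.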
; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]

; SI-DAG: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]

; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MIN]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]

; SI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0x3e22f983, [[NEG_QUIET]]

; VI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 0xBFC45F3060000000, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f16:
; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0xbe230000, [[CVT]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]

; VI: v_max_f16_e32 [[QUIET:v[0-9]+]], [[A]], [[A]]
; VI: v_min_f16_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x8000, [[MIN]]

; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %min = call half @llvm.minnum.f16(half 0xH3118, half %a)
  %fneg = fsub half -0.000000e+00, %min
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f16:
; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0x3e230000, [[CVT]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]

; VI: v_max_f16_e64 [[NEG_QUIET:v[0-9]+]], -[[A]], -[[A]]
; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]

; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %min = call half @llvm.minnum.f16(half 0xHB118, half %a)
  %fneg = fsub half -0.000000e+00, %min
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]

; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfc45f30
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; SI: v_max_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}

; VI: v_min_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[A]], 0.15915494
; VI: v_xor_b32_e32 v[[RESULT_HI]], 0x80000000, v[[RESULT_HI]]

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %min = call double @llvm.minnum.f64(double 0x3fc45f306dc9c882, double %a)
  %fneg = fsub double -0.000000e+00, %min
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]

; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fc45f30
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}

; VI: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], 0.15915494

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %min = call double @llvm.minnum.f64(double 0xbfc45f306dc9c882, double %a)
  %fneg = fsub double -0.000000e+00, %min
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, 0{{$}}
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_neg0_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float -0.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]

; SI: v_max_f32_e32 [[MIN:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[B]]

; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, v0
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  ret float %mul
}

; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  %use1 = fmul float %min, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
; GCN-NEXT: ; return
define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  %use1 = fmul float %min, 4.0
  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
  ret <2 x float> %ins1
}

; --------------------------------------------------------------------------------
; fmaxnum tests
; --------------------------------------------------------------------------------
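
; fneg(maxnum(a, b)) is the dual case: it folds to minnum(-a, -b), with the
; same IEEE-mode quieting of the inputs as in the fminnum tests above.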

; GCN-LABEL: {{^}}v_fneg_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_min_f32_e64 v0, -v0, -v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 {
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float %a, float %a)
  %max.fneg = fneg float %max
  store float %max.fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, -v0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_self_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float %a, float %a)
  %max.fneg = fneg float %max
  ret float %max.fneg
}

; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, -4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, 4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, 0{{$}}
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fneg float %max
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, v0
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fneg float %max
  %mul = fmul float %fneg, %b
  ret float %mul
}

; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_min_f32_e32 [[MIN0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MIN0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MIN0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  %use1 = fmul float %max, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_min_f32_e64 v0, -v0, -v1
; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
; GCN-NEXT: ; return
define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 {
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  %use1 = fmul float %max, 4.0
  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
  ret <2 x float> %ins1
}

; --------------------------------------------------------------------------------
; fma tests
; --------------------------------------------------------------------------------
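
; fneg(fma(a, b, c)) only folds to fma(a, -b, -c) when signed zeros can be
; ignored; the safe lowering keeps the fma intact and negates its result with
; a v_xor_b32.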

; GCN-LABEL: {{^}}v_fneg_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[RESULT]]

; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_store_use_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fma, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]

; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  %use1 = fmul float %fma, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_x_fneg_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.b = fneg float %b
  %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_fneg_fneg_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], -[[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fneg.c = fneg float %c
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_x_y_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.c = fneg float %c
  %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_store_use_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_xor_b32
; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]],
; GCN-SAFE: v_xor_b32

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]

; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ-NOT: [[NEG_A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; GCN-NSZ-NOT: [[NEG_A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_fma_f32 [[FMA:v[0-9]+]]
; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fneg float %fma
  %use1 = fmul float %fneg.a, %d
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fmad tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]]

; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}
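
; A worked signed-zero case for the fold checked above (a sketch, not part of
; the checks): with a = 1.0, b = -1.0, c = 1.0, fmad(a, b, c) = +0.0, so
; fneg(fmad(a, b, c)) = -0.0, while the folded fmad(a, -b, -c) = 1.0 + -1.0 =
; +0.0. The two results differ only in the sign of zero, which is why the
; combined form is only checked under -enable-no-signed-zeros-fp-math.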

; GCN-LABEL: {{^}}v_fneg_fmad_v4f32:

; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @v_fneg_fmad_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %a.ptr, <4 x float> addrspace(1)* %b.ptr, <4 x float> addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile <4 x float>, <4 x float> addrspace(1)* %a.gep
  %b = load volatile <4 x float>, <4 x float> addrspace(1)* %b.gep
  %c = load volatile <4 x float>, <4 x float> addrspace(1)* %c.gep
  %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
  %fneg = fneg <4 x float> %fma
  store <4 x float> %fneg, <4 x float> addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]

; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  %use1 = fmul float %fma, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fp_extend tests
; --------------------------------------------------------------------------------
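
; The f32->f64 conversion accepts source modifiers, so a negation of the
; result folds into the convert itself (v_cvt_f64_f32_e64 with a negated
; source), and a negation on both sides cancels back to the plain _e32 form,
; as the tests below check.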

; GCN-LABEL: {{^}}v_fneg_fp_extend_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN: {{buffer|flat}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpext = fpext float %fneg.a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_extend_store_use_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FNEG_A]]
define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpext = fpext float %fneg.a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]]
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile double %fpext, double addrspace(1)* undef
  ret void
}
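
; When the extended value has other users, only the f64 sign bit needs to be
; flipped for the negated copy, so a single v_xor_b32 of 0x80000000 on the
; high dword of the conversion result is expected instead of a negated
; convert.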

; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]]
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  %mul = fmul double %fpext, 4.0
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile double %mul, double addrspace(1)* %out.gep
  ret void
}

; FIXME: Source modifiers not folded for f16->f32
; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %fpext = fpext half %a to float
  %fneg = fneg float %fpext
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fpext, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %fpext = fpext half %a to float
  %fneg = fneg float %fpext
  %mul = fmul float %fpext, 4.0
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}
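
; If the f16->f32 source-modifier fold noted in the FIXME above were
; implemented, the negation would be expected to become an input modifier on
; the conversion rather than a separate v_xor_b32 on the result, along the
; lines of (illustrative operands only):
;   v_cvt_f32_f16_e64 v0, -v1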

; --------------------------------------------------------------------------------
; fp_round tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_fp_round_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fpround = fptrunc double %a to float
  %fneg = fneg float %fpround
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v{{\[}}[[A_LO]]:[[A_HI]]{{\]}}
; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[A_LO]]:[[NEG_A_HI]]{{\]}}
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile double %fneg.a, double addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[USE1]]
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  %use1 = fmul double %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile double %use1, double addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpround = fptrunc float %a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_use_fp_round_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[CVT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[CVT]]
define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fpround = fptrunc double %a to float
  %fneg = fneg float %fpround
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fpround, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store volatile half %fneg, half addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[USE1]]
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  %use1 = fmul float %fneg.a, %c
  store volatile half %fneg, half addrspace(1)* %out.gep
  store volatile float %use1, float addrspace(1)* undef
  ret void
}

; --------------------------------------------------------------------------------
; rcp tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_rcp_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %rcp = call float @llvm.amdgcn.rcp.f32(float %a)
  %fneg = fneg float %rcp
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}
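
; Reciprocal is sign-symmetric (-1/x == 1/(-x)), so a negation of the result
; can be taken as an input modifier on v_rcp_f32, and an input negation plus
; a result negation cancel entirely, as the two tests above check.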

; GCN-LABEL: {{^}}v_fneg_rcp_store_use_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_multi_use_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %use1, float addrspace(1)* undef
  ret void
}

; --------------------------------------------------------------------------------
; fmul_legacy tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %mul, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0)
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fneg float %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c)
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; sin tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_sin_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbe22f983, [[A]]
; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %sin = call float @llvm.sin.f32(float %a)
  %fneg = fneg float %sin
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_amdgcn_sin_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %sin = call float @llvm.amdgcn.sin.f32(float %a)
  %fneg = fneg float %sin
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}
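
; For llvm.sin the negation folds into the input prescale multiply: 1/(2*pi)
; (0x3e22f983) becomes -1/(2*pi) (0xbe22f983), so no separate negate is
; needed. For llvm.amdgcn.sin, which takes the already-scaled input, the
; negation folds directly as a source modifier on v_sin_f32.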

; --------------------------------------------------------------------------------
; ftrunc tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_trunc_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %trunc = call float @llvm.trunc.f32(float %a)
  %fneg = fneg float %trunc
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fround tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_round_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_trunc_f32_e32
; GCN: v_sub_f32_e32
; GCN: v_cndmask_b32

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %round = call float @llvm.round.f32(float %a)
  %fneg = fneg float %round
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; rint tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_rint_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %rint = call float @llvm.rint.f32(float %a)
  %fneg = fneg float %rint
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; nearbyint tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_nearbyint_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %nearbyint = call float @llvm.nearbyint.f32(float %a)
  %fneg = fneg float %nearbyint
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fcanonicalize tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_canonicalize_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], -1.0, [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_canonicalize_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %trunc = call float @llvm.canonicalize.f32(float %a)
  %fneg = fneg float %trunc
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; vintrp tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_interp_p1_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0)
  %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0)
  store volatile float %intrp0, float addrspace(1)* %out.gep
  store volatile float %intrp1, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_interp_p2_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0)
  %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0)
  store volatile float %intrp0, float addrspace(1)* %out.gep
  store volatile float %intrp1, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; CopyToReg tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_copytoreg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
; GCN: s_cbranch_scc0

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN: s_endpgm

; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]

define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %cmp0 = icmp eq i32 %d, 0
  br i1 %cmp0, label %if, label %endif

if:
  %mul1 = fmul float %fneg, %c
  store volatile float %mul1, float addrspace(1)* %out.gep
  br label %endif

endif:
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}
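
; In the CopyToReg test above, the negated value crosses a basic-block
; boundary through a register copy, so the fneg cannot fold into the defining
; multiply (which also has a non-negated user in the other block); instead a
; v_xor_b32 of the sign bit materializes it on the path that consumes it.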

; --------------------------------------------------------------------------------
; inlineasm tests
; --------------------------------------------------------------------------------

; Can't fold into use, so should fold into source
; GCN-LABEL: {{^}}v_fneg_inlineasm_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: ; use [[MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  call void asm sideeffect "; use $0", "v"(float %fneg) #0
  store volatile float %fneg, float addrspace(1)* %out.gep
  ret void
}

; Can't fold into use, so should fold into source
; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
; GCN: ; use [[NEG]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  call void asm sideeffect "; use $0", "v"(float %fneg) #0
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; code size regression tests
; --------------------------------------------------------------------------------

; There are multiple users of the fneg that must use a VOP3
; instruction, so there is no penalty
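; A rough encoding-size sketch (illustrative registers): VOP3 instructions
; like v_fma_f32 carry neg/abs source-modifier bits, so taking the negation
; there is free, while a VOP2 form like v_mul_f32_e32 (4 bytes) has no
; modifier fields and must be promoted to the 8-byte VOP3 form to negate an
; input:
;   v_fma_f32 v0, -v1, v2, v3    (VOP3, 8 bytes, modifier is free)
;   v_mul_f32_e32 v0, v1, v2     (VOP2, 4 bytes, no source modifiers)
;   v_mul_f32_e64 v0, -v1, v2    (VOP3, 8 bytes, +4 bytes over VOP2)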

; There are multiple users of the fneg, and both must use a VOP3
; instruction anyway, so folding the modifier carries no penalty.
; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]]
; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fneg float %a
  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0)

  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %fma1, float addrspace(1)* %out
  ret void
}

; There are multiple users, but both require using a larger encoding
; for the modifier.

; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fneg float %a
  %mul0 = fmul float %fneg.a, %b
  %mul1 = fmul float %fneg.a, %c

  store volatile float %mul0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; One user is VOP3, so folding the modifier there has no cost; the other
; user must pay for the larger encoding.
; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fneg float %a
  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0)
  %mul1 = fmul float %fneg.a, %c

  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; The use of the fneg requires a code size increase, but folding into
; the source does not.
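
; In the nsz run the negation moves into the fma instead:
; fneg(fma(a, b, 2.0)) becomes fma(a, -b, -2.0) using the fma's free VOP3
; modifiers, so the two downstream multiplies keep the short VOP2
; encoding. The safe run keeps the fma intact and pays for two _e64
; multiplies with a negated operand.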

; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]]
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]

; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]]
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL2]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0)
  %fneg.fma0 = fneg float %fma0
  %mul1 = fmul float %fneg.fma0, %c
  %mul2 = fmul float %fneg.fma0, %d

  store volatile float %mul1, float addrspace(1)* %out
  store volatile float %mul2, float addrspace(1)* %out
  ret void
}
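
; The f64 variant looks the same in both runs: v_mul_f64 is VOP3-only, so
; negating its source operand is always free and no _e64 promotion is
; involved.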

; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]

; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]]
; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %b = load volatile double, double addrspace(1)* %b.gep
  %c = load volatile double, double addrspace(1)* %c.gep
  %d = load volatile double, double addrspace(1)* %d.gep

  %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0)
  %fneg.fma0 = fsub double -0.0, %fma0
  %mul1 = fmul double %fneg.fma0, %c
  %mul2 = fmul double %fneg.fma0, %d

  store volatile double %mul1, double addrspace(1)* %out
  store volatile double %mul2, double addrspace(1)* %out
  ret void
}

; %trunc.a has one fneg use, but folding the fneg there would require a
; code size increase; it can instead be folded for free into the fma.

; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %trunc.a = call float @llvm.trunc.f32(float %a)
  %trunc.fneg.a = fneg float %trunc.a
  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  store volatile float %fma0, float addrspace(1)* %out
  ret void
}
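
; When the truncated value also has a non-negated user, the fneg again
; folds only where it is free: the fma takes -[[TRUNC_A]] through its
; source modifier while the multiply uses the unmodified value in the
; short VOP2 encoding, so no v_xor_b32 is needed.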
; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %trunc.a = call float @llvm.trunc.f32(float %a)
  %trunc.fneg.a = fneg float %trunc.a
  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  %mul1 = fmul float %trunc.a, %d
  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; The AMDGPU combine to pull fneg into the FMA operands was being
; undone by the generic combine to pull the fneg out of the fma if
; !isFNegFree. We were reporting false for v2f32 even though it will
; be split into f32 where it will be free.
; GCN-LABEL: {{^}}fneg_fma_fneg_dagcombine_loop:
; GCN: s_brev_b32 [[NEGZERO:s[0-9]+]], 1{{$}}
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], v2, -v4, [[NEGZERO]]
; GCN-DAG: v_fma_f32 [[FMA1:v[0-9]+]], v3, -v5, [[NEGZERO]]
; GCN-DAG: v_sub_f32_e32 [[SUB0:v[0-9]+]], [[FMA0]], v0
; GCN-DAG: v_sub_f32_e32 [[SUB1:v[0-9]+]], [[FMA1]], v1
; GCN-DAG: v_mul_f32_e32 v0, [[SUB0]], v4
; GCN-DAG: v_mul_f32_e32 v1, [[SUB1]], v5
; GCN: s_setpc_b64
define <2 x float> @fneg_fma_fneg_dagcombine_loop(<2 x float> %arg, <2 x float> %arg1, <2 x float> %arg2) #0 {
bb:
  %i3 = call fast <2 x float> @llvm.fma.v2f32(<2 x float> %arg1, <2 x float> %arg2, <2 x float> zeroinitializer)
  %i4 = fadd fast <2 x float> %i3, %arg
  %i5 = fneg <2 x float> %i4
  %i6 = fmul fast <2 x float> %i5, %arg2
  ret <2 x float> %i6
}
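
; The fmul-by-negative-one tests below are sensitive to the f32 denormal
; mode: with "preserve-sign" flushing (attribute #0), fmul x, -1.0 flushes
; denormal inputs to zero while a plain fneg only flips the sign bit, and
; an fmul also quiets an snan while an fneg does not. The fold is therefore
; only legal when denormals are preserved and the operand is known not to
; be an snan.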

; This function expects denormal flushing, so the fmul can't be turned
; into an fneg.
; TODO: Keeping this as an fmul with the -1.0 inline constant would save
; encoding size over the subtract from the 0x80000000 literal.
; GCN-LABEL: {{^}}nnan_fmul_neg1_to_fneg:
; GCN: v_sub_f32_e32 [[TMP:v[0-9]+]], 0x80000000, v0
; GCN-NEXT: v_mul_f32_e32 v0, [[TMP]], v1
define float @nnan_fmul_neg1_to_fneg(float %x, float %y) #0 {
  %mul = fmul float %x, -1.0
  %res = fmul nnan float %mul, %y
  ret float %res
}

; It's legal to turn this fmul into an fneg since denormals are
; preserved and we know an snan can't happen from the flag.
; GCN-LABEL: {{^}}denormal_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e64 v0, -v0, v1
; GCN-NEXT: s_setpc_b64
define float @denormal_fmul_neg1_to_fneg(float %x, float %y) {
  %mul = fmul nnan float %x, -1.0
  %res = fmul float %mul, %y
  ret float %res
}

; The multiply of %x with itself quiets any snan, so we know the source
; of the fmul by -1.0 can't be an snan and the fneg can fold into it.
; GCN-LABEL: {{^}}denorm_snan_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e64 [[TMP:v[0-9]+]], v0, -v0
; GCN: v_mul_f32_e32 v0, [[TMP]], v1
; GCN-NEXT: s_setpc_b64
define float @denorm_snan_fmul_neg1_to_fneg(float %x, float %y) {
  %canonical = fmul float %x, %x
  %mul = fmul float %canonical, -1.0
  %res = fmul float %mul, %y
  ret float %res
}
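
; Here the canonicalize quiets any snan, but denormal flushing is still
; in effect, so the negation can't become a plain fneg and is emitted as
; a subtract from -0.0.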
; GCN-LABEL: {{^}}flush_snan_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], 1.0, v0
; GCN: v_sub_f32_e32 [[TMP1:v[0-9]+]], 0x80000000, [[TMP0]]
; GCN-NEXT: v_mul_f32_e32 v0, [[TMP1]], v1
define float @flush_snan_fmul_neg1_to_fneg(float %x, float %y) #0 {
  %quiet = call float @llvm.canonicalize.f32(float %x)
  %mul = fmul float %quiet, -1.0
  %res = fmul float %mul, %y
  ret float %res
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare float @llvm.fmuladd.f32(float, float, float) #1
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
declare float @llvm.sin.f32(float) #1
declare float @llvm.trunc.f32(float) #1
declare float @llvm.round.f32(float) #1
declare float @llvm.rint.f32(float) #1
declare float @llvm.nearbyint.f32(float) #1
declare float @llvm.canonicalize.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1
declare half @llvm.minnum.f16(half, half) #1
declare double @llvm.minnum.f64(double, double) #1
declare double @llvm.fma.f64(double, double, double) #1

declare float @llvm.amdgcn.sin.f32(float) #1
declare float @llvm.amdgcn.rcp.f32(float) #1
declare float @llvm.amdgcn.rcp.legacy(float) #1
declare float @llvm.amdgcn.fmul.legacy(float, float) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0

attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }