; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GFX678 %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX678 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s

; Each function below bounds a value to the range [0.0, 1.0] and is paired
; with the instruction pattern FileCheck expects llc to emit for it.

; Plain case: maxnum(%a, 0.0) followed by minnum(.., 1.0) on a loaded float.
; The checks expect a single v_max_f32_e64 of the loaded value with itself,
; carrying the clamp modifier.
; GCN-LABEL: {{^}}v_clamp_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Same sequence applied to fneg of the loaded value. The checks expect the
; negation to appear as a source modifier on both v_max operands.
; GCN-LABEL: {{^}}v_clamp_neg_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %fneg.a = fneg float %a
  %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Same sequence applied to fneg(fabs(%a)). The checks expect both the abs and
; the neg to appear as source modifiers on the v_max operands.
; GCN-LABEL: {{^}}v_clamp_negabs_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %fabs.a = call float @llvm.fabs.f32(float %a)
  %fneg.fabs.a = fneg float %fabs.a

  %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; The lower bound is -0.0 instead of 0.0, applied to the result of an
; fadd nnan. The checks expect separate v_max (with the 0x80000000 literal)
; and v_min instructions rather than a clamp modifier.
; GCN-LABEL: {{^}}v_clamp_negzero_f32:
; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[ADD]]
; GCN: v_min_f32_e32 v{{[0-9]+}}, 1.0, [[MAX]]
define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd nnan float %a, 0.5
  %max = call float @llvm.maxnum.f32(float %add, float -0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; FIXME: Weird inconsistency in how -0.0 is treated. Accepted if clamp
; matched through med3, not if directly. Is this correct?

; Lower bound of -0.0 applied directly to a loaded value. The checks expect an
; extra instruction on the input (captured as QUIET: v_mul by 1.0, or v_max of
; the value with itself on gfx900) followed by separate v_max and v_min.
; GCN-LABEL: {{^}}v_clamp_negzero_maybe_snan_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX678: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
; GFX9: v_max_f32_e32 [[QUIET:v[0-9]+]], [[A]], [[A]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[QUIET]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float -0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; %max has a second use (the volatile store), so the checks expect separate
; v_max and v_min instructions and two stores. The v_min must read the v_max
; result ([[MAX]]), mirroring minnum(%max, 1.0) in the IR; it does not read the
; quieted input directly.
; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX678: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GFX9: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[A]], [[A]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_min_f32_e32 [[MED:v[0-9]+]], 1.0, [[MAX]]
; GCN-NOT: [[MAX]]
; GCN-NOT: [[MED]]

; SI: buffer_store_dword [[MED]]
; SI: buffer_store_dword [[MAX]]

; GFX89: {{flat|global}}_store_dword v{{.+}}, [[MED]]
; GFX89: {{flat|global}}_store_dword v{{.+}}, [[MAX]]
define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  store volatile float %max, float addrspace(1)* undef
  ret void
}

; Half-precision version of the plain case. With the GFX89 prefix the checks
; expect v_max_f16_e64 with clamp; with the SI prefix they expect the clamp on
; the f16-to-f32 conversion, followed by a conversion back to f16.
; GCN-LABEL: {{^}}v_clamp_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
; GFX89: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}}
; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %max = call half @llvm.maxnum.f16(half %a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; Half-precision clamp of a negated input (negation written as fsub -0.0, %a).
; The checks expect the neg as a source modifier.
; GCN-LABEL: {{^}}v_clamp_neg_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
; GFX89: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}

; FIXME: Better to fold neg into max
; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}}
; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %fneg.a = fsub half -0.0, %a
  %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; Half-precision clamp of -|%a|. The checks expect neg and abs as source
; modifiers.
; GCN-LABEL: {{^}}v_clamp_negabs_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
; GFX89: v_max_f16_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}

; FIXME: Better to fold neg/abs into max

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}}
; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %fabs.a = call half @llvm.fabs.f16(half %a)
  %fneg.fabs.a = fsub half -0.0, %fabs.a

  %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; Double-precision version of the plain case; the checks expect v_max_f64 with
; the clamp modifier.
; FIXME: Do f64 instructions support clamp?
; GCN-LABEL: {{^}}v_clamp_f64:
; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %max = call double @llvm.maxnum.f64(double %a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

; Double-precision clamp of a negated input (fsub -0.0, %a).
; GCN-LABEL: {{^}}v_clamp_neg_f64:
; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %fneg.a = fsub double -0.0, %a
  %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

; Double-precision clamp of -|%a|.
; GCN-LABEL: {{^}}v_clamp_negabs_f64:
; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -|[[A]]|, -|[[A]]| clamp{{$}}
define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %fabs.a = call double @llvm.fabs.f64(double %a)
  %fneg.fabs.a = fsub double -0.0, %fabs.a

  %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

; fmed3 with -0.0 as the lower constant; the checks expect a v_med3_f32 to
; remain.
; GCN-LABEL: {{^}}v_clamp_med3_aby_negzero_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32
define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; The next six functions pass 0.0, 1.0 and the loaded value to fmed3 in each of
; the six operand orders. Every one expects the same v_max_f32_e64 with clamp.

; Operand order (0.0, 1.0, %a).
; GCN-LABEL: {{^}}v_clamp_med3_aby_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Operand order (1.0, 0.0, %a).
; GCN-LABEL: {{^}}v_clamp_med3_bay_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Operand order (%a, 0.0, 1.0).
; GCN-LABEL: {{^}}v_clamp_med3_yab_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Operand order (%a, 1.0, 0.0).
; GCN-LABEL: {{^}}v_clamp_med3_yba_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Operand order (0.0, %a, 1.0).
; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Operand order (1.0, %a, 0.0).
; GCN-LABEL: {{^}}v_clamp_med3_bya_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; The following functions call fmed3 on three constants; the checks expect the
; result to be materialized with a single v_mov_b32.

; Third operand 4.0 is above the range; expected result 1.0.
; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0
define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Third operand -4.0 is below the range; expected result 0.
; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Third operand 0.5 is inside the range; expected result 0.5.
; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5
define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Third operand has bit pattern 8388607 (0x7fffff); the checks expect that bit
; pattern to be kept unchanged.
; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}}
define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Third operand is a quiet NaN constant; expected result 0.
; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Third operand has bit pattern 2139095041 (0x7f800001, a signaling NaN);
; expected result 0.
; GCN-LABEL: {{^}}v_clamp_constant_snan_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; ---------------------------------------------------------------------
; Test non-default behaviors enabling snans and disabling dx10_clamp
; ---------------------------------------------------------------------

; NOTE(review): the functions in this section use attribute groups #2, #3 and
; #4, which are defined outside the visible part of the file. Judging by the
; function names they select the snan / dx10_clamp combinations; confirm
; against the attribute definitions.

; min/max sequence on an fadd nnan result under attribute group #2; the checks
; expect v_med3_f32 instead of a clamp modifier.
; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %a.nnan = fadd nnan float %a, 0.5
  %max = call float @llvm.maxnum.f32(float %a.nnan, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; min/max sequence on a plain fadd result under attribute group #3; the checks
; expect the clamp modifier directly on the v_add.
; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 0.5 clamp{{$}}
define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd float %a, 0.5
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; min/max sequence directly on a loaded value under attribute group #4; the
; checks expect an extra instruction on the input (captured as QUIET_A) feeding
; v_med3_f32.
; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX678: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GFX9: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[A]], [[A]]
; GCN: v_med3_f32 {{v[0-9]+}}, [[QUIET_A]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; As above, but the input is an fadd nnan result; the checks expect the v_add
; to feed v_med3_f32 directly.
; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd nnan float %a, 1.0
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; The next six functions repeat the six fmed3 operand orders under attribute
; group #2. The two orders with %a as the last operand still expect
; v_max_f32_e64 with clamp; the other four expect v_med3_f32 with the operands
; in source order.

; Operand order (0.0, 1.0, %a).
; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Operand order (1.0, 0.0, %a).
; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Operand order (%a, 0.0, 1.0).
; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Operand order (%a, 1.0, 0.0).
; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0
define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Operand order (0.0, %a, 1.0).
; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0
define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Operand order (1.0, %a, 0.0).
; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0
define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(0.0, 1.0, quiet NaN) under attribute group #2; the checks expect the
; NaN bit pattern 0x7fc00000 to be materialized.
; NOTE(review): attribute group #2 is defined outside the visible part of the
; file; by the function name it presumably disables dx10_clamp. Confirm.
; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000
define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(0.0, 1.0, bit pattern 2139095041) under attribute group #2; the checks
; expect that same bit pattern (0x7f800001) to be materialized.
; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001
define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; <2 x half> version of the plain min/max sequence. Only the gfx900 run has
; instruction checks here; they expect one v_pk_max_f16 with clamp.
; GCN-LABEL: {{^}}v_clamp_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; The bound vectors are <undef, 0.0> and <1.0, undef>; the checks still expect
; a single clamped v_pk_max_f16.
; GCN-LABEL: {{^}}v_clamp_v2f16_undef_elt:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16_undef_elt(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; The lower bound vector is <2.0, 0.0>, so one lane is not bounded at 0.0; the
; checks expect separate v_pk_max_f16 and v_pk_min_f16.
; GCN-LABEL: {{^}}v_clamp_v2f16_not_zero:
; GFX9: v_pk_max_f16
; GFX9: v_pk_min_f16
define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 0.0>)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; The upper bound vector is <0.0, 1.0>, so one lane is not bounded at 1.0; the
; checks expect separate v_pk_max_f16 and v_pk_min_f16.
; GCN-LABEL: {{^}}v_clamp_v2f16_not_one:
; GFX9: v_pk_max_f16
; GFX9: v_pk_min_f16
define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half 0.0>)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 0.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Both lanes negated (fsub <-0.0, -0.0>, %a) before the min/max sequence; the
; checks expect neg_lo and neg_hi set on the clamped v_pk_max_f16.
; GCN-LABEL: {{^}}v_clamp_neg_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
define amdgpu_kernel void @v_clamp_neg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %fneg.a = fsub <2 x half> <half -0.0, half -0.0>, %a
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.a, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; -|%a| on both lanes; the checks expect the abs as a separate v_and_b32 with
; mask 0x7fff7fff and the negation as neg_lo/neg_hi on the v_pk_max_f16.
; GCN-LABEL: {{^}}v_clamp_negabs_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[ABS]], [[ABS]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %fabs.a = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
  %fneg.fabs.a = fsub <2 x half> <half -0.0, half -0.0>, %fabs.a

  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.fabs.a, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Only element 0 is negated; the checks expect neg_lo alone on the clamped
; v_pk_max_f16.
; GCN-LABEL: {{^}}v_clamp_neglo_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] clamp{{$}}
define amdgpu_kernel void @v_clamp_neglo_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %lo = extractelement <2 x half> %a, i32 0
  %neg.lo = fsub half -0.0, %lo
  %neg.lo.vec = insertelement <2 x half> %a, half %neg.lo, i32 0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.vec, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Only element 1 is negated; the checks expect neg_hi alone on the clamped
; v_pk_max_f16.
; GCN-LABEL: {{^}}v_clamp_neghi_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_hi:[1,1] clamp{{$}}
define amdgpu_kernel void @v_clamp_neghi_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %hi = extractelement <2 x half> %a, i32 1
  %neg.hi = fsub half -0.0, %hi
  %neg.hi.vec = insertelement <2 x half> %a, half %neg.hi, i32 1
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.vec, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; The two elements are swapped by a shufflevector before the min/max sequence;
; the checks expect the swap to be expressed through op_sel / op_sel_hi on the
; clamped v_pk_max_f16.
; GCN-LABEL: {{^}}v_clamp_v2f16_shuffle:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] op_sel:[1,1] op_sel_hi:[0,0] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %shuf = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> <i32 1, i32 0>
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Bound vectors <0.0, undef> and <undef, 1.0>; the checks expect a single
; clamped v_pk_max_f16.
; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts0:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half undef>)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half undef, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Bound vectors <undef, 0.0> and <1.0, undef>; the checks expect a single
; clamped v_pk_max_f16.
; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts1:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Two different fadd results are combined with maxnum before the 0.0 / 1.0
; bounds are applied; the checks expect one v_max_f32_e64 of the two adds
; carrying the clamp modifier.
; GCN-LABEL: {{^}}v_clamp_diff_source_f32:
; GCN: v_add_f32_e32 [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[B:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[B]] clamp{{$}}
define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0
{
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 0
  %gep1 = getelementptr float, float addrspace(1)* %aptr, i32 1
  %gep2 = getelementptr float, float addrspace(1)* %aptr, i32 2
  %l0 = load float, float addrspace(1)* %gep0
  %l1 = load float, float addrspace(1)* %gep1
  %l2 = load float, float addrspace(1)* %gep2
  %a = fadd nsz float %l0, %l1
  %b = fadd nsz float %l0, %l2
  %res = call nsz float @llvm.maxnum.f32(float %a, float %b)
  %max = call nsz float @llvm.maxnum.f32(float %res, float 0.0)
  %min = call nsz float @llvm.minnum.f32(float %max, float 1.0)
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 3
  store float %min, float addrspace(1)* %out.gep
ret void 756} 757 758declare i32 @llvm.amdgcn.workitem.id.x() #1 759declare float @llvm.fabs.f32(float) #1 760declare float @llvm.minnum.f32(float, float) #1 761declare float @llvm.maxnum.f32(float, float) #1 762declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1 763declare double @llvm.fabs.f64(double) #1 764declare double @llvm.minnum.f64(double, double) #1 765declare double @llvm.maxnum.f64(double, double) #1 766declare half @llvm.fabs.f16(half) #1 767declare half @llvm.minnum.f16(half, half) #1 768declare half @llvm.maxnum.f16(half, half) #1 769declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1 770declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1 771declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1 772 773attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 774attributes #1 = { nounwind readnone } 775attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } 776attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } 777attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } 778