; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s

; Codegen tests for IR selects that are expected to become v_cndmask_b32.
; Check prefix usage, as set up by the four llc invocations above:
;   - the GCN prefix is enabled for every invocation;
;   - the SIVI prefix is enabled for the default-cpu and tonga invocations;
;   - the GFX10 prefix is enabled for the gfx1010 and gfx1100 invocations,
;     both of which pass +wavefrontsize64.

; Intrinsics used by the tests below.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fabs.f16(half)
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)

; Select between a NaN constant and a float loaded from %fptr, keyed on the
; kernel argument %c being non-zero. The checks expect a scalar compare
; feeding s_cselect_b64, and a v_cndmask whose constant operand is -1 and
; whose other source is a VGPR.
; GCN-LABEL: {{^}}v_cnd_nan_nosgpr:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], -1, 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]}}, -1, v{{[0-9]+}}, [[COND]]
; GCN-DAG: v{{[0-9]}}
; All nan values are converted to 0xffffffff
; GCN: s_endpgm
define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
  %f = load float, float addrspace(1)* %f.gep
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, float addrspace(1)* %out
  ret void
}


; This requires slightly trickier SGPR operand legalization since the
; single constant bus SGPR usage is the last operand, and it should
; never be moved.
; However on GFX10 constant bus is limited to 2 scalar operands, not one.

; Same select as @v_cnd_nan_nosgpr, except that the non-constant operand %f
; is a kernel argument instead of a loaded value. The GFX10 checks expect
; an SGPR to be used directly as a source of v_cndmask_b32_e64.
; GCN-LABEL: {{^}}v_cnd_nan:
; SIVI: s_cmp_eq_u32 s{{[0-9]+}}, 0
; SIVI: s_cselect_b64 vcc, -1, 0
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, -1, v{{[0-9]+}}, vcc
; GFX10: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GFX10: s_cselect_b64 [[CC:s\[[0-9:]+\]]],
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, -1, s{{[0-9]+}}, [[CC]]
; GCN-DAG: v{{[0-9]}}
; All nan values are converted to 0xffffffff
; GCN: s_endpgm
define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, float addrspace(1)* %out
  ret void
}

; Test different compare and select operand types for optimal code
; shrinking.
; (select (cmp (sgprX, constant)), constant, sgprZ)

; %x and %z are both kernel arguments; the select constant (1.0) differs from
; the compare constant (0.0). The SI/VI checks expect %z to be copied into a
; VGPR before v_cndmask_b32_e32, while the GFX10 checks expect the SGPR to be
; used directly by v_cndmask_b32_e64.
; NOTE(review): the unnamed [8 x i32] argument presumably pads the kernarg
; layout so %x/%z land at the offset matched by the s_load check - confirm.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprZ_f32:
; GCN: s_load_{{dwordx2|b64}} s[[[X:[0-9]+]]:[[Z:[0-9]+]]], s[0:1], {{0x4c|0x13}}

; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0
; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], [[CC]]
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, s[[Z]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Variant where the same kernel argument %x is both the compared value and
; the non-constant select operand.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprX_f32:
; GCN: s_load_{{dword|b32}} [[X:s[0-9]+]]
; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], [[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], [[X]], 0
; SIVI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[X]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VX]], [[CC]]
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[X]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %x
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Variant where the select constant is the same value (0.0) as the compare
; constant; the other select operand is the kernel argument %z.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprZ_f32:
; GCN-DAG: s_load_{{dwordx2|b64}} s[[[X:[0-9]+]]:[[Z:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0
; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], [[CC]]
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 0, s[[Z]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Variant where the select constant equals the compare constant (0.0) and the
; other select operand is the compared kernel argument %x itself.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprX_f32:
; GCN: s_load_{{dword|b32}} [[X:s[0-9]+]]
; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], [[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], [[X]], 0
; SIVI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[X]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VX]], [[CC]]
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 0, [[X]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %x
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Compared value %x is a kernel argument; the non-constant select operand %z
; is loaded per work-item. The select constant equals the compare constant
; (0.0). The checks expect the loaded VGPR to be used directly by v_cndmask.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_vgprZ_f32:
; GCN-DAG: s_load_{{dword|b32}} [[X:s[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]]
; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %z = load float, float addrspace(1)* %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Same as @fcmp_sgprX_k0_select_k0_vgprZ_f32, but the select constant is 1.0
; rather than the compare constant.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_vgprZ_f32:
; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]]
; GCN-DAG: s_load_{{dword|b32}} [[X:s[0-9]+]]
; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %z = load float, float addrspace(1)* %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Compared value %x is loaded per work-item; the non-constant select operand
; %z is a kernel argument. For the IR predicate olt, the checks expect
; v_cmp_ngt_f32 with the constant 0 as its first source. The SI/VI checks
; expect %z copied to a VGPR; the GFX10 checks expect the SGPR used directly.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_sgprZ_f32:
; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN-DAG: s_load_{{dword|b32}} [[Z:s[0-9]+]]
; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]]
; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[Z]], vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load float, float addrspace(1)* %x.gep
  %setcc = fcmp olt float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Both %x and %z are loaded per work-item (volatile loads). For the IR
; predicate ult, the checks expect v_cmp_le_f32 with 0 as the first source
; and an e32-encoded v_cndmask with the constant 1.0 as its first source.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_f32:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]]
; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile float, float addrspace(1)* %z.gep
  %setcc = fcmp ult float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Integer version: i32 compare (slt 0) selecting between the constant 2 and
; the loaded %z. The checks expect v_cmp_lt_i32 with -1 as the first source.
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i32:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]]
; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i32, i32 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %x.gep
  %z = load volatile i32, i32 addrspace(1)* %z.gep
  %setcc = icmp slt i32 %x, 0
  %select = select i1 %setcc, i32 2, i32 %z
  store i32 %select, i32 addrspace(1)* %out.gep
  ret void
}

; 64-bit integer version. The checks expect a single v_cmp_lt_i64 and two
; 32-bit v_cndmask instructions: constant 0 paired with the high half of %z
; and constant 2 paired with the low half.
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i64:
; GCN: {{buffer|flat|global}}_load_{{dwordx2|b64}} v[[[X_LO:[0-9]+]]:[[X_HI:[0-9]+]]]
; GCN-DAG: {{buffer|flat|global}}_load_{{dwordx2|b64}} v[[[Z_LO:[0-9]+]]:[[Z_HI:[0-9]+]]]
; GCN-DAG: v_cmp_lt_i64_e32 vcc, -1, v[[[X_LO]]:[[X_HI]]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i64, i64 addrspace(1)* %x.gep
  %z = load volatile i64, i64 addrspace(1)* %z.gep
  %setcc = icmp slt i64 %x, 0
  %select = select i1 %setcc, i64 2, i64 %z
  store i64 %select, i64 addrspace(1)* %out.gep
  ret void
}

; Scalar f32 compare driving a <4 x float> select whose constant vector is
; the FALSE operand. The checks expect one v_cmp_nge_f32 (IR predicate ugt)
; and four v_cndmask instructions, one per vector element constant.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_{{dword|b128}}

; GCN: v_cmp_nge_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
  %setcc = fcmp ugt float %x, 4.0
  %select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>
  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
  ret void
}

; Same compare as @fcmp_vgprX_k0_select_vgprZ_k1_v4f32, but the constant
; vector is the TRUE operand of the select. The checks expect v_cmp_ge_f32
; here, with the constants still in the first source of each v_cndmask.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_{{dword|b128}}

; GCN: v_cmp_ge_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
  %setcc = fcmp ugt float %x, 4.0
  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
  ret void
}

; This must be swapped as a vector type before the condition has
; multiple uses.

; The constant 4.0 is the FIRST operand of the IR compare (ugt 4.0, %x) and
; the constant vector is the TRUE operand of the select. The checks expect
; v_cmp_le_f32 and four e32-encoded v_cndmask instructions with the element
; constants as their first source.
; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_{{dword|b128}}

; GCN: v_cmp_le_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
  %setcc = fcmp ugt float 4.0, %x
  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
  ret void
}

; i1 select with a constant-true operand. The checks expect the loaded byte
; to be masked with 1 and compared against 1, the two conditions to be
; combined with s_or_b64, and the i1 result to be materialised as 0/1 by
; v_cndmask_b32_e64 before the byte store.
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i1:
; GCN: load_{{dword|b32}}
; GCN: load_{{ubyte|u8}}
; GCN-DAG: v_cmp_gt_i32_e32 vcc, 0, v
; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 1,
; GCN-DAG: v_cmp_eq_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, v
; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, s{{\[[0-9]+:[0-9]+\]}}
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s
; GCN: store_{{byte|b8}}
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i1, i1 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i1, i1 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %x.gep
  %z = load volatile i1, i1 addrspace(1)* %z.gep
  %setcc = icmp slt i32 %x, 0
  %select = select i1 %setcc, i1 true, i1 %z
  store i1 %select, i1 addrspace(1)* %out.gep
  ret void
}

; Different types compared vs. selected
; f32 compare selecting a double. The constant double 1.0 is expected to be
; split into a high-half constant 0x3ff00000 and a low-half constant 0. The
; SI/VI checks expect the high half to be moved into a VGPR first, while the
; GFX10 checks expect it as a literal operand of v_cndmask_b32_e32.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; SIVI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3ff00000
; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_{{dwordx2|b64}}

; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
; SIVI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc
; GFX10-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3ff00000, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds double, double addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile double, double addrspace(1)* %z.gep
  %setcc = fcmp ult float %x, 0.0
  %select = select i1 %setcc, double 1.0, double %z
  store double %select, double addrspace(1)* %out.gep
  ret void
}

; Different types compared vs. selected
; f32 compare selecting an i64. The constant i64 3 is expected to appear as
; the 32-bit constants 3 and 0 in two v_cndmask instructions.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_{{dwordx2|b64}}

; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile i64, i64 addrspace(1)* %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, i64 3, i64 %z
  store i64 %select, i64 addrspace(1)* %out.gep
  ret void
}

; Different types compared vs. selected
; i32 compare (ugt 1) selecting a float. The checks expect v_cmp_gt_u32 with
; the constant 2 as the first source, and the float constant 4.0 as the first
; source of v_cndmask.
; GCN-LABEL: {{^}}icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]]

; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %x.gep
  %z = load volatile float, float addrspace(1)* %z.gep
  %setcc = icmp ugt i32 %x, 1
  %select = select i1 %setcc, float 4.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; FIXME: Should be able to handle multiple uses

; One compare result feeds two selects. The checks record the current
; output: the constants -1.0 and -2.0 appear as the SECOND source of
; e64-encoded v_cndmask instructions, unlike the single-use tests in this
; file where the constant is the first source of an e32 encoding.
; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]

; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile float, float addrspace(1)* %z.gep
  %setcc = fcmp ugt float 4.0, %x
  %select0 = select i1 %setcc, float -1.0, float %z
  %select1 = select i1 %setcc, float -2.0, float %z
  ; Both results are stored (volatile) to the same address.
  store volatile float %select0, float addrspace(1)* %out.gep
  store volatile float %select1, float addrspace(1)* %out.gep
  ret void
}

; Source modifiers abs/neg only work for f32

; f16 case: select between fabs(%f) and fneg(%f). The checks expect a
; v_cndmask_b32_e32 with plain VGPR sources, i.e. no modifiers on the operands.
; GCN-LABEL: {{^}}v_cndmask_abs_neg_f16:
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
define amdgpu_kernel void @v_cndmask_abs_neg_f16(half addrspace(1)* %out, i32 %c, half addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr half, half addrspace(1)* %fptr, i32 %idx
  %f = load half, half addrspace(1)* %f.gep
  %f.abs = call half @llvm.fabs.f16(half %f)
  %f.neg = fneg half %f
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, half %f.abs, half %f.neg
  store half %select, half addrspace(1)* %out
  ret void
}

; f32 case: the checks expect the neg and abs modifiers to be applied
; directly on the sources of a single v_cndmask_b32_e64.
; GCN-LABEL: {{^}}v_cndmask_abs_neg_f32:
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, |v{{[0-9]+}}|,
define amdgpu_kernel void @v_cndmask_abs_neg_f32(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
  %f = load float, float addrspace(1)* %f.gep
  %f.abs = call float @llvm.fabs.f32(float %f)
  %f.neg = fneg float %f
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float %f.abs, float %f.neg
  store float %select, float addrspace(1)* %out
  ret void
}

; f64 case: the checks expect two v_cndmask_b32_e32 instructions with plain
; VGPR sources, i.e. no modifiers on the operands.
; GCN-LABEL: {{^}}v_cndmask_abs_neg_f64:
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
define amdgpu_kernel void @v_cndmask_abs_neg_f64(double addrspace(1)* %out, i32 %c, double addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr double, double addrspace(1)* %fptr, i32 %idx
  %f = load double, double addrspace(1)* %f.gep
  %f.abs = call double @llvm.fabs.f64(double %f)
  %f.neg = fneg double %f
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, double %f.abs, double %f.neg
  store double %select, double addrspace(1)* %out
  ret void
}

; #0 is attached to every kernel definition in this file; #1 is attached to
; the workitem-id intrinsic declaration and its call sites.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }