; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s

; Codegen checks for v_cndmask_b32 selection. The file is compiled three
; times: for the default amdgcn subtarget, for tonga, and for gfx1010 with
; wavefrontsize64. The first two configurations share one set of check
; lines; gfx1010 has its own where the expected output differs.

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fabs.f16(half)
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)

; Select between a NaN constant and a float loaded from memory. The
; condition is a scalar integer compare of the kernel argument %c with 0.
; The condition mask may live in vcc or in any 64-bit SGPR pair, and the
; select may be emitted in either the e32 or the e64 encoding.
; GCN-LABEL: {{^}}v_cnd_nan_nosgpr:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], -1, 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, -1, v{{[0-9]+}}, [[COND]]
; GCN-DAG: v{{[0-9]}}
; All nan values are converted to 0xffffffff
; GCN: s_endpgm
define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
  %f = load float, float addrspace(1)* %f.gep
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, float addrspace(1)* %out
  ret void
}


; This requires slightly trickier SGPR operand legalization since the
; single constant bus SGPR usage is the last operand, and it should
; never be moved.
; However on GFX10 constant bus is limited to 2 scalar operands, not one.
; Same select as a NaN-constant test, but the non-constant select operand
; %f is a float kernel argument instead of a value loaded from memory.
; GCN-LABEL: {{^}}v_cnd_nan:
; SIVI: s_cmp_eq_u32 s{{[0-9]+}}, 0
; SIVI: s_cselect_b64 vcc, -1, 0
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, -1, v{{[0-9]+}}, vcc
; GFX10: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GFX10: s_cselect_b64 [[CC:s\[[0-9:]+\]]],
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, -1, s{{[0-9]+}}, [[CC]]
; GCN-DAG: v{{[0-9]}}
; All nan values are converted to 0xffffffff
; GCN: s_endpgm
define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, float addrspace(1)* %out
  ret void
}

; Test different compare and select operand types for optimal code
; shrinking.
; (select (cmp (sgprX, constant)), constant, sgprZ)

; Compare operand %x and select operand %z are both float kernel
; arguments; the selected constant is 1.0. The base register pair of the
; kernel-argument load is not what this test is about, so any pair is
; accepted.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprZ_f32:
; GCN: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x4c|0x13}}

; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0
; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], [[CC]]
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, s[[Z]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; The same kernel argument %x is used both as the compare operand and as
; the non-constant select operand; the selected constant is 1.0.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprX_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], [[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], [[X]], 0
; SIVI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[X]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VX]], [[CC]]
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[X]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %x
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Kernel-argument compare and select operands; the selected constant is
; 0.0, the same constant the compare uses.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprZ_f32:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0
; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], [[CC]]
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 0, s[[Z]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; One kernel argument %x serves as compare operand and select operand;
; the selected constant is 0.0.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprX_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], [[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], [[X]], 0
; SIVI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[X]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VX]], [[CC]]
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 0, [[X]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %x
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Compare operand %x is a kernel argument, select operand %z is loaded
; from memory; the selected constant is 0.0.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_vgprZ_f32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %z = load float, float addrspace(1)* %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Compare operand %x is a kernel argument, select operand %z is loaded
; from memory; the selected constant is 1.0.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_vgprZ_f32:
; GCN-DAG: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
; GCN-DAG: s_load_dword [[X:s[0-9]+]]
; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %z = load float, float addrspace(1)* %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Compare operand %x is loaded from memory, select operand %z is a kernel
; argument. The expected compare is the inverse of the IR predicate (olt)
; with the constant as its first source operand.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_sgprZ_f32:
; GCN-DAG: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN-DAG: s_load_dword [[Z:s[0-9]+]]
; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]]
; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[Z]], vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load float, float addrspace(1)* %x.gep
  %setcc = fcmp olt float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Both the compare operand and the select operand are loaded from memory.
; The loads are volatile, and the checks expect them in program order.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile float, float addrspace(1)* %z.gep
  %setcc = fcmp ult float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; 32-bit integer variant: signed "less than 0" compare, selected constant 2.
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i32, i32 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %x.gep
  %z = load volatile i32, i32 addrspace(1)* %z.gep
  %setcc = icmp slt i32 %x, 0
  %select = select i1 %setcc, i32 2, i32 %z
  store i32 %select, i32 addrspace(1)* %out.gep
  ret void
}

; 64-bit integer variant: one 64-bit compare feeds two 32-bit selects, one
; for each half of the result (constant 2 for the low half, 0 for the high).
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i64:
; GCN: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[X_LO:[0-9]+]]:[[X_HI:[0-9]+]]{{\]}}
; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[Z_LO:[0-9]+]]:[[Z_HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i64, i64 addrspace(1)* %x.gep
  %z = load volatile i64, i64 addrspace(1)* %z.gep
  %setcc = icmp slt i64 %x, 0
  %select = select i1 %setcc, i64 2, i64 %z
  store i64 %select, i64 addrspace(1)* %out.gep
  ret void
}

; Vector select with a scalar condition; the constant vector is the
; false operand of the select and %z is the true operand.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dwordx4

; GCN: v_cmp_nge_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
  %setcc = fcmp ugt float %x, 4.0
  %select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>
  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
  ret void
}

; Vector select with a scalar condition; the constant vector is the true
; operand of the select, so the expected compare is the inverse of the IR
; predicate (ugt).
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dwordx4

; GCN: v_cmp_ge_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
  %setcc = fcmp ugt float %x, 4.0
  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
  ret void
}

; This must be swapped as a vector type before the condition has
; multiple uses.
; Vector select on a scalar compare whose constant is the first operand
; in the IR; the constant vector is the true operand of the select.
; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dwordx4

; GCN: v_cmp_le_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x() #1
  %lane.i64 = sext i32 %lane to i64
  %x.addr = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %lane.i64
  %z.addr = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %lane.i64
  %out.addr = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %lane.i64
  %x.val = load volatile float, float addrspace(1)* %x.addr
  %z.val = load volatile <4 x float>, <4 x float> addrspace(1)* %z.addr
  %cond = fcmp ugt float 4.0, %x.val
  %result = select i1 %cond, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z.val
  store <4 x float> %result, <4 x float> addrspace(1)* %out.addr
  ret void
}

; i1 select: the selected constant is "true" and the other operand is an
; i1 value loaded from memory.
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i1:
; GCN: load_dword
; GCN: load_ubyte
; GCN-DAG: v_cmp_gt_i32_e32 vcc, 0, v
; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 1,
; GCN-DAG: v_cmp_eq_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, v
; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, s{{\[[0-9]+:[0-9]+\]}}
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s
; GCN: store_byte
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x() #1
  %lane.i64 = sext i32 %lane to i64
  %x.addr = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %lane.i64
  %z.addr = getelementptr inbounds i1, i1 addrspace(1)* %z.ptr, i64 %lane.i64
  %out.addr = getelementptr inbounds i1, i1 addrspace(1)* %out, i64 %lane.i64
  %x.val = load volatile i32, i32 addrspace(1)* %x.addr
  %z.val = load volatile i1, i1 addrspace(1)* %z.addr
  %cond = icmp slt i32 %x.val, 0
  %result = select i1 %cond, i1 true, i1 %z.val
  store i1 %result, i1 addrspace(1)* %out.addr
  ret void
}

; The compared type (float) differs from the selected type (double).
; GCN-LABEL: {{^}}fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; SIVI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3ff00000
; GCN-DAG: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_dwordx2

; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
; SIVI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc
; GFX10-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3ff00000, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x() #1
  %lane.i64 = sext i32 %lane to i64
  %x.addr = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %lane.i64
  %z.addr = getelementptr inbounds double, double addrspace(1)* %z.ptr, i64 %lane.i64
  %out.addr = getelementptr inbounds double, double addrspace(1)* %out, i64 %lane.i64
  %x.val = load volatile float, float addrspace(1)* %x.addr
  %z.val = load volatile double, double addrspace(1)* %z.addr
  %cond = fcmp ult float %x.val, 0.0
  %result = select i1 %cond, double 1.0, double %z.val
  store double %result, double addrspace(1)* %out.addr
  ret void
}

; The compared type (float) differs from the selected type (i64).
; GCN-LABEL: {{^}}fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dwordx2

; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x() #1
  %lane.i64 = sext i32 %lane to i64
  %x.addr = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %lane.i64
  %z.addr = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %lane.i64
  %out.addr = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %lane.i64
  %x.val = load volatile float, float addrspace(1)* %x.addr
  %z.val = load volatile i64, i64 addrspace(1)* %z.addr
  %cond = fcmp one float %x.val, 0.0
  %result = select i1 %cond, i64 3, i64 %z.val
  store i64 %result, i64 addrspace(1)* %out.addr
  ret void
}

; The compared type (i32) differs from the selected type (float).
; GCN-LABEL: {{^}}icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]

; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x() #1
  %lane.i64 = sext i32 %lane to i64
  %x.addr = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %lane.i64
  %z.addr = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %lane.i64
  %out.addr = getelementptr inbounds float, float addrspace(1)* %out, i64 %lane.i64
  %x.val = load volatile i32, i32 addrspace(1)* %x.addr
  %z.val = load volatile float, float addrspace(1)* %z.addr
  %cond = icmp ugt i32 %x.val, 1
  %result = select i1 %cond, float 4.0, float %z.val
  store float %result, float addrspace(1)* %out.addr
  ret void
}

; FIXME: A condition with multiple uses should be handled as well.
; Here one compare result feeds two selects.

; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]

; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x() #1
  %lane.i64 = sext i32 %lane to i64
  %x.addr = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %lane.i64
  %z.addr = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %lane.i64
  %out.addr = getelementptr inbounds float, float addrspace(1)* %out, i64 %lane.i64
  %x.val = load volatile float, float addrspace(1)* %x.addr
  %z.val = load volatile float, float addrspace(1)* %z.addr
  %cond = fcmp ugt float 4.0, %x.val
  %result0 = select i1 %cond, float -1.0, float %z.val
  %result1 = select i1 %cond, float -2.0, float %z.val
  store volatile float %result0, float addrspace(1)* %out.addr
  store volatile float %result1, float addrspace(1)* %out.addr
  ret void
}

; The abs/neg source modifiers are only usable for f32. The f16 and f64
; variants below expect plain register operands on the select.

; GCN-LABEL: {{^}}v_cndmask_abs_neg_f16:
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
define amdgpu_kernel void @v_cndmask_abs_neg_f16(half addrspace(1)* %out, i32 %c, half addrspace(1)* %fptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.addr = getelementptr half, half addrspace(1)* %fptr, i32 %lane
  %f.val = load half, half addrspace(1)* %f.addr
  %abs = call half @llvm.fabs.f16(half %f.val)
  %neg = fneg half %f.val
  %cond = icmp ne i32 %c, 0
  %result = select i1 %cond, half %abs, half %neg
  store half %result, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_cndmask_abs_neg_f32:
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, |v{{[0-9]+}}|,
define amdgpu_kernel void @v_cndmask_abs_neg_f32(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.addr = getelementptr float, float addrspace(1)* %fptr, i32 %lane
  %f.val = load float, float addrspace(1)* %f.addr
  %abs = call float @llvm.fabs.f32(float %f.val)
  %neg = fneg float %f.val
  %cond = icmp ne i32 %c, 0
  %result = select i1 %cond, float %abs, float %neg
  store float %result, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_cndmask_abs_neg_f64:
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
define amdgpu_kernel void @v_cndmask_abs_neg_f64(double addrspace(1)* %out, i32 %c, double addrspace(1)* %fptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.addr = getelementptr double, double addrspace(1)* %fptr, i32 %lane
  %f.val = load double, double addrspace(1)* %f.addr
  %abs = call double @llvm.fabs.f64(double %f.val)
  %neg = fneg double %f.val
  %cond = icmp ne i32 %c, 0
  %result = select i1 %cond, double %abs, double %neg
  store double %result, double addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }