; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,VI %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX9 %s

; Test expansion of scalar selects on vectors.
; Evergreen not enabled since it seems to be having problems with doubles.
;
; Every kernel below picks between two vector values with a single scalar i1
; condition and stores the result through %out. The checks match the
; v_cndmask_b32 instructions expected for each vector type.
;
; Kernels named v_select_* load their vector operands through addrspace(1)
; pointers; the s_select_* and select_* kernels receive the vectors directly
; as kernel arguments.

;------------------------------------------------------------------------------
; i8 element vectors
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_select_v2i8:
; SI: v_cndmask_b32
; SI-NOT: cndmask

; GFX9: v_cndmask_b32
; GFX9-NOT: cndmask

; This is worse when i16 is legal and packed is not because
; SelectionDAGBuilder for some reason changes the select type.
; VI: v_cndmask_b32
; VI: v_cndmask_b32
define amdgpu_kernel void @v_select_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %a.ptr, <2 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x i8>, <2 x i8> addrspace(1)* %a.ptr, align 2
  %b = load <2 x i8>, <2 x i8> addrspace(1)* %b.ptr, align 2
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b
  store <2 x i8> %select, <2 x i8> addrspace(1)* %out, align 2
  ret void
}

; GCN-LABEL: {{^}}v_select_v4i8:
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %a.ptr, <4 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x i8>, <4 x i8> addrspace(1)* %a.ptr
  %b = load <4 x i8>, <4 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v8i8:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %a.ptr, <8 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <8 x i8>, <8 x i8> addrspace(1)* %a.ptr
  %b = load <8 x i8>, <8 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b
  store <8 x i8> %select, <8 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v16i8:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %a.ptr, <16 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <16 x i8>, <16 x i8> addrspace(1)* %a.ptr
  %b = load <16 x i8>, <16 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b
  store <16 x i8> %select, <16 x i8> addrspace(1)* %out, align 4
  ret void
}

; Unlike the other kernels, the condition operand here is an i8 argument.
; GCN-LABEL: {{^}}select_v4i8:
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
  %cmp = icmp eq i8 %c, 0
  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

;------------------------------------------------------------------------------
; i16 element vectors
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}select_v2i16:
; GCN: v_cndmask_b32_e32
; GCN-NOT: v_cndmask_b32
define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v2i16:
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v3i16:
; SI: v_cndmask_b32_e32
; SI: cndmask
; SI-NOT: cndmask

; GFX9: v_cndmask_b32_e32
; GFX9: cndmask
; GFX9-NOT: cndmask

; VI: v_cndmask_b32
; VI: v_cndmask_b32
; VI: v_cndmask_b32
define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr
  %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b
  store <3 x i16> %select, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v4i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %a.ptr, <4 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %a.ptr
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
  store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v8i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %a.ptr, <8 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <8 x i16>, <8 x i16> addrspace(1)* %a.ptr
  %b = load <8 x i16>, <8 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b
  store <8 x i16> %select, <8 x i16> addrspace(1)* %out, align 4
  ret void
}

;------------------------------------------------------------------------------
; i32 element vectors
;------------------------------------------------------------------------------

; FIXME: Expansion with bitwise operations may be better if doing a
; vector select with SGPR inputs.

; GCN-LABEL: {{^}}s_select_v2i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
  store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}s_select_v4i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
  store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; The false operand is zeroinitializer, so each cndmask is expected to take
; an inline 0 as its first source operand.
; GCN-LABEL: {{^}}v_select_v4i32:
; GCN: buffer_load_dwordx4
; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
bb:
  %tmp2 = icmp ult i32 %cond, 32
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %tmp3 = select i1 %tmp2, <4 x i32> %val, <4 x i32> zeroinitializer
  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v8i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
  store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
  ret void
}

;------------------------------------------------------------------------------
; f32 element vectors
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}s_select_v2f32:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}

; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}

; GCN: v_cndmask_b32_e32
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
; GCN: v_cndmask_b32_e32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
  store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}s_select_v4f32:
; GCN: s_load_dwordx4
; GCN: s_load_dwordx4
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}

; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32

; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
  store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; Float counterpart of v_select_v4i32; the false operand is again
; zeroinitializer.
; GCN-LABEL: {{^}}v_select_v4f32:
; GCN: buffer_load_dwordx4
; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
bb:
  %tmp2 = icmp ult i32 %cond, 32
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %tmp3 = select i1 %tmp2, <4 x float> %val, <4 x float> zeroinitializer
  store <4 x float> %tmp3, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v8f32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
  store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
  ret void
}

;------------------------------------------------------------------------------
; f64 element vectors (two cndmask checks per double element)
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}select_v2f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
  store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v4f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
  store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v8f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
  store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
  ret void
}

;------------------------------------------------------------------------------
; f16 element vectors
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_select_v2f16:
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr
  %b = load <2 x half>, <2 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x half> %a, <2 x half> %b
  store <2 x half> %select, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v3f16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %a.ptr, <3 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <3 x half>, <3 x half> addrspace(1)* %a.ptr
  %b = load <3 x half>, <3 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x half> %a, <3 x half> %b
  store <3 x half> %select, <3 x half> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v4f16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %a.ptr, <4 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x half>, <4 x half> addrspace(1)* %a.ptr
  %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x half> %a, <4 x half> %b
  store <4 x half> %select, <4 x half> addrspace(1)* %out, align 4
  ret void
}

; Function Attrs: nounwind readnone
; NOTE(review): no kernel in this file calls this intrinsic; the declaration
; is kept as-is.
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }