; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; Test expansion of scalar selects on vectors.
; Evergreen not enabled since it seems to be having problems with doubles.

; GCN-LABEL: {{^}}v_select_v2i8:
; SI: v_cndmask_b32
; SI-NOT: cndmask

; GFX9: v_cndmask_b32
; GFX9-NOT: cndmask

; This is worse when i16 is legal and packed is not because
; SelectionDAGBuilder for some reason changes the select type.
; VI: v_cndmask_b32
; VI: v_cndmask_b32
define amdgpu_kernel void @v_select_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %a.ptr, <2 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x i8>, <2 x i8> addrspace(1)* %a.ptr, align 2
  %b = load <2 x i8>, <2 x i8> addrspace(1)* %b.ptr, align 2
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b
  store <2 x i8> %select, <2 x i8> addrspace(1)* %out, align 2
  ret void
}

; GCN-LABEL: {{^}}v_select_v4i8:
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %a.ptr, <4 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x i8>, <4 x i8> addrspace(1)* %a.ptr
  %b = load <4 x i8>, <4 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v8i8:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %a.ptr, <8 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <8 x i8>, <8 x i8> addrspace(1)* %a.ptr
  %b = load <8 x i8>, <8 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b
  store <8 x i8> %select, <8 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v16i8:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %a.ptr, <16 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <16 x i8>, <16 x i8> addrspace(1)* %a.ptr
  %b = load <16 x i8>, <16 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b
  store <16 x i8> %select, <16 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}select_v4i8:
; GFX89: s_cselect_b32
; GFX89-NOT: s_cselect_b32

; SI: v_cndmask_b32
; SI-NOT: cndmask
define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
  %cmp = icmp eq i8 %c, 0
  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}select_v2i16:
; GFX89: s_load_dword
; GFX89: s_load_dword
; GFX89: s_load_dword
; GFX89: s_cselect_b32
; GFX89-NOT: s_cselect_b32

; SI: v_cndmask_b32_e32
; The negative check below previously used a pattern with a missing
; underscore that could never match any emitted instruction, making it
; vacuous; it now rejects any further cndmask encodings (e32/e64).
; SI-NOT: v_cndmask_b32_e
define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v2i16:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: v_cndmask_b32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v3i16:
; SI: v_cndmask_b32_e32
; SI: cndmask
; SI-NOT: cndmask

; GFX89: v_cndmask_b32_e32
; GFX89: cndmask
; VI: cndmask
; GFX89-NOT: cndmask
define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr
  %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b
  store <3 x i16> %select, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v4i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %a.ptr, <4 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %a.ptr
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
  store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v8i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %a.ptr, <8 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <8 x i16>, <8 x i16> addrspace(1)* %a.ptr
  %b = load <8 x i16>, <8 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b
  store <8 x i16> %select, <8 x i16> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Expansion with bitwise operations may be better if doing a
; vector select with SGPR inputs.

; GCN-LABEL: {{^}}s_select_v2i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
  store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}s_select_v4i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
  store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}v_select_v4i32:
; GCN: buffer_load_dwordx4
; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 32
; GCN: s_cselect_b64 vcc, -1, 0
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
bb:
  %tmp2 = icmp ult i32 %cond, 32
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %tmp3 = select i1 %tmp2, <4 x i32> %val, <4 x i32> zeroinitializer
  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v8i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
  store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}s_select_v2f32:
; GCN-DAG: s_load_dwordx4 s[[[ALO:[0-9]+]]:[[BHI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}

; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}

; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_cndmask_b32_e32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
  store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}s_select_v3f32:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}

; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32

; GCN: buffer_store_dwordx
define amdgpu_kernel void @s_select_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x float> %a, <3 x float> %b
  store <3 x float> %select, <3 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}s_select_v4f32:
; GCN: s_load_dwordx8
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}

; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32

; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
  store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}v_select_v4f32:
; GCN: buffer_load_dwordx4
; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 32
; GCN: s_cselect_b64 vcc, -1, 0
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
bb:
  %tmp2 = icmp ult i32 %cond, 32
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %tmp3 = select i1 %tmp2, <4 x float> %val, <4 x float> zeroinitializer
  store <4 x float> %tmp3, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}s_select_v5f32:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}

; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32

; GCN: buffer_store_dwordx
define amdgpu_kernel void @s_select_v5f32(<5 x float> addrspace(1)* %out, <5 x float> %a, <5 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <5 x float> %a, <5 x float> %b
  store <5 x float> %select, <5 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v8f32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
  store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v2f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
  store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v4f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
  store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v8f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
  store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}v_select_v2f16:
; GCN: v_cndmask_b32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr
  %b = load <2 x half>, <2 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x half> %a, <2 x half> %b
  store <2 x half> %select, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v3f16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %a.ptr, <3 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <3 x half>, <3 x half> addrspace(1)* %a.ptr
  %b = load <3 x half>, <3 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x half> %a, <3 x half> %b
  store <3 x half> %select, <3 x half> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v4f16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %a.ptr, <4 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x half>, <4 x half> addrspace(1)* %a.ptr
  %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x half> %a, <4 x half> %b
  store <4 x half> %select, <4 x half> addrspace(1)* %out, align 4
  ret void
}

; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }