; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; Test expansion of scalar selects on vectors.
; Evergreen not enabled since it seems to be having problems with doubles.

; GCN-LABEL: {{^}}v_select_v2i8:
; SI: v_cndmask_b32
; SI-NOT: cndmask

; GFX9: v_cndmask_b32
; GFX9-NOT: cndmask

; This is worse when i16 is legal and packed is not because
; SelectionDAGBuilder for some reason changes the select type.
; VI: v_cndmask_b32
; VI: v_cndmask_b32
define amdgpu_kernel void @v_select_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %a.ptr, <2 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x i8>, <2 x i8> addrspace(1)* %a.ptr, align 2
  %b = load <2 x i8>, <2 x i8> addrspace(1)* %b.ptr, align 2
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b
  store <2 x i8> %select, <2 x i8> addrspace(1)* %out, align 2
  ret void
}

; GCN-LABEL: {{^}}v_select_v4i8:
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %a.ptr, <4 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x i8>, <4 x i8> addrspace(1)* %a.ptr
  %b = load <4 x i8>, <4 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v8i8:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %a.ptr, <8 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <8 x i8>, <8 x i8> addrspace(1)* %a.ptr
  %b = load <8 x i8>, <8 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b
  store <8 x i8> %select, <8 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v16i8:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %a.ptr, <16 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <16 x i8>, <16 x i8> addrspace(1)* %a.ptr
  %b = load <16 x i8>, <16 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b
  store <16 x i8> %select, <16 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}select_v4i8:
; GFX89: s_cselect_b32
; GFX89-NOT: s_cselect_b32

; SI: v_cndmask_b32
; SI-NOT: cndmask
define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
  %cmp = icmp eq i8 %c, 0
  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}select_v2i16:
; GFX89: s_load_dword
; GFX89: s_load_dword
; GFX89: s_load_dword
; GFX89: s_cselect_b32
; GFX89-NOT: s_cselect_b32

; SI: v_cndmask_b32_e32
; SI-NOT: cndmask
define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v2i16:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: v_cndmask_b32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v3i16:
; SI: v_cndmask_b32_e32
; SI: cndmask
; SI-NOT: cndmask

; GFX89: v_cndmask_b32_e32
; GFX89: cndmask
; VI: cndmask
; GFX89-NOT: cndmask
define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr
  %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b
  store <3 x i16> %select, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v4i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %a.ptr, <4 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %a.ptr
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
  store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v8i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %a.ptr, <8 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <8 x i16>, <8 x i16> addrspace(1)* %a.ptr
  %b = load <8 x i16>, <8 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b
  store <8 x i16> %select, <8 x i16> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Expansion with bitwise operations may be better if doing a
; vector select with SGPR inputs.

; GCN-LABEL: {{^}}s_select_v2i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
  store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}s_select_v4i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
  store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}v_select_v4i32:
; GCN: buffer_load_dwordx4
; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 32
; GCN: s_cselect_b64 vcc, -1, 0
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
bb:
  %tmp2 = icmp ult i32 %cond, 32
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %tmp3 = select i1 %tmp2, <4 x i32> %val, <4 x i32> zeroinitializer
  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v8i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
  store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}s_select_v2f32:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}

; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}

; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
; GCN-DAG: v_cndmask_b32_e32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
  store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}s_select_v3f32:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}

; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32

; GCN: buffer_store_dwordx
define amdgpu_kernel void @s_select_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x float> %a, <3 x float> %b
  store <3 x float> %select, <3 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}s_select_v4f32:
; GCN: s_load_dwordx4
; GCN: s_load_dwordx4
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}

; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32

; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
  store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}v_select_v4f32:
; GCN: buffer_load_dwordx4
; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 32
; GCN: s_cselect_b64 vcc, -1, 0
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
bb:
  %tmp2 = icmp ult i32 %cond, 32
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %tmp3 = select i1 %tmp2, <4 x float> %val, <4 x float> zeroinitializer
  store <4 x float> %tmp3, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}s_select_v5f32:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}

; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32

; GCN: buffer_store_dwordx
define amdgpu_kernel void @s_select_v5f32(<5 x float> addrspace(1)* %out, <5 x float> %a, <5 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <5 x float> %a, <5 x float> %b
  store <5 x float> %select, <5 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v8f32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
  store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v2f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
  store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v4f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
  store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v8f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
  store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}v_select_v2f16:
; GCN: v_cndmask_b32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr
  %b = load <2 x half>, <2 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x half> %a, <2 x half> %b
  store <2 x half> %select, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v3f16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %a.ptr, <3 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <3 x half>, <3 x half> addrspace(1)* %a.ptr
  %b = load <3 x half>, <3 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x half> %a, <3 x half> %b
  store <3 x half> %select, <3 x half> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v4f16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %a.ptr, <4 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x half>, <4 x half> addrspace(1)* %a.ptr
  %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x half> %a, <4 x half> %b
  store <4 x half> %select, <4 x half> addrspace(1)* %out, align 4
  ret void
}

; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }