; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s

; Codegen tests for the insertelement instruction on AMDGPU, covering
; both compile-time-constant indices and dynamic (register) indices,
; across element types i8/i16/i32/i64/f32/f64 and vector widths 2-16.

; FIXME: Broken on evergreen
; FIXME: For some reason the 8 and 16 vectors are being stored as
; individual elements instead of 128-bit stores.


; FIXME: Why is the constant moved into the intermediate register and
; not just directly into the vector component?

; GCN-LABEL: {{^}}insertelement_v4f32_0:
; GCN: s_load_dwordx4
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000
; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]]
; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]:
define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}insertelement_v4f32_1:
define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}insertelement_v4f32_2:
define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}insertelement_v4f32_3:
define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}insertelement_v4i32_0:
define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
  %vecins = insertelement <4 x i32> %a, i32 999, i32 0
  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}insertelement_v3f32_1:
define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}insertelement_v3f32_2:
define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}

; Out-of-bounds constant index (index 3 into a <3 x float>).
; GCN-LABEL: {{^}}insertelement_v3f32_3:
define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}

; The image intrinsic requires its resource operand in SGPRs; check no
; readfirstlane copies are emitted for the inserted value.
; GCN-LABEL: {{^}}insertelement_to_sgpr:
; GCN-NOT: v_readfirstlane
define <4 x float> @insertelement_to_sgpr() nounwind {
  %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
  %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
  %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
  ret <4 x float> %tmp2
}

; Dynamic indices below: small vectors are expected to lower to a
; compare + cndmask per element rather than stack spills or movrel.
; GCN-LABEL: {{^}}dynamic_insertelement_v2f32:
; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]], v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}dynamic_insertelement_v3f32:
; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]]
; GCN-DAG: buffer_store_dwordx3 v
define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}dynamic_insertelement_v4f32:
; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC4]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]], v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}dynamic_insertelement_v8f32:
; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 7
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CCL]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
  %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
  store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
  ret void
}

; 16 elements is past the cndmask-expansion threshold; expect movrel.
; GCN-LABEL: {{^}}dynamic_insertelement_v16f32:
; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
  store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
  ret void
}

; GCN-LABEL: {{^}}dynamic_insertelement_v2i32:
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5, v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
  %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}dynamic_insertelement_v3i32:
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: buffer_store_dwordx3 v
define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
  %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
  store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
  ret void
}

; Non-constant inserted value (%val), so compares are eq + select of
; the value register rather than ne + select of an inline constant.
; GCN-LABEL: {{^}}dynamic_insertelement_v4i32:
; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}
; GCN-DAG: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC4]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC3]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC1]]
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
  %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}dynamic_insertelement_v8i32:
; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 7
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CCL]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
  %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
  store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
  ret void
}

; GCN-LABEL: {{^}}dynamic_insertelement_v16i32:
; GCN: v_movreld_b32
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
  %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
  store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
  ret void
}

; GCN-LABEL: {{^}}dynamic_insertelement_v2i16:
define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
  %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}dynamic_insertelement_v3i16:
define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
  %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
  store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
  ret void
}

; Sub-dword elements: lowered via a shifted byte mask and bitwise ops
; instead of per-element selects.
; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-NOT: _load
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1
; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 0x505, [[MASK]]
; VI: v_xor_b32_e32 [[NOT_MASK:v[0-9]+]], -1, [[MASK]]
; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[LOAD]], [[NOT_MASK]]
; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[AND_NOT_MASK]]
; VI: buffer_store_short [[OR]]
define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
  %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
  store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
  ret void
}

; FIXME: post legalize i16 and i32 shifts aren't merged because of
; isTypeDesirableForOp in SimplifyDemandedBits

; GCN-LABEL: {{^}}dynamic_insertelement_v3i8:
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-NOT: _load

; VI: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x5050505
; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], [[VAL]], [[V_LOAD]]
; VI: v_lshrrev_b32_e32 [[V_HI2:v[0-9]+]], 16, [[BFI]]

; VI: buffer_store_short [[BFI]]
; VI: buffer_store_byte [[V_HI2]]
define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
  %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
  store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}dynamic_insertelement_v4i8:
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-NOT: _load

; VI: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x5050505
; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], [[VAL]], [[V_LOAD]]
; VI: buffer_store_dword [[BFI]]
define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
  %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
  store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

; Vector loaded from constant memory, so the whole mask/insert sequence
; should stay in scalar (s_*) instructions.
; GCN-LABEL: {{^}}s_dynamic_insertelement_v8i8:
; VI-NOT: {{buffer|flat|global}}_load
; VI-DAG: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; VI-DAG: s_load_dword [[IDX:s[0-9]]], s[4:5], 0x10
; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0{{$}}
; VI-DAG: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0

; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
; VI: s_mov_b32 [[VAL:s[0-9]+]], 0x5050505
; VI: s_and_b32 s[[INS_HI:[0-9]+]], s[[MASK_SHIFT_HI]], [[VAL]]
; VI: s_and_b32 s[[INS_LO:[0-9]+]], s[[MASK_SHIFT_LO]], [[VAL]]
; VI: s_andn2_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[VEC]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS_LO]]:[[INS_HI]]{{\]}}, [[AND]]
; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]]
; VI: v_mov_b32_e32 v[[V_RESULT1:[0-9]+]], s[[RESULT1]]
; VI: buffer_store_dwordx2 v{{\[}}[[V_RESULT0]]:[[V_RESULT1]]{{\]}}
define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
  %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4
  %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
  store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
; GCN: s_load_dwordx2
; GCN: s_load_dwordx4
; GCN: s_load_dword s

; GCN-NOT: buffer_store_byte

; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 15
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CCL]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]

; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
  %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
  store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
  ret void
}

; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that
; the compiler doesn't crash.
; GCN-LABEL: {{^}}insert_split_bb:
define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
entry:
  %0 = insertelement <2 x i32> undef, i32 %a, i32 0
  %1 = icmp eq i32 %a, 0
  br i1 %1, label %if, label %else

if:
  %2 = load i32, i32 addrspace(1)* %in
  %3 = insertelement <2 x i32> %0, i32 %2, i32 1
  br label %endif

else:
  %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %5 = load i32, i32 addrspace(1)* %4
  %6 = insertelement <2 x i32> %0, i32 %5, i32 1
  br label %endif

endif:
  %7 = phi <2 x i32> [%3, %if], [%6, %else]
  store <2 x i32> %7, <2 x i32> addrspace(1)* %out
  ret void
}

; 64-bit elements: each dynamic insert expands to a pair of 32-bit
; selects (low and high halves) per element.
; GCN-LABEL: {{^}}dynamic_insertelement_v2f64:
; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}}
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x18|0x60}}{{$}}

; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000

; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[ELT1]], [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[ELT1]], [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]

; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}dynamic_insertelement_v2i64:

; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]

; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}dynamic_insertelement_v3i64:
; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC3]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC3]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
  %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
  store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
  ret void
}

; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:

; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40200000
; GCN-DAG: v_cmp_eq_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC4]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC4]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC3]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC3]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
; GCN: ScratchSize: 0

define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
  ret void
}

; v8f64 is large enough that the dynamic insert goes through a scratch
; spill/reload sequence (store vector, overwrite one element offen,
; reload) rather than per-element selects; hence ScratchSize: 128.
; GCN-LABEL: {{^}}dynamic_insertelement_v8f64:
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}}

; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}}

; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}}

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
; GCN: ScratchSize: 128
define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
  ret void
}

declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }