; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; Codegen checks for extractelement on vectors of i16 (<2 x i16> up to
; <16 x i16>) with constant and dynamic indices. The three invocations above
; cover the default subtarget, tonga and gfx900; the check prefixes each one
; enables are listed on its command line.

; Constant-index extraction of both elements of a <2 x i16> loaded through an
; addrspace(4) pointer; element 1 is stored to %out and element 0 to %out + 10.
; GCN-LABEL: {{^}}extract_vector_elt_v2i16:
; GCN: s_load_dword [[VEC:s[0-9]+]]
; SIVI: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
; SIVI-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]]
; SIVI-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
; SIVI-DAG: buffer_store_short [[VELT0]]
; SIVI-DAG: buffer_store_short [[VELT1]]
; GFX9: v_mov_b32_e32 [[VVEC:v[0-9]+]], [[VEC]]
; GFX9: global_store_short_d16_hi v{{[0-9]+}}, [[VVEC]],
; GFX9: buffer_store_short [[VVEC]],
define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %p0 = extractelement <2 x i16> %vec, i32 0
  %p1 = extractelement <2 x i16> %vec, i32 1
  %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
  store i16 %p1, i16 addrspace(1)* %out, align 2
  store i16 %p0, i16 addrspace(1)* %out1, align 2
  ret void
}

; Dynamic index taken from a kernel argument: the checks expect the index to
; be scaled by a left shift of 4 and the element to be obtained with a scalar
; right shift of the loaded dword, with a reported scratch size of 0.
; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_sgpr:
; GCN: s_load_dword [[IDX:s[0-9]+]]
; GCN: s_load_dword [[VEC:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SCALED:s[0-9]+]], [[IDX]], 4
; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], [[IDX_SCALED]]
; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
; GCN: buffer_store_short [[VELT1]]
; GCN: ScratchSize: 0
define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 {
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt = extractelement <2 x i16> %vec, i32 %idx
  store i16 %elt, i16 addrspace(1)* %out, align 2
  ret void
}

; Dynamic index loaded (volatile) from memory at a per-workitem address: the
; checks expect a vector shift of the scalar-loaded dword by the scaled index.
; Only the default-subtarget and tonga invocations check the shift and store.
; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_vgpr:
; GCN-DAG: {{flat|buffer|global}}_load_dword [[IDX:v[0-9]+]]
; GCN-DAG: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 4, [[IDX]]
; GCN-DAG: s_load_dword [[VEC:s[0-9]+]]

; SI: v_lshr_b32_e32 [[ELT:v[0-9]+]], [[VEC]], [[IDX_SCALED]]
; VI: v_lshrrev_b32_e64 [[ELT:v[0-9]+]], [[IDX_SCALED]], [[VEC]]

; SI: buffer_store_short [[ELT]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
  %idx = load volatile i32, i32 addrspace(1)* %gep
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt = extractelement <2 x i16> %vec, i32 %idx
  store i16 %elt, i16 addrspace(1)* %out.gep, align 2
  ret void
}

; Elements 0 and 2 of a <3 x i16> passed by value as a kernel argument; the
; checks require that no buffer/flat/global load is emitted before the stores.
; GCN-LABEL: {{^}}extract_vector_elt_v3i16:
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2

; GCN-NOT: {{buffer|flat|global}}_load

; GCN: buffer_store_short
; GCN: buffer_store_short
define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
  %p0 = extractelement <3 x i16> %foo, i32 0
  %p1 = extractelement <3 x i16> %foo, i32 2
  %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  store i16 %p1, i16 addrspace(1)* %out, align 2
  store i16 %p0, i16 addrspace(1)* %out1, align 2
  ret void
}

; Elements 0 and 2 of a <4 x i16> kernel argument, written with volatile
; stores to %out and %out + 10.
; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
; SI: s_load_dwordx2
; SI: buffer_store_short
; SI: buffer_store_short

; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[0:1], 0x2c
; GFX89-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], s[[LOAD0]]
; GFX89-DAG: buffer_store_short [[VLOAD0]], off
; GFX89-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], s[[LOAD1]]
; GFX89-DAG: buffer_store_short [[VLOAD1]], off
define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
  %p0 = extractelement <4 x i16> %foo, i32 0
  %p1 = extractelement <4 x i16> %foo, i32 2
  %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
  store volatile i16 %p1, i16 addrspace(1)* %out, align 2
  store volatile i16 %p0, i16 addrspace(1)* %out1, align 2
  ret void
}

; Dynamic-index extraction from a <3 x i16> kernel argument; the checks expect
; the index to be scaled by a left shift of 4 and used in a 64-bit scalar
; right shift.
; NOTE(review): none of the three llc invocations at the top of this file
; enables a SICI prefix, so the three SICI lines below are never matched by
; FileCheck. The kernel contains a single store, so retargeting them to an
; enabled prefix would likely fail; confirm whether they should be dropped.
; NOTE(review): %out1 in the kernel body is computed but never used.
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
; SI: s_load_dword s
; SI: s_load_dwordx2 s
; SI: s_load_dwordx2 s

; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[0:1], 0x24
; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[0:1], 0x4c
; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[0:1], 0x54

; GCN-NOT: {{buffer|flat|global}}

; SICI: buffer_store_short
; SICI: buffer_store_short
; SICI: buffer_store_short

; GFX9-NOT: s_pack_ll_b32_b16
; GFX9-NOT: s_pack_lh_b32_b16

; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s
; GCN: {{buffer|global}}_store_short
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, [8 x i32], <3 x i16> %foo, i32 %idx) #0 {
  %p0 = extractelement <3 x i16> %foo, i32 %idx
  %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  store i16 %p0, i16 addrspace(1)* %out
  ret void
}

; Dynamic-index extraction from a <4 x i16> loaded per workitem. Only the
; function label is checked here.
; NOTE(review): despite the "insertelement" in its name, this kernel performs
; an extractelement.
; GCN-LABEL: {{^}}v_insertelement_v4i16_dynamic_sgpr:
define amdgpu_kernel void @v_insertelement_v4i16_dynamic_sgpr(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %idx) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %vec.extract = extractelement <4 x i16> %vec, i32 %idx
  store i16 %vec.extract, i16 addrspace(1)* %out.gep
  ret void
}

; Only elements 0 and 1 of a wide vector load are used; the checks expect the
; load to be narrowed to a single scalar dword at offset 0x0, with no other
; loads emitted.
; NOTE(review): the name says v8i16 but the IR loads <16 x i16>.
; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_01:
; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
define amdgpu_kernel void @reduce_load_vector_v8i16_extract_01(<16 x i16> addrspace(4)* %ptr) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
  %elt0 = extractelement <16 x i16> %load, i32 0
  %elt1 = extractelement <16 x i16> %load, i32 1
  store volatile i16 %elt0, i16 addrspace(1)* undef, align 2
  store volatile i16 %elt1, i16 addrspace(1)* undef, align 2
  ret void
}

; Same as above for elements 2 and 3; the narrowed dword load is expected at
; an offset encoded as either 0x1 or 0x4.
; NOTE(review): the name says v8i16 but the IR loads <16 x i16>.
; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_23:
; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}}
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(<16 x i16> addrspace(4)* %ptr) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
  %elt2 = extractelement <16 x i16> %load, i32 2
  %elt3 = extractelement <16 x i16> %load, i32 3
  store volatile i16 %elt2, i16 addrspace(1)* undef, align 2
  store volatile i16 %elt3, i16 addrspace(1)* undef, align 2
  ret void
}

; Constant index 2 of a per-workitem <8 x i16> load; the checks expect a
; single dword load (byte offset 4 where the offset is checked) feeding the
; short store directly.
; GCN-LABEL: {{^}}v_extractelement_v8i16_2:
; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:4
; SI: buffer_store_short [[RES]]
; VI: flat_load_dword [[RES:v[0-9]+]]
; VI: flat_store_short v[{{[0-9:]+}}], [[RES]]
; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:4
; GFX9: global_store_short v{{[0-9]+}}, [[RES]]
define amdgpu_kernel void @v_extractelement_v8i16_2(i16 addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep
  %vec.extract = extractelement <8 x i16> %vec, i32 2
  store i16 %vec.extract, i16 addrspace(1)* %out.gep
  ret void
}

; Constant index 6 of a per-workitem <8 x i16> load; as above, with byte
; offset 12 where the offset is checked.
; GCN-LABEL: {{^}}v_extractelement_v8i16_6:
; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:12
; SI: buffer_store_short [[RES]]
; VI: flat_load_dword [[RES:v[0-9]+]]
; VI: flat_store_short v[{{[0-9:]+}}], [[RES]]
; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:12
; GFX9: global_store_short v{{[0-9]+}}, [[RES]]
define amdgpu_kernel void @v_extractelement_v8i16_6(i16 addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep
  %vec.extract = extractelement <8 x i16> %vec, i32 6
  store i16 %vec.extract, i16 addrspace(1)* %out.gep
  ret void
}

; Dynamic index (kernel argument %n) into a per-workitem <8 x i16> load; the
; checks expect seven v_cndmask_b32_e32 instructions.
; GCN-LABEL: {{^}}v_extractelement_v8i16_dynamic_sgpr:
; GCN-COUNT-7: v_cndmask_b32_e32
define amdgpu_kernel void @v_extractelement_v8i16_dynamic_sgpr(i16 addrspace(1)* %out, <8 x i16> addrspace(1)* %in, i32 %n) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep
  %vec.extract = extractelement <8 x i16> %vec, i32 %n
  store i16 %vec.extract, i16 addrspace(1)* %out.gep
  ret void
}

; Constant index 2 of a per-workitem <16 x i16> load; same expectations as the
; <8 x i16> variant with index 2.
; GCN-LABEL: {{^}}v_extractelement_v16i16_2:
; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:4
; SI: buffer_store_short [[RES]]
; VI: flat_load_dword [[RES:v[0-9]+]]
; VI: flat_store_short v[{{[0-9:]+}}], [[RES]]
; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:4
; GFX9: global_store_short v{{[0-9]+}}, [[RES]]
define amdgpu_kernel void @v_extractelement_v16i16_2(i16 addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep
  %vec.extract = extractelement <16 x i16> %vec, i32 2
  store i16 %vec.extract, i16 addrspace(1)* %out.gep
  ret void
}

; Constant index 6 of a per-workitem <16 x i16> load; same expectations as the
; <8 x i16> variant with index 6.
; GCN-LABEL: {{^}}v_extractelement_v16i16_6:
; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:12
; SI: buffer_store_short [[RES]]
; VI: flat_load_dword [[RES:v[0-9]+]]
; VI: flat_store_short v[{{[0-9:]+}}], [[RES]]
; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:12
; GFX9: global_store_short v{{[0-9]+}}, [[RES]]
define amdgpu_kernel void @v_extractelement_v16i16_6(i16 addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep
  %vec.extract = extractelement <16 x i16> %vec, i32 6
  store i16 %vec.extract, i16 addrspace(1)* %out.gep
  ret void
}

; Dynamic index (kernel argument %n) into a per-workitem <16 x i16> load; the
; checks expect fifteen v_cndmask_b32_e32 instructions.
; GCN-LABEL: {{^}}v_extractelement_v16i16_dynamic_sgpr:
; GCN-COUNT-15: v_cndmask_b32_e32
define amdgpu_kernel void @v_extractelement_v16i16_dynamic_sgpr(i16 addrspace(1)* %out, <16 x i16> addrspace(1)* %in, i32 %n) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep
  %vec.extract = extractelement <16 x i16> %vec, i32 %n
  store i16 %vec.extract, i16 addrspace(1)* %out.gep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }