; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s

; Checks which instructions are selected for <2/3/4 x i32> loads and stores
; through addrspace(3) pointers ("local" in the test names) and through
; default-address-space pointers ("flat" in the test names), for the alignment
; stated on the IR load/store.
;
; Check prefixes, as wired up by the llc invocations above
;   GCN        every invocation
;   ALIGNED    the invocations without +unaligned-access-mode
;   UNALIGNED  the invocation with +unaligned-access-mode
;   SPLIT      the invocations without +cumode
;   VECT       the invocations with +cumode
;
; Every kernel follows the same shape - index the pointer argument by the
; workitem id, load a vector, permute its elements, store it back to the same
; address.

; <2 x i32> local access with align 4; the two elements are swapped.
; All configurations expect a single ds_read2_b32 / ds_write2_b32 pair.
; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
  %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 4
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 4
  ret void
}

; <4 x i32> local access with align 4; the element order is reversed.
; Without unaligned access mode two ds_read2_b32 and two ds_write2_b32 are
; expected; with it, one ds_read2_b64 and one ds_write2_b64.
; GCN-LABEL: test_local_misaligned_v4:
; ALIGNED-DAG: ds_read2_b32
; ALIGNED-DAG: ds_read2_b32
; ALIGNED-DAG: ds_write2_b32
; ALIGNED-DAG: ds_write2_b32
; UNALIGNED-DAG: ds_read2_b64
; UNALIGNED-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 4
  ret void
}

; <3 x i32> local access with align 4; elements are stored back in the order
; (2, 0, 1). Without unaligned access mode a ds_read2_b32 + ds_read_b32 and a
; ds_write2_b32 + ds_write_b32 are expected; with it, single b96 accesses.
; GCN-LABEL: test_local_misaligned_v3:
; ALIGNED-DAG: ds_read2_b32
; ALIGNED-DAG: ds_read_b32
; ALIGNED-DAG: ds_write2_b32
; ALIGNED-DAG: ds_write_b32
; UNALIGNED-DAG: ds_read_b96
; UNALIGNED-DAG: ds_write_b96
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 4
  ret void
}

; <2 x i32> flat access with align 4; the two elements are swapped.
; With +cumode a single dwordx2 load/store is expected; without it the access
; is expected to be split into two dword loads and two dword stores.
; GCN-LABEL: test_flat_misaligned_v2:
; VECT-DAG: flat_load_dwordx2 v
; VECT-DAG: flat_store_dwordx2 v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v2(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <2 x i32>*
  %load = load <2 x i32>, <2 x i32>* %ptr, align 4
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32>* %ptr, align 4
  ret void
}

; <4 x i32> flat access with align 4; the element order is reversed.
; With +cumode a single dwordx4 load/store is expected; without it, four dword
; loads and four dword stores.
; GCN-LABEL: test_flat_misaligned_v4:
; VECT-DAG: flat_load_dwordx4 v
; VECT-DAG: flat_store_dwordx4 v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v4(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  %load = load <4 x i32>, <4 x i32>* %ptr, align 4
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 4
  ret void
}

; TODO: Reinstate the test below once v3i32/v3f32 is reinstated.

; <3 x i32> flat access with align 4; elements are stored back in the order
; (2, 0, 1). The instruction checks below carry an "x" in front of the prefix
; name, which matches none of the prefixes passed to FileCheck, so only the
; label check is currently active for this kernel.
; GCN-LABEL: test_flat_misaligned_v3:
; xVECT-DAG: flat_load_dwordx3 v
; xVECT-DAG: flat_store_dwordx3 v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_store_dword v
; xSPLIT-DAG: flat_store_dword v
; xSPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v3(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <3 x i32>*
  %load = load <3 x i32>, <3 x i32>* %ptr, align 4
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32>* %ptr, align 4
  ret void
}

; <2 x i32> local access with align 8; the two elements are swapped.
; All configurations expect single b64 accesses.
; GCN-LABEL: test_local_aligned_v2:
; GCN-DAG: ds_read_b64
; GCN-DAG: ds_write_b64
define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
  %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 8
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 8
  ret void
}

; <3 x i32> local access with align 16; elements are stored back in the order
; (2, 0, 1). All configurations expect single b96 accesses.
; GCN-LABEL: test_local_aligned_v3:
; GCN-DAG: ds_read_b96
; GCN-DAG: ds_write_b96
define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 16
  ret void
}

; <2 x i32> flat access with align 8; the two elements are swapped.
; All configurations expect single dwordx2 accesses.
; GCN-LABEL: test_flat_aligned_v2:
; GCN-DAG: flat_load_dwordx2 v
; GCN-DAG: flat_store_dwordx2 v
define amdgpu_kernel void @test_flat_aligned_v2(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <2 x i32>*
  %load = load <2 x i32>, <2 x i32>* %ptr, align 8
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32>* %ptr, align 8
  ret void
}

; <4 x i32> flat access with align 16; the element order is reversed.
; All configurations expect single dwordx4 accesses.
; GCN-LABEL: test_flat_aligned_v4:
; GCN-DAG: flat_load_dwordx4 v
; GCN-DAG: flat_store_dwordx4 v
define amdgpu_kernel void @test_flat_aligned_v4(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  %load = load <4 x i32>, <4 x i32>* %ptr, align 16
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 16
  ret void
}

; <4 x i32> local access with align 8; the element order is reversed.
; Both the ALIGNED and the UNALIGNED prefix currently expect the same
; ds_read2_b64 / ds_write2_b64 pair.
; GCN-LABEL: test_local_v4_aligned8:
; ALIGNED-DAG: ds_read2_b64
; ALIGNED-DAG: ds_write2_b64
; UNALIGNED-DAG: ds_read2_b64
; UNALIGNED-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 8
  ret void
}

; <4 x i32> flat access with align 8; the element order is reversed.
; With +cumode a single dwordx4 load/store is expected; without it, two
; dwordx2 loads and two dwordx2 stores.
; GCN-LABEL: test_flat_v4_aligned8:
; VECT-DAG: flat_load_dwordx4 v
; VECT-DAG: flat_store_dwordx4 v
; SPLIT-DAG: flat_load_dwordx2 v
; SPLIT-DAG: flat_load_dwordx2 v
; SPLIT-DAG: flat_store_dwordx2 v
; SPLIT-DAG: flat_store_dwordx2 v
define amdgpu_kernel void @test_flat_v4_aligned8(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  %load = load <4 x i32>, <4 x i32>* %ptr, align 8
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 8
  ret void
}

; Intrinsic used by every kernel above to obtain the per-workitem index.
declare i32 @llvm.amdgcn.workitem.id.x()