; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s

; Checks which DS and FLAT memory instructions are selected for 2-, 3- and
; 4-element i32 vector loads and stores at different alignments.
;
; Every kernel follows the same shape: it forms an address from %arg indexed by
; workitem.id.x, loads a vector from it, permutes the elements, and stores the
; result back to the same address with the same alignment.
;
; Check prefixes, as wired up by the run lines above:
;   GCN        - every configuration
;   SPLIT      - configurations without +cumode
;   VECT       - configurations with +cumode
;   ALIGNED    - configurations without +unaligned-access-mode
;   UNALIGNED  - the configuration with +unaligned-access-mode

; <2 x i32> in addrspace(3) with align 4: a read2/write2 b32 pair is expected
; in every configuration.
; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
  %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 4
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 4
  ret void
}

; <4 x i32> in addrspace(3) with align 4: two read2 and two write2 b32
; instructions are expected in every configuration.
; GCN-LABEL: test_local_misaligned_v4:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write2_b32
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 4
  ret void
}

; <3 x i32> in addrspace(3) with align 4: one read2/write2 b32 pair plus one
; single b32 read/write is expected in every configuration.
; GCN-LABEL: test_local_misaligned_v3:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write_b32
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 4
  ret void
}

; <2 x i32> through a default-address-space pointer with align 4: a single
; dwordx2 access is expected with +cumode, two dword accesses without it.
; GCN-LABEL: test_flat_misaligned_v2:
; VECT-DAG:  flat_load_dwordx2 v
; VECT-DAG:  flat_store_dwordx2 v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v2(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <2 x i32>*
  %load = load <2 x i32>, <2 x i32>* %ptr, align 4
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32>* %ptr, align 4
  ret void
}

; <4 x i32> through a default-address-space pointer with align 4: a single
; dwordx4 access is expected with +cumode, four dword accesses without it.
; GCN-LABEL: test_flat_misaligned_v4:
; VECT-DAG:  flat_load_dwordx4 v
; VECT-DAG:  flat_store_dwordx4 v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v4(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  %load = load <4 x i32>, <4 x i32>* %ptr, align 4
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 4
  ret void
}

; TODO: Reinstate the checks below (currently disabled by the leading 'x' on
; each prefix) once v3i32/v3f32 is reinstated. Until then only the label is
; verified for this kernel.

; GCN-LABEL: test_flat_misaligned_v3:
; xVECT-DAG:  flat_load_dwordx3 v
; xVECT-DAG:  flat_store_dwordx3 v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_store_dword v
; xSPLIT-DAG: flat_store_dword v
; xSPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v3(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <3 x i32>*
  %load = load <3 x i32>, <3 x i32>* %ptr, align 4
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32>* %ptr, align 4
  ret void
}

; <2 x i32> in addrspace(3) with align 8: a single b64 read/write is expected
; in every configuration.
; GCN-LABEL: test_local_aligned_v2:
; GCN-DAG: ds_read_b64
; GCN-DAG: ds_write_b64
define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
  %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 8
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 8
  ret void
}

; <3 x i32> in addrspace(3) with align 16: a single b96 read/write is expected
; in every configuration.
; GCN-LABEL: test_local_aligned_v3:
; GCN-DAG: ds_read_b96
; GCN-DAG: ds_write_b96
define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 16
  ret void
}

; <2 x i32> through a default-address-space pointer with align 8: a single
; dwordx2 access is expected in every configuration.
; GCN-LABEL: test_flat_aligned_v2:
; GCN-DAG: flat_load_dwordx2 v
; GCN-DAG: flat_store_dwordx2 v
define amdgpu_kernel void @test_flat_aligned_v2(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <2 x i32>*
  %load = load <2 x i32>, <2 x i32>* %ptr, align 8
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32>* %ptr, align 8
  ret void
}

; <4 x i32> through a default-address-space pointer with align 16: a single
; dwordx4 access is expected in every configuration.
; GCN-LABEL: test_flat_aligned_v4:
; GCN-DAG: flat_load_dwordx4 v
; GCN-DAG: flat_store_dwordx4 v
define amdgpu_kernel void @test_flat_aligned_v4(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  %load = load <4 x i32>, <4 x i32>* %ptr, align 16
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 16
  ret void
}

; <4 x i32> in addrspace(3) with align 8: a read2/write2 b64 pair is expected.
; The two access-mode prefixes currently carry identical expectations; they are
; kept separate so the expectations can diverge independently.
; GCN-LABEL: test_local_v4_aligned8:
; ALIGNED-DAG:   ds_read2_b64
; ALIGNED-DAG:   ds_write2_b64
; UNALIGNED-DAG: ds_read2_b64
; UNALIGNED-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 8
  ret void
}

; <4 x i32> through a default-address-space pointer with align 8: a single
; dwordx4 access is expected with +cumode, two dwordx2 accesses without it.
; GCN-LABEL: test_flat_v4_aligned8:
; VECT-DAG:  flat_load_dwordx4 v
; VECT-DAG:  flat_store_dwordx4 v
; SPLIT-DAG: flat_load_dwordx2 v
; SPLIT-DAG: flat_load_dwordx2 v
; SPLIT-DAG: flat_store_dwordx2 v
; SPLIT-DAG: flat_store_dwordx2 v
define amdgpu_kernel void @test_flat_v4_aligned8(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  %load = load <4 x i32>, <4 x i32>* %ptr, align 8
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 8
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()