1; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
2; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
3; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
4; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s
6
7; GCN-LABEL: test_local_misaligned_v2:
8; GCN-DAG: ds_read2_b32
9; GCN-DAG: ds_write2_b32
; Kernel: load a <2 x i32> from local memory (addrspace 3) using only 4-byte
; alignment, swap its two elements, and store the result to the same address.
define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {
bb:
  ; Address of the i32 element selected by workitem.id.x, viewed as a vector.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elem.ptr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.ptr = bitcast i32 addrspace(3)* %elem.ptr to <2 x i32> addrspace(3)*
  %vec = load <2 x i32>, <2 x i32> addrspace(3)* %vec.ptr, align 4
  ; Rebuild the vector as <e1, e0>.
  %e0 = extractelement <2 x i32> %vec, i32 0
  %e1 = extractelement <2 x i32> %vec, i32 1
  %r0 = insertelement <2 x i32> undef, i32 %e1, i32 0
  %swapped = insertelement <2 x i32> %r0, i32 %e0, i32 1
  store <2 x i32> %swapped, <2 x i32> addrspace(3)* %vec.ptr, align 4
  ret void
}
23
24; GCN-LABEL: test_local_misaligned_v4:
25; ALIGNED-DAG: ds_read2_b32
26; ALIGNED-DAG: ds_read2_b32
27; ALIGNED-DAG: ds_write2_b32
28; ALIGNED-DAG: ds_write2_b32
29; UNALIGNED-DAG: ds_read2_b64
30; UNALIGNED-DAG: ds_write2_b64
; Kernel: load a <4 x i32> from local memory (addrspace 3) using only 4-byte
; alignment, reverse its element order, and store it to the same address.
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
  ; Address of the i32 element selected by workitem.id.x, viewed as a vector.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elem.ptr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.ptr = bitcast i32 addrspace(3)* %elem.ptr to <4 x i32> addrspace(3)*
  %vec = load <4 x i32>, <4 x i32> addrspace(3)* %vec.ptr, align 4
  ; Rebuild the vector as <e3, e2, e1, e0>.
  %e0 = extractelement <4 x i32> %vec, i32 0
  %e1 = extractelement <4 x i32> %vec, i32 1
  %e2 = extractelement <4 x i32> %vec, i32 2
  %e3 = extractelement <4 x i32> %vec, i32 3
  %r0 = insertelement <4 x i32> undef, i32 %e3, i32 0
  %r1 = insertelement <4 x i32> %r0, i32 %e2, i32 1
  %r2 = insertelement <4 x i32> %r1, i32 %e1, i32 2
  %reversed = insertelement <4 x i32> %r2, i32 %e0, i32 3
  store <4 x i32> %reversed, <4 x i32> addrspace(3)* %vec.ptr, align 4
  ret void
}
48
49; GCN-LABEL: test_local_misaligned_v3:
50; ALIGNED-DAG: ds_read2_b32
51; ALIGNED-DAG: ds_read_b32
52; ALIGNED-DAG: ds_write2_b32
53; ALIGNED-DAG: ds_write_b32
54; UNALIGNED-DAG: ds_read_b96
55; UNALIGNED-DAG: ds_write_b96
; Kernel: load a <3 x i32> from local memory (addrspace 3) using only 4-byte
; alignment, rotate its elements, and store the result to the same address.
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
bb:
  ; Address of the i32 element selected by workitem.id.x, viewed as a vector.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elem.ptr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.ptr = bitcast i32 addrspace(3)* %elem.ptr to <3 x i32> addrspace(3)*
  %vec = load <3 x i32>, <3 x i32> addrspace(3)* %vec.ptr, align 4
  ; Rebuild the vector as <e2, e0, e1>.
  %e0 = extractelement <3 x i32> %vec, i32 0
  %e1 = extractelement <3 x i32> %vec, i32 1
  %e2 = extractelement <3 x i32> %vec, i32 2
  %r0 = insertelement <3 x i32> undef, i32 %e2, i32 0
  %r1 = insertelement <3 x i32> %r0, i32 %e0, i32 1
  %rotated = insertelement <3 x i32> %r1, i32 %e1, i32 2
  store <3 x i32> %rotated, <3 x i32> addrspace(3)* %vec.ptr, align 4
  ret void
}
71
72; GCN-LABEL: test_flat_misaligned_v2:
73; VECT-DAG:  flat_load_dwordx2 v
74; VECT-DAG:  flat_store_dwordx2 v
75; SPLIT-DAG: flat_load_dword v
76; SPLIT-DAG: flat_load_dword v
77; SPLIT-DAG: flat_store_dword v
78; SPLIT-DAG: flat_store_dword v
; Kernel: load a <2 x i32> through a default-address-space (flat) pointer
; using only 4-byte alignment, swap its two elements, and store it back.
define amdgpu_kernel void @test_flat_misaligned_v2(i32* %arg) {
bb:
  ; Address of the i32 element selected by workitem.id.x, viewed as a vector.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elem.ptr = getelementptr inbounds i32, i32* %arg, i32 %tid
  %vec.ptr = bitcast i32* %elem.ptr to <2 x i32>*
  %vec = load <2 x i32>, <2 x i32>* %vec.ptr, align 4
  ; Rebuild the vector as <e1, e0>.
  %e0 = extractelement <2 x i32> %vec, i32 0
  %e1 = extractelement <2 x i32> %vec, i32 1
  %r0 = insertelement <2 x i32> undef, i32 %e1, i32 0
  %swapped = insertelement <2 x i32> %r0, i32 %e0, i32 1
  store <2 x i32> %swapped, <2 x i32>* %vec.ptr, align 4
  ret void
}
92
93; GCN-LABEL: test_flat_misaligned_v4:
94; VECT-DAG:  flat_load_dwordx4 v
95; VECT-DAG:  flat_store_dwordx4 v
96; SPLIT-DAG: flat_load_dword v
97; SPLIT-DAG: flat_load_dword v
98; SPLIT-DAG: flat_load_dword v
99; SPLIT-DAG: flat_load_dword v
100; SPLIT-DAG: flat_store_dword v
101; SPLIT-DAG: flat_store_dword v
102; SPLIT-DAG: flat_store_dword v
103; SPLIT-DAG: flat_store_dword v
; Kernel: load a <4 x i32> through a default-address-space (flat) pointer
; using only 4-byte alignment, reverse its element order, and store it back.
define amdgpu_kernel void @test_flat_misaligned_v4(i32* %arg) {
bb:
  ; Address of the i32 element selected by workitem.id.x, viewed as a vector.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elem.ptr = getelementptr inbounds i32, i32* %arg, i32 %tid
  %vec.ptr = bitcast i32* %elem.ptr to <4 x i32>*
  %vec = load <4 x i32>, <4 x i32>* %vec.ptr, align 4
  ; Rebuild the vector as <e3, e2, e1, e0>.
  %e0 = extractelement <4 x i32> %vec, i32 0
  %e1 = extractelement <4 x i32> %vec, i32 1
  %e2 = extractelement <4 x i32> %vec, i32 2
  %e3 = extractelement <4 x i32> %vec, i32 3
  %r0 = insertelement <4 x i32> undef, i32 %e3, i32 0
  %r1 = insertelement <4 x i32> %r0, i32 %e2, i32 1
  %r2 = insertelement <4 x i32> %r1, i32 %e1, i32 2
  %reversed = insertelement <4 x i32> %r2, i32 %e0, i32 3
  store <4 x i32> %reversed, <4 x i32>* %vec.ptr, align 4
  ret void
}
121
; TODO: Re-enable the checks for the test below (they are currently disabled by the leading 'x' on their check prefixes) once v3i32/v3f32 is reinstated.
123
124; GCN-LABEL: test_flat_misaligned_v3:
125; xVECT-DAG:  flat_load_dwordx3 v
126; xVECT-DAG:  flat_store_dwordx3 v
127; xSPLIT-DAG: flat_load_dword v
128; xSPLIT-DAG: flat_load_dword v
129; xSPLIT-DAG: flat_load_dword v
130; xSPLIT-DAG: flat_store_dword v
131; xSPLIT-DAG: flat_store_dword v
132; xSPLIT-DAG: flat_store_dword v
; Kernel: load a <3 x i32> through a default-address-space (flat) pointer
; using only 4-byte alignment, rotate its elements, and store it back.
define amdgpu_kernel void @test_flat_misaligned_v3(i32* %arg) {
bb:
  ; Address of the i32 element selected by workitem.id.x, viewed as a vector.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elem.ptr = getelementptr inbounds i32, i32* %arg, i32 %tid
  %vec.ptr = bitcast i32* %elem.ptr to <3 x i32>*
  %vec = load <3 x i32>, <3 x i32>* %vec.ptr, align 4
  ; Rebuild the vector as <e2, e0, e1>.
  %e0 = extractelement <3 x i32> %vec, i32 0
  %e1 = extractelement <3 x i32> %vec, i32 1
  %e2 = extractelement <3 x i32> %vec, i32 2
  %r0 = insertelement <3 x i32> undef, i32 %e2, i32 0
  %r1 = insertelement <3 x i32> %r0, i32 %e0, i32 1
  %rotated = insertelement <3 x i32> %r1, i32 %e1, i32 2
  store <3 x i32> %rotated, <3 x i32>* %vec.ptr, align 4
  ret void
}
148
149; GCN-LABEL: test_local_aligned_v2:
150; GCN-DAG: ds_read_b64
151; GCN-DAG: ds_write_b64
; Kernel: load a <2 x i32> from local memory (addrspace 3) with its full
; 8-byte alignment, swap its two elements, and store it to the same address.
define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {
bb:
  ; Address of the i32 element selected by workitem.id.x, viewed as a vector.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elem.ptr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.ptr = bitcast i32 addrspace(3)* %elem.ptr to <2 x i32> addrspace(3)*
  %vec = load <2 x i32>, <2 x i32> addrspace(3)* %vec.ptr, align 8
  ; Rebuild the vector as <e1, e0>.
  %e0 = extractelement <2 x i32> %vec, i32 0
  %e1 = extractelement <2 x i32> %vec, i32 1
  %r0 = insertelement <2 x i32> undef, i32 %e1, i32 0
  %swapped = insertelement <2 x i32> %r0, i32 %e0, i32 1
  store <2 x i32> %swapped, <2 x i32> addrspace(3)* %vec.ptr, align 8
  ret void
}
165
166; GCN-LABEL: test_local_aligned_v3:
167; GCN-DAG: ds_read_b96
168; GCN-DAG: ds_write_b96
; Kernel: load a <3 x i32> from local memory (addrspace 3) with 16-byte
; alignment, rotate its elements, and store the result to the same address.
define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {
bb:
  ; Address of the i32 element selected by workitem.id.x, viewed as a vector.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elem.ptr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.ptr = bitcast i32 addrspace(3)* %elem.ptr to <3 x i32> addrspace(3)*
  %vec = load <3 x i32>, <3 x i32> addrspace(3)* %vec.ptr, align 16
  ; Rebuild the vector as <e2, e0, e1>.
  %e0 = extractelement <3 x i32> %vec, i32 0
  %e1 = extractelement <3 x i32> %vec, i32 1
  %e2 = extractelement <3 x i32> %vec, i32 2
  %r0 = insertelement <3 x i32> undef, i32 %e2, i32 0
  %r1 = insertelement <3 x i32> %r0, i32 %e0, i32 1
  %rotated = insertelement <3 x i32> %r1, i32 %e1, i32 2
  store <3 x i32> %rotated, <3 x i32> addrspace(3)* %vec.ptr, align 16
  ret void
}
184
185; GCN-LABEL: test_flat_aligned_v2:
186; GCN-DAG: flat_load_dwordx2 v
187; GCN-DAG: flat_store_dwordx2 v
; Kernel: load a <2 x i32> through a default-address-space (flat) pointer
; with its full 8-byte alignment, swap its two elements, and store it back.
define amdgpu_kernel void @test_flat_aligned_v2(i32* %arg) {
bb:
  ; Address of the i32 element selected by workitem.id.x, viewed as a vector.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elem.ptr = getelementptr inbounds i32, i32* %arg, i32 %tid
  %vec.ptr = bitcast i32* %elem.ptr to <2 x i32>*
  %vec = load <2 x i32>, <2 x i32>* %vec.ptr, align 8
  ; Rebuild the vector as <e1, e0>.
  %e0 = extractelement <2 x i32> %vec, i32 0
  %e1 = extractelement <2 x i32> %vec, i32 1
  %r0 = insertelement <2 x i32> undef, i32 %e1, i32 0
  %swapped = insertelement <2 x i32> %r0, i32 %e0, i32 1
  store <2 x i32> %swapped, <2 x i32>* %vec.ptr, align 8
  ret void
}
201
202; GCN-LABEL: test_flat_aligned_v4:
203; GCN-DAG: flat_load_dwordx4 v
204; GCN-DAG: flat_store_dwordx4 v
; Kernel: load a <4 x i32> through a default-address-space (flat) pointer
; with its full 16-byte alignment, reverse its elements, and store it back.
define amdgpu_kernel void @test_flat_aligned_v4(i32* %arg) {
bb:
  ; Address of the i32 element selected by workitem.id.x, viewed as a vector.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elem.ptr = getelementptr inbounds i32, i32* %arg, i32 %tid
  %vec.ptr = bitcast i32* %elem.ptr to <4 x i32>*
  %vec = load <4 x i32>, <4 x i32>* %vec.ptr, align 16
  ; Rebuild the vector as <e3, e2, e1, e0>.
  %e0 = extractelement <4 x i32> %vec, i32 0
  %e1 = extractelement <4 x i32> %vec, i32 1
  %e2 = extractelement <4 x i32> %vec, i32 2
  %e3 = extractelement <4 x i32> %vec, i32 3
  %r0 = insertelement <4 x i32> undef, i32 %e3, i32 0
  %r1 = insertelement <4 x i32> %r0, i32 %e2, i32 1
  %r2 = insertelement <4 x i32> %r1, i32 %e1, i32 2
  %reversed = insertelement <4 x i32> %r2, i32 %e0, i32 3
  store <4 x i32> %reversed, <4 x i32>* %vec.ptr, align 16
  ret void
}
222
223; GCN-LABEL: test_local_v4_aligned8:
224; ALIGNED-DAG: ds_read2_b64
225; ALIGNED-DAG: ds_write2_b64
226; UNALIGNED-DAG: ds_read2_b64
227; UNALIGNED-DAG: ds_write2_b64
; Kernel: load a <4 x i32> from local memory (addrspace 3) with 8-byte
; alignment (half the vector size), reverse its elements, and store it back.
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
bb:
  ; Address of the i32 element selected by workitem.id.x, viewed as a vector.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elem.ptr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.ptr = bitcast i32 addrspace(3)* %elem.ptr to <4 x i32> addrspace(3)*
  %vec = load <4 x i32>, <4 x i32> addrspace(3)* %vec.ptr, align 8
  ; Rebuild the vector as <e3, e2, e1, e0>.
  %e0 = extractelement <4 x i32> %vec, i32 0
  %e1 = extractelement <4 x i32> %vec, i32 1
  %e2 = extractelement <4 x i32> %vec, i32 2
  %e3 = extractelement <4 x i32> %vec, i32 3
  %r0 = insertelement <4 x i32> undef, i32 %e3, i32 0
  %r1 = insertelement <4 x i32> %r0, i32 %e2, i32 1
  %r2 = insertelement <4 x i32> %r1, i32 %e1, i32 2
  %reversed = insertelement <4 x i32> %r2, i32 %e0, i32 3
  store <4 x i32> %reversed, <4 x i32> addrspace(3)* %vec.ptr, align 8
  ret void
}
245
246; GCN-LABEL: test_flat_v4_aligned8:
247; VECT-DAG:  flat_load_dwordx4 v
248; VECT-DAG:  flat_store_dwordx4 v
249; SPLIT-DAG: flat_load_dwordx2 v
250; SPLIT-DAG: flat_load_dwordx2 v
251; SPLIT-DAG: flat_store_dwordx2 v
252; SPLIT-DAG: flat_store_dwordx2 v
; Kernel: load a <4 x i32> through a default-address-space (flat) pointer
; with 8-byte alignment (half the vector size), reverse it, and store it back.
define amdgpu_kernel void @test_flat_v4_aligned8(i32* %arg) {
bb:
  ; Address of the i32 element selected by workitem.id.x, viewed as a vector.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elem.ptr = getelementptr inbounds i32, i32* %arg, i32 %tid
  %vec.ptr = bitcast i32* %elem.ptr to <4 x i32>*
  %vec = load <4 x i32>, <4 x i32>* %vec.ptr, align 8
  ; Rebuild the vector as <e3, e2, e1, e0>.
  %e0 = extractelement <4 x i32> %vec, i32 0
  %e1 = extractelement <4 x i32> %vec, i32 1
  %e2 = extractelement <4 x i32> %vec, i32 2
  %e3 = extractelement <4 x i32> %vec, i32 3
  %r0 = insertelement <4 x i32> undef, i32 %e3, i32 0
  %r1 = insertelement <4 x i32> %r0, i32 %e2, i32 1
  %r2 = insertelement <4 x i32> %r1, i32 %e1, i32 2
  %reversed = insertelement <4 x i32> %r2, i32 %e0, i32 3
  store <4 x i32> %reversed, <4 x i32>* %vec.ptr, align 8
  ret void
}
270
271declare i32 @llvm.amdgcn.workitem.id.x()
272