; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s
6
; <2 x i32> load and store through an addrspace(3) pointer that is only
; 4-byte aligned. The two lanes are swapped and written back to the same
; address. The expectations use the common GCN prefix, so every RUN
; configuration must emit the paired 32-bit DS read and write.
; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {
bb:
  ; Element address: %arg advanced by workitem.id.x i32 elements.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
  ; 8-byte vector access declared with align 4 only.
  %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 4
  ; Rebuild the vector with lanes reversed: {e1, e0}.
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 4
  ret void
}
23
; <4 x i32> load and store through an addrspace(3) pointer with align 4.
; The four lanes are reversed and written back in place. Under the common
; GCN prefix every RUN configuration must emit two paired 32-bit DS reads
; and two paired 32-bit DS writes.
; GCN-LABEL: test_local_misaligned_v4:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write2_b32
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
  ; Element address: %arg advanced by workitem.id.x i32 elements.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
  ; 16-byte vector access declared with align 4 only.
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
  ; Rebuild the vector with lanes reversed: {e3, e2, e1, e0}.
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 4
  ret void
}
46
; <3 x i32> load and store through an addrspace(3) pointer with align 4.
; The lanes are rotated ({e2, e0, e1}) and written back in place. Under
; the common GCN prefix every RUN configuration must emit one paired plus
; one single 32-bit DS access, for both the read and the write.
; GCN-LABEL: test_local_misaligned_v3:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write_b32
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
bb:
  ; Element address: %arg advanced by workitem.id.x i32 elements.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
  ; 12-byte vector access declared with align 4 only.
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
  ; Rebuild the vector as {e2, e0, e1}.
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 4
  ret void
}
67
; <2 x i32> load and store through a generic (no addrspace) pointer with
; align 4. The two lanes are swapped and written back in place.
; Expectations depend on the RUN configuration:
;   - VECT prefix (RUN lines with +cumode): one dwordx2 load and store.
;   - SPLIT prefix (RUN lines without +cumode): two dword loads and two
;     dword stores.
; GCN-LABEL: test_flat_misaligned_v2:
; VECT-DAG:  flat_load_dwordx2 v
; VECT-DAG:  flat_store_dwordx2 v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v2(i32* %arg) {
bb:
  ; Element address: %arg advanced by workitem.id.x i32 elements.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <2 x i32>*
  ; 8-byte vector access declared with align 4 only.
  %load = load <2 x i32>, <2 x i32>* %ptr, align 4
  ; Rebuild the vector with lanes reversed: {e1, e0}.
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32>* %ptr, align 4
  ret void
}
88
; <4 x i32> load and store through a generic (no addrspace) pointer with
; align 4. The four lanes are reversed and written back in place.
; Expectations depend on the RUN configuration:
;   - VECT prefix (RUN lines with +cumode): one dwordx4 load and store.
;   - SPLIT prefix (RUN lines without +cumode): four dword loads and four
;     dword stores.
; GCN-LABEL: test_flat_misaligned_v4:
; VECT-DAG:  flat_load_dwordx4 v
; VECT-DAG:  flat_store_dwordx4 v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v4(i32* %arg) {
bb:
  ; Element address: %arg advanced by workitem.id.x i32 elements.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  ; 16-byte vector access declared with align 4 only.
  %load = load <4 x i32>, <4 x i32>* %ptr, align 4
  ; Rebuild the vector with lanes reversed: {e3, e2, e1, e0}.
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 4
  ret void
}
117
; <3 x i32> load and store through a generic (no addrspace) pointer with
; align 4. The lanes are rotated ({e2, e0, e1}) and written back in place.
; Only the label is checked at present. The instruction expectations
; below carry an "x" in front of their prefix, which matches none of the
; prefixes passed to FileCheck, so they appear to be deliberately
; disabled per the TODO.
; TODO: Reinstate the test below once v3i32/v3f32 is reinstated.

; GCN-LABEL: test_flat_misaligned_v3:
; xVECT-DAG:  flat_load_dwordx3 v
; xVECT-DAG:  flat_store_dwordx3 v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_store_dword v
; xSPLIT-DAG: flat_store_dword v
; xSPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v3(i32* %arg) {
bb:
  ; Element address: %arg advanced by workitem.id.x i32 elements.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <3 x i32>*
  ; 12-byte vector access declared with align 4 only.
  %load = load <3 x i32>, <3 x i32>* %ptr, align 4
  ; Rebuild the vector as {e2, e0, e1}.
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32>* %ptr, align 4
  ret void
}
144
; Baseline for the misaligned case: <2 x i32> load and store through an
; addrspace(3) pointer with its natural align 8. The two lanes are
; swapped and written back in place. Under the common GCN prefix every
; RUN configuration must emit a single 64-bit DS read and write.
; GCN-LABEL: test_local_aligned_v2:
; GCN-DAG: ds_read_b64
; GCN-DAG: ds_write_b64
define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {
bb:
  ; Element address: %arg advanced by workitem.id.x i32 elements.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
  ; 8-byte vector access declared with align 8.
  %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 8
  ; Rebuild the vector with lanes reversed: {e1, e0}.
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 8
  ret void
}
161
; <3 x i32> load and store through an addrspace(3) pointer declared with
; align 16. The lanes are rotated ({e2, e0, e1}) and written back in
; place. Under the common GCN prefix every RUN configuration must emit a
; single 96-bit DS read and write.
; GCN-LABEL: test_local_aligned_v3:
; GCN-DAG: ds_read_b96
; GCN-DAG: ds_write_b96
define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {
bb:
  ; Element address: %arg advanced by workitem.id.x i32 elements.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
  ; 12-byte vector access declared with align 16.
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
  ; Rebuild the vector as {e2, e0, e1}.
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 16
  ret void
}
180
; Baseline for the misaligned case: <2 x i32> load and store through a
; generic (no addrspace) pointer with its natural align 8. The two lanes
; are swapped and written back in place. Under the common GCN prefix
; every RUN configuration must emit one dwordx2 load and one dwordx2
; store.
; GCN-LABEL: test_flat_aligned_v2:
; GCN-DAG: flat_load_dwordx2 v
; GCN-DAG: flat_store_dwordx2 v
define amdgpu_kernel void @test_flat_aligned_v2(i32* %arg) {
bb:
  ; Element address: %arg advanced by workitem.id.x i32 elements.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <2 x i32>*
  ; 8-byte vector access declared with align 8.
  %load = load <2 x i32>, <2 x i32>* %ptr, align 8
  ; Rebuild the vector with lanes reversed: {e1, e0}.
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32>* %ptr, align 8
  ret void
}
197
; Baseline for the misaligned case: <4 x i32> load and store through a
; generic (no addrspace) pointer with its natural align 16. The four
; lanes are reversed and written back in place. Under the common GCN
; prefix every RUN configuration must emit one dwordx4 load and one
; dwordx4 store.
; GCN-LABEL: test_flat_aligned_v4:
; GCN-DAG: flat_load_dwordx4 v
; GCN-DAG: flat_store_dwordx4 v
define amdgpu_kernel void @test_flat_aligned_v4(i32* %arg) {
bb:
  ; Element address: %arg advanced by workitem.id.x i32 elements.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  ; 16-byte vector access declared with align 16.
  %load = load <4 x i32>, <4 x i32>* %ptr, align 16
  ; Rebuild the vector with lanes reversed: {e3, e2, e1, e0}.
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 16
  ret void
}
218
; <4 x i32> load and store through an addrspace(3) pointer with align 8,
; i.e. half the vector's size. The four lanes are reversed and written
; back in place. The ALIGNED and UNALIGNED prefixes (selected by whether
; a RUN line passes +unaligned-access-mode) currently carry identical
; expectations: the paired 64-bit DS read and write. They are kept as
; separate prefixes rather than folded into the common one.
; GCN-LABEL: test_local_v4_aligned8:
; ALIGNED-DAG: ds_read2_b64
; ALIGNED-DAG: ds_write2_b64
; UNALIGNED-DAG: ds_read2_b64
; UNALIGNED-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
bb:
  ; Element address: %arg advanced by workitem.id.x i32 elements.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
  ; 16-byte vector access declared with align 8.
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
  ; Rebuild the vector with lanes reversed: {e3, e2, e1, e0}.
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 8
  ret void
}
241
; <4 x i32> load and store through a generic (no addrspace) pointer with
; align 8, i.e. half the vector's size. The four lanes are reversed and
; written back in place. Expectations depend on the RUN configuration:
;   - VECT prefix (RUN lines with +cumode): one dwordx4 load and store.
;   - SPLIT prefix (RUN lines without +cumode): two dwordx2 loads and two
;     dwordx2 stores.
; GCN-LABEL: test_flat_v4_aligned8:
; VECT-DAG:  flat_load_dwordx4 v
; VECT-DAG:  flat_store_dwordx4 v
; SPLIT-DAG: flat_load_dwordx2 v
; SPLIT-DAG: flat_load_dwordx2 v
; SPLIT-DAG: flat_store_dwordx2 v
; SPLIT-DAG: flat_store_dwordx2 v
define amdgpu_kernel void @test_flat_v4_aligned8(i32* %arg) {
bb:
  ; Element address: %arg advanced by workitem.id.x i32 elements.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  ; 16-byte vector access declared with align 8.
  %load = load <4 x i32>, <4 x i32>* %ptr, align 8
  ; Rebuild the vector with lanes reversed: {e3, e2, e1, e0}.
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 8
  ret void
}
266
; Intrinsic called by every kernel in this file to obtain the per-workitem
; index used in the address computation.
declare i32 @llvm.amdgcn.workitem.id.x()
268