; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s

; FIXME: We would still like to vectorize the memory operations for VI.

; Simple 3-pair chain with loads and stores
define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) #0 {
; GCN-LABEL: @test1_as_3_3_3_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half addrspace(3)* [[B:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
; GCN-NEXT:    [[TMP6:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP5]], <2 x half> addrspace(3)* [[TMP6]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

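; Same chain with mixed address spaces: %a in LDS (addrspace 3), %b and %c in
; the default address space.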
define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) #0 {
; GCN-LABEL: @test1_as_3_0_0(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half* [[B:%.*]] to <2 x half>*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half>* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
; GCN-NEXT:    [[TMP6:%.*]] = bitcast half* [[C:%.*]] to <2 x half>*
; GCN-NEXT:    store <2 x half> [[TMP5]], <2 x half>* [[TMP6]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half* %c, i64 1
  store half %mul5, half* %arrayidx5, align 2
  ret void
}

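; Same chain with the loads in the default address space and the stores to LDS.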
define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) #0 {
; GCN-LABEL: @test1_as_0_0_3_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half* [[A:%.*]] to <2 x half>*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half>* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half* [[B:%.*]] to <2 x half>*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half>* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
; GCN-NEXT:    [[TMP6:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP5]], <2 x half> addrspace(3)* [[TMP6]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half* %a, i64 1
  %i3 = load half, half* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

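; Two scalar llvm.fma.f16 calls should be combined into one llvm.fma.v2f16.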
define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) #0 {
; GCN-LABEL: @test1_fma_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half addrspace(3)* [[B:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP6:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP5]], align 2
; GCN-NEXT:    [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP4]], <2 x half> [[TMP6]])
; GCN-NEXT:    [[TMP8:%.*]] = bitcast half addrspace(3)* [[D:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP7]], <2 x half> addrspace(3)* [[TMP8]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

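; A uniform scalar operand is broadcast into both lanes with insertelements.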
define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) #0 {
; GCN-LABEL: @mul_scalar_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0
; GCN-NEXT:    [[TMP4:%.*]] = insertelement <2 x half> [[TMP3]], half [[SCALAR]], i32 1
; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
; GCN-NEXT:    [[TMP6:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP5]], <2 x half> addrspace(3)* [[TMP6]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %mul = fmul half %i0, %scalar
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %mul5 = fmul half %i3, %scalar
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

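; Two scalar llvm.fabs.f16 calls should be combined into one llvm.fabs.v2f16.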
define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) #0 {
; GCN-LABEL: @fabs_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]])
; GCN-NEXT:    [[TMP4:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP3]], <2 x half> addrspace(3)* [[TMP4]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %fabs0 = call half @llvm.fabs.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %fabs1 = call half @llvm.fabs.f16(half %i3)
  store half %fabs0, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %fabs1, half addrspace(3)* %arrayidx5, align 2
  ret void
}

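; fabs feeding fma: both intrinsics vectorize when the fabs is applied to the
; same operand position in each lane.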
define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) #0 {
; GCN-LABEL: @test1_fabs_fma_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half addrspace(3)* [[B:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP6:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP5]], align 2
; GCN-NEXT:    [[TMP7:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]])
; GCN-NEXT:    [[TMP8:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP7]], <2 x half> [[TMP4]], <2 x half> [[TMP6]])
; GCN-NEXT:    [[TMP9:%.*]] = bitcast half addrspace(3)* [[D:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP8]], <2 x half> addrspace(3)* [[TMP9]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i0.fabs = call half @llvm.fabs.f16(half %i0)

  %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %i3.fabs = call half @llvm.fabs.f16(half %i3)

  %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

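; Here the fabs applies to only one lane's %b operand, so that operand vector
; is built from scalars with insertelements while the %a and %c loads and the
; fma itself are still vectorized. The %b loads stay scalar.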
define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) #0 {
; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
; GCN-NEXT:    [[I1:%.*]] = load half, half addrspace(3)* [[B:%.*]], align 2
; GCN-NEXT:    [[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]])
; GCN-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds half, half addrspace(3)* [[B]], i64 1
; GCN-NEXT:    [[I4:%.*]] = load half, half addrspace(3)* [[ARRAYIDX4]], align 2
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0
; GCN-NEXT:    [[TMP6:%.*]] = insertelement <2 x half> [[TMP5]], half [[I4]], i32 1
; GCN-NEXT:    [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP6]], <2 x half> [[TMP4]])
; GCN-NEXT:    [[TMP8:%.*]] = bitcast half addrspace(3)* [[D:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP7]], <2 x half> addrspace(3)* [[TMP8]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i1.fabs = call half @llvm.fabs.f16(half %i1)

  %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

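; llvm.canonicalize is vectorized on GFX9 but stays scalar on VI, which lacks
; packed f16 instructions.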
define amdgpu_kernel void @canonicalize_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) #0 {
; GFX9-LABEL: @canonicalize_v2f16(
; GFX9-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GFX9-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GFX9-NEXT:    [[TMP3:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP2]])
; GFX9-NEXT:    [[TMP4:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GFX9-NEXT:    store <2 x half> [[TMP3]], <2 x half> addrspace(3)* [[TMP4]], align 2
; GFX9-NEXT:    ret void
;
; VI-LABEL: @canonicalize_v2f16(
; VI-NEXT:    [[I0:%.*]] = load half, half addrspace(3)* [[A:%.*]], align 2
; VI-NEXT:    [[CANONICALIZE0:%.*]] = call half @llvm.canonicalize.f16(half [[I0]])
; VI-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds half, half addrspace(3)* [[A]], i64 1
; VI-NEXT:    [[I3:%.*]] = load half, half addrspace(3)* [[ARRAYIDX3]], align 2
; VI-NEXT:    [[CANONICALIZE1:%.*]] = call half @llvm.canonicalize.f16(half [[I3]])
; VI-NEXT:    store half [[CANONICALIZE0]], half addrspace(3)* [[C:%.*]], align 2
; VI-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds half, half addrspace(3)* [[C]], i64 1
; VI-NEXT:    store half [[CANONICALIZE1]], half addrspace(3)* [[ARRAYIDX5]], align 2
; VI-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %canonicalize0 = call half @llvm.canonicalize.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %canonicalize1 = call half @llvm.canonicalize.f16(half %i3)
  store half %canonicalize0, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %canonicalize1, half addrspace(3)* %arrayidx5, align 2
  ret void
}

declare half @llvm.fabs.f16(half) #1
declare half @llvm.fma.f16(half, half, half) #1
declare half @llvm.canonicalize.f16(half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }