; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI,GFX89 %s

; FIXME: Should still be able to vectorize the memory operations for VI

; Simple 3-pair chain with loads and stores
; GCN-LABEL: @test1_as_3_3_3_v2f16(
; GFX89: load <2 x half>, <2 x half> addrspace(3)*
; GFX89: load <2 x half>, <2 x half> addrspace(3)*
; GFX89: fmul <2 x half>
; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
; GFX89: ret
define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; GCN-LABEL: @test1_as_3_0_0(
; GFX89: load <2 x half>, <2 x half> addrspace(3)*
; GFX89: load <2 x half>, <2 x half>*
; GFX89: fmul <2 x half>
; GFX89: store <2 x half> %{{.*}}, <2 x half>* %
; GFX89: ret
define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half* %c, i64 1
  store half %mul5, half* %arrayidx5, align 2
  ret void
}

; GCN-LABEL: @test1_as_0_0_3_v2f16(
; GFX89: load <2 x half>, <2 x half>*
; GFX89: load <2 x half>, <2 x half>*
; GFX89: fmul <2 x half>
; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
; GFX89: ret
define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
  %i0 = load half, half* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half* %a, i64 1
  %i3 = load half, half* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; GCN-LABEL: @test1_fma_v2f16(
; GFX9: load <2 x half>
; GFX9: load <2 x half>
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; GCN-LABEL: @mul_scalar_v2f16(
; GFX9: load <2 x half>
; GFX9: fmul <2 x half>
; GFX9: store <2 x half>
define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %mul = fmul half %i0, %scalar
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %mul5 = fmul half %i3, %scalar
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; GCN-LABEL: @fabs_v2f16(
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fabs.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %fabs0 = call half @llvm.fabs.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %fabs1 = call half @llvm.fabs.f16(half %i3)
  store half %fabs0, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %fabs1, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; GCN-LABEL: @test1_fabs_fma_v2f16(
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fabs.v2f16(
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i0.fabs = call half @llvm.fabs.f16(half %i0)

  %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %i3.fabs = call half @llvm.fabs.f16(half %i3)

  %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; FIXME: Should do vector load and extract component for fabs
; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
; GFX9: load half
; GFX9: call half @llvm.fabs.f16(
; GFX9: load <2 x half>
; GFX9: load half
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i1.fabs = call half @llvm.fabs.f16(half %i1)

  %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; GCN-LABEL: @canonicalize_v2f16(
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.canonicalize.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @canonicalize_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %canonicalize0 = call half @llvm.canonicalize.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %canonicalize1 = call half @llvm.canonicalize.f16(half %i3)
  store half %canonicalize0, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %canonicalize1, half addrspace(3)* %arrayidx5, align 2
  ret void
}

declare half @llvm.fabs.f16(half) #1
declare half @llvm.fma.f16(half, half, half) #1
declare half @llvm.canonicalize.f16(half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }