1; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
2; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s
3
4; FIXME: We would still like to vectorize the memory operations for VI
5
6; Simple 3-pair chain with loads and stores
7; GCN-LABEL: @test1_as_3_3_3_v2f16(
8; GFX9: load <2 x half>, <2 x half> addrspace(3)*
9; GFX9: load <2 x half>, <2 x half> addrspace(3)*
10; GFX9: fmul <2 x half>
11; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
12; GFX9: ret
13
14; VI: load half
15; VI: load half
; Two-lane f16 multiply where %a, %b and %c are all half addrspace(3)* pointers.
; Lane 0 uses each pointer directly and lane 1 uses element index 1 of the same
; pointer, so every load pair and the store pair touch adjacent half elements.
16define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
  ; Lane 0: a[0] * b[0].
17  %i0 = load half, half addrspace(3)* %a, align 2
18  %i1 = load half, half addrspace(3)* %b, align 2
19  %mul = fmul half %i0, %i1
  ; Lane 1: a[1] * b[1].
20  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
21  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
22  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
23  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
24  %mul5 = fmul half %i3, %i4
  ; Both products are stored only after all loads: c[0] then c[1].
25  store half %mul, half addrspace(3)* %c, align 2
26  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
27  store half %mul5, half addrspace(3)* %arrayidx5, align 2
28  ret void
29}
30
31; GCN-LABEL: @test1_as_3_0_0(
32; GFX9: load <2 x half>, <2 x half> addrspace(3)*
33; GFX9: load <2 x half>, <2 x half>*
34; GFX9: fmul <2 x half>
35; GFX9: store <2 x half> %{{.*}}, <2 x half>* %
36; GFX9: ret
37
38; VI: load half
39; VI: load half
; Same two-lane f16 multiply as the all-addrspace(3) variant, but with mixed
; pointer address spaces: %a is half addrspace(3)*, while %b and %c are plain
; half* (default address space). The name suffix 3_0_0 mirrors that order.
40define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
  ; Lane 0: a[0] (addrspace 3) * b[0] (default address space).
41  %i0 = load half, half addrspace(3)* %a, align 2
42  %i1 = load half, half* %b, align 2
43  %mul = fmul half %i0, %i1
  ; Lane 1: a[1] * b[1], each addressed through a GEP with index 1.
44  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
45  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
46  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
47  %i4 = load half, half* %arrayidx4, align 2
48  %mul5 = fmul half %i3, %i4
  ; Results go to c[0] and c[1] in the default address space.
49  store half %mul, half* %c, align 2
50  %arrayidx5 = getelementptr inbounds half, half* %c, i64 1
51  store half %mul5, half* %arrayidx5, align 2
52  ret void
53}
54
55; GCN-LABEL: @test1_as_0_0_3_v2f16(
56; GFX9: load <2 x half>, <2 x half>*
57; GFX9: load <2 x half>, <2 x half>*
58; GFX9: fmul <2 x half>
59; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
60; GFX9: ret
61
62; VI: load half
63; VI: load half
; Two-lane f16 multiply with both inputs in the default address space
; (%a, %b are half*) and the output pointer %c in addrspace(3).
64define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
  ; Lane 0: a[0] * b[0].
65  %i0 = load half, half* %a, align 2
66  %i1 = load half, half* %b, align 2
67  %mul = fmul half %i0, %i1
  ; Lane 1: a[1] * b[1].
68  %arrayidx3 = getelementptr inbounds half, half* %a, i64 1
69  %i3 = load half, half* %arrayidx3, align 2
70  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
71  %i4 = load half, half* %arrayidx4, align 2
72  %mul5 = fmul half %i3, %i4
  ; Adjacent stores to c[0] and c[1] through the addrspace(3) pointer.
73  store half %mul, half addrspace(3)* %c, align 2
74  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
75  store half %mul5, half addrspace(3)* %arrayidx5, align 2
76  ret void
77}
78
79; GCN-LABEL: @test1_fma_v2f16(
80; GFX9: load <2 x half>
81; GFX9: load <2 x half>
82; GFX9: load <2 x half>
83; GFX9: call <2 x half> @llvm.fma.v2f16(
84; GFX9: store <2 x half>
; Two-lane scalar llvm.fma.f16 chain: d[i] = fma(a[i], b[i], c[i]) for i = 0, 1.
; All four pointers are half addrspace(3)*.
85define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  ; Lane 0 operands are loaded straight from the argument pointers.
86  %i0 = load half, half addrspace(3)* %a, align 2
87  %i1 = load half, half addrspace(3)* %b, align 2
88  %i2 = load half, half addrspace(3)* %c, align 2
89  %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
  ; Lane 1 operands come from element index 1 of the same three pointers.
90  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
91  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
92  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
93  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
94  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
95  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
96  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  ; Store both results to adjacent elements d[0] and d[1].
97  store half %fma0, half addrspace(3)* %d, align 2
98  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
99  store half %fma1, half addrspace(3)* %arrayidx6, align 2
100  ret void
101}
102
103; GCN-LABEL: @mul_scalar_v2f16(
104; GFX9: load <2 x half>
105; GFX9: fmul <2 x half>
106; GFX9: store <2 x half>
; Two-lane f16 multiply where the second operand of both fmuls is the same
; by-value half argument %scalar instead of a loaded value:
; c[i] = a[i] * %scalar for i = 0, 1.
107define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) {
  ; Lane 0: a[0] * %scalar.
108  %i0 = load half, half addrspace(3)* %a, align 2
109  %mul = fmul half %i0, %scalar
  ; Lane 1: a[1] * %scalar (same scalar operand as lane 0).
110  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
111  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
112  %mul5 = fmul half %i3, %scalar
  ; Adjacent stores to c[0] and c[1].
113  store half %mul, half addrspace(3)* %c, align 2
114  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
115  store half %mul5, half addrspace(3)* %arrayidx5, align 2
116  ret void
117}
118
119; GCN-LABEL: @fabs_v2f16(
120; GFX9: load <2 x half>
121; GFX9: call <2 x half> @llvm.fabs.v2f16(
122; GFX9: store <2 x half>
; Two-lane scalar llvm.fabs.f16: c[i] = fabs(a[i]) for i = 0, 1.
; Both pointers are half addrspace(3)*.
123define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
  ; Lane 0: fabs(a[0]).
124  %i0 = load half, half addrspace(3)* %a, align 2
125  %fabs0 = call half @llvm.fabs.f16(half %i0)
  ; Lane 1: fabs(a[1]).
126  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
127  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
128  %fabs1 = call half @llvm.fabs.f16(half %i3)
  ; Adjacent stores to c[0] and c[1].
129  store half %fabs0, half addrspace(3)* %c, align 2
130  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
131  store half %fabs1, half addrspace(3)* %arrayidx5, align 2
132  ret void
133}
134
135; GCN-LABEL: @test1_fabs_fma_v2f16(
136; GFX9: load <2 x half>
137; GFX9: call <2 x half> @llvm.fabs.v2f16(
138; GFX9: call <2 x half> @llvm.fma.v2f16(
139; GFX9: store <2 x half>
; Two-lane fabs feeding fma: d[i] = fma(fabs(a[i]), b[i], c[i]) for i = 0, 1.
; The fabs is applied to the first fma operand in BOTH lanes, so the two lanes
; have the same operation shape.
140define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  ; Lane 0 loads, then fabs on the value loaded from %a.
141  %i0 = load half, half addrspace(3)* %a, align 2
142  %i1 = load half, half addrspace(3)* %b, align 2
143  %i2 = load half, half addrspace(3)* %c, align 2
144  %i0.fabs = call half @llvm.fabs.f16(half %i0)
145
146  %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
  ; Lane 1 loads from element index 1 of %a, %b and %c, then fabs on a[1].
147  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
148  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
149  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
150  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
151  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
152  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
153  %i3.fabs = call half @llvm.fabs.f16(half %i3)
154
155  %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
  ; Store both fma results to adjacent elements d[0] and d[1].
156  store half %fma0, half addrspace(3)* %d, align 2
157  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
158  store half %fma1, half addrspace(3)* %arrayidx6, align 2
159  ret void
160}
161
162; FIXME: Should do vector load and extract component for fabs
163; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
164; GFX9: load half
165; GFX9: call half @llvm.fabs.f16(
166; GFX9: load <2 x half>
167; GFX9: load half
168; GFX9: load <2 x half>
169; GFX9: call <2 x half> @llvm.fma.v2f16(
170; GFX9: store <2 x half>
; Two-lane fma where only lane 0 wraps its second operand in fabs:
;   d[0] = fma(a[0], fabs(b[0]), c[0])
;   d[1] = fma(a[1], b[1],       c[1])
; The lanes therefore differ in the shape of the second fma operand. The checks
; for the gfx900 run above this function expect a scalar load and a scalar fabs
; call alongside the vector loads and the vector fma.
171define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  ; Lane 0: fabs is applied to the value loaded from %b only.
172  %i0 = load half, half addrspace(3)* %a, align 2
173  %i1 = load half, half addrspace(3)* %b, align 2
174  %i2 = load half, half addrspace(3)* %c, align 2
175  %i1.fabs = call half @llvm.fabs.f16(half %i1)
176
177  %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
  ; Lane 1: plain fma on a[1], b[1], c[1] with no fabs.
178  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
179  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
180  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
181  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
182  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
183  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
184  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  ; Store both fma results to adjacent elements d[0] and d[1].
185  store half %fma0, half addrspace(3)* %d, align 2
186  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
187  store half %fma1, half addrspace(3)* %arrayidx6, align 2
188  ret void
189}
190
; Scalar f16 intrinsics called by the kernels in this file; both declarations
; use attribute group #1.
191declare half @llvm.fabs.f16(half) #1
192declare half @llvm.fma.f16(half, half, half) #1
193
; NOTE(review): attribute group #0 is not referenced by any function or
; declaration visible in this file - confirm whether it is still needed.
194attributes #0 = { nounwind }
195attributes #1 = { nounwind readnone }
196