; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s

; Each kernel below performs the same scalar half operation on element 0 and
; element 1 of its pointer arguments. The gfx900 run expects the pairs to be
; combined into <2 x half> operations; where the fiji run has expectations,
; it expects scalar half loads to remain.

; FIXME: Should still like to vectorize the memory operations for VI

; Simple 3-pair chain with loads and stores
; All three pointers are in address space 3.
; GCN-LABEL: @test1_as_3_3_3_v2f16(
; GFX9: load <2 x half>, <2 x half> addrspace(3)*
; GFX9: load <2 x half>, <2 x half> addrspace(3)*
; GFX9: fmul <2 x half>
; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
; GFX9: ret

; VI: load half
; VI: load half
define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
  ; Element 0: a[0] * b[0]
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %mul = fmul half %i0, %i1
  ; Element 1: a[1] * b[1]
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  ; Store both products to c[0] and c[1]
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; Same chain as above, but only %a is in address space 3; %b and %c carry no
; addrspace qualifier.
; GCN-LABEL: @test1_as_3_0_0(
; GFX9: load <2 x half>, <2 x half> addrspace(3)*
; GFX9: load <2 x half>, <2 x half>*
; GFX9: fmul <2 x half>
; GFX9: store <2 x half> %{{.*}}, <2 x half>* %
; GFX9: ret

; VI: load half
; VI: load half
define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
  ; Element 0: a[0] * b[0]
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  ; Element 1: a[1] * b[1]
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  ; Store both products to c[0] and c[1]
  store half %mul, half* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half* %c, i64 1
  store half %mul5, half* %arrayidx5, align 2
  ret void
}

; Same chain again, with %a and %b unqualified and only the destination %c in
; address space 3.
; GCN-LABEL: @test1_as_0_0_3_v2f16(
; GFX9: load <2 x half>, <2 x half>*
; GFX9: load <2 x half>, <2 x half>*
; GFX9: fmul <2 x half>
; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
; GFX9: ret

; VI: load half
; VI: load half
define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
  ; Element 0: a[0] * b[0]
  %i0 = load half, half* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  ; Element 1: a[1] * b[1]
  %arrayidx3 = getelementptr inbounds half, half* %a, i64 1
  %i3 = load half, half* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  ; Store both products to c[0] and c[1]
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; Two llvm.fma.f16 calls on adjacent elements of %a, %b and %c, stored to
; adjacent elements of %d.
; GCN-LABEL: @test1_fma_v2f16(
; GFX9: load <2 x half>
; GFX9: load <2 x half>
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  ; Element 0: fma(a[0], b[0], c[0])
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
  ; Element 1: fma(a[1], b[1], c[1])
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  ; Store both results to d[0] and d[1]
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; Both elements are multiplied by the same scalar argument %scalar.
; GCN-LABEL: @mul_scalar_v2f16(
; GFX9: load <2 x half>
; GFX9: fmul <2 x half>
; GFX9: store <2 x half>
define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) {
  ; Element 0: a[0] * scalar
  %i0 = load half, half addrspace(3)* %a, align 2
  %mul = fmul half %i0, %scalar
  ; Element 1: a[1] * scalar
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %mul5 = fmul half %i3, %scalar
  ; Store both products to c[0] and c[1]
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; Two llvm.fabs.f16 calls on adjacent elements of %a, stored to adjacent
; elements of %c.
; GCN-LABEL: @fabs_v2f16(
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fabs.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
  ; Element 0: fabs(a[0])
  %i0 = load half, half addrspace(3)* %a, align 2
  %fabs0 = call half @llvm.fabs.f16(half %i0)
  ; Element 1: fabs(a[1])
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %fabs1 = call half @llvm.fabs.f16(half %i3)
  ; Store both results to c[0] and c[1]
  store half %fabs0, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %fabs1, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; fabs is applied to the first fma operand in both elements, so the fabs and
; the fma are each expected as a single <2 x half> call.
; GCN-LABEL: @test1_fabs_fma_v2f16(
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fabs.v2f16(
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  ; Element 0: fma(fabs(a[0]), b[0], c[0])
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i0.fabs = call half @llvm.fabs.f16(half %i0)

  %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
  ; Element 1: fma(fabs(a[1]), b[1], c[1])
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %i3.fabs = call half @llvm.fabs.f16(half %i3)

  %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
  ; Store both results to d[0] and d[1]
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; fabs is applied only to b[0]; element 1 passes b[1] to the fma unmodified.
; The expectations below keep a scalar load and a scalar fabs next to the
; <2 x half> loads and the <2 x half> fma.
; FIXME: Should do vector load and extract component for fabs
; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
; GFX9: load half
; GFX9: call half @llvm.fabs.f16(
; GFX9: load <2 x half>
; GFX9: load half
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  ; Element 0: fma(a[0], fabs(b[0]), c[0])
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i1.fabs = call half @llvm.fabs.f16(half %i1)

  %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
  ; Element 1: fma(a[1], b[1], c[1]) -- no fabs on this element
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  ; Store both results to d[0] and d[1]
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; Scalar intrinsics called by the kernels above.
declare half @llvm.fabs.f16(half) #1
declare half @llvm.fma.f16(half, half, half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }