; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI,GFX89 %s

; SLP vectorizer test for pairs of scalar half operations on amdgcn.
; Every kernel below performs the same operation on element 0 and element 1
; of its pointer arguments; the checks describe which of those pairs are
; expected to be merged into <2 x half> operations. The plain fmul tests are
; checked for both gfx900 and fiji; the intrinsic and scalar-operand tests
; are only checked for gfx900.
;
; NOTE(review): the fiji run line also declares a fiji-only check prefix, but
; no check in this file uses it.

; FIXME: Should still like to vectorize the memory operations for VI

; Simple 3-pair chain with loads and stores
; Adjacent halves are loaded from %a and %b, multiplied, and stored to %c.
; All three pointers are addrspace(3).
; GCN-LABEL: @test1_as_3_3_3_v2f16(
; GFX89: load <2 x half>, <2 x half> addrspace(3)*
; GFX89: load <2 x half>, <2 x half> addrspace(3)*
; GFX89: fmul <2 x half>
; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
; GFX89: ret
define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; Same fmul chain, but only %a is addrspace(3); %b and %c use the default
; address space.
; GCN-LABEL: @test1_as_3_0_0(
; GFX89: load <2 x half>, <2 x half> addrspace(3)*
; GFX89: load <2 x half>, <2 x half>*
; GFX89: fmul <2 x half>
; GFX89: store <2 x half> %{{.*}}, <2 x half>* %
; GFX89: ret
define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half* %c, i64 1
  store half %mul5, half* %arrayidx5, align 2
  ret void
}

; Same fmul chain with the address spaces swapped: %a and %b use the default
; address space and only the destination %c is addrspace(3).
; GCN-LABEL: @test1_as_0_0_3_v2f16(
; GFX89: load <2 x half>, <2 x half>*
; GFX89: load <2 x half>, <2 x half>*
; GFX89: fmul <2 x half>
; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
; GFX89: ret
define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
  %i0 = load half, half* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half* %a, i64 1
  %i3 = load half, half* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; Two llvm.fma.f16 calls over adjacent elements of %a, %b and %c, stored to
; adjacent elements of %d. Only checked for gfx900.
; GCN-LABEL: @test1_fma_v2f16(
; GFX9: load <2 x half>
; GFX9: load <2 x half>
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; Both lanes multiply a loaded element of %a by the same kernel argument
; %scalar. Only checked for gfx900.
; GCN-LABEL: @mul_scalar_v2f16(
; GFX9: load <2 x half>
; GFX9: fmul <2 x half>
; GFX9: store <2 x half>
define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %mul = fmul half %i0, %scalar
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %mul5 = fmul half %i3, %scalar
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; Two llvm.fabs.f16 calls on adjacent elements of %a, stored to adjacent
; elements of %c. Only checked for gfx900.
; GCN-LABEL: @fabs_v2f16(
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fabs.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %fabs0 = call half @llvm.fabs.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %fabs1 = call half @llvm.fabs.f16(half %i3)
  store half %fabs0, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %fabs1, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; In both lanes the element loaded from %a goes through llvm.fabs.f16 before
; being used as the first llvm.fma.f16 operand. Only checked for gfx900.
; GCN-LABEL: @test1_fabs_fma_v2f16(
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fabs.v2f16(
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i0.fabs = call half @llvm.fabs.f16(half %i0)

  %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %i3.fabs = call half @llvm.fabs.f16(half %i3)

  %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; Asymmetric variant: llvm.fabs.f16 is applied to the %b element in lane 0
; only, so the two fma calls do not have matching second operands. The checks
; below expect scalar loads and a scalar fabs to remain next to the vector fma.
; Only checked for gfx900.
; FIXME: Should do vector load and extract component for fabs
; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
; GFX9: load half
; GFX9: call half @llvm.fabs.f16(
; GFX9: load <2 x half>
; GFX9: load half
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i1.fabs = call half @llvm.fabs.f16(half %i1)

  %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; Two llvm.canonicalize.f16 calls on adjacent elements of %a, stored to
; adjacent elements of %c. Only checked for gfx900.
; GCN-LABEL: @canonicalize_v2f16(
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.canonicalize.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @canonicalize_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %canonicalize0 = call half @llvm.canonicalize.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %canonicalize1 = call half @llvm.canonicalize.f16(half %i3)
  store half %canonicalize0, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %canonicalize1, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; Scalar half intrinsics called by the kernels above.
declare half @llvm.fabs.f16(half) #1
declare half @llvm.fma.f16(half, half, half) #1
declare half @llvm.canonicalize.f16(half) #1

; Group #1 is attached to the intrinsic declarations. Group #0 is not
; referenced by any definition or declaration in this file.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }