; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s

; FIXME: We would still like to vectorize the memory operations for VI

; Simple 3-pair chain with loads and stores
define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
; GCN-LABEL: @test1_as_3_3_3_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half addrspace(3)* [[B:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
; GCN-NEXT:    [[TMP6:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP5]], <2 x half> addrspace(3)* [[TMP6]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; Same chain, but %b and %c are in the default address space.
define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
; GCN-LABEL: @test1_as_3_0_0(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half* [[B:%.*]] to <2 x half>*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half>* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
; GCN-NEXT:    [[TMP6:%.*]] = bitcast half* [[C:%.*]] to <2 x half>*
; GCN-NEXT:    store <2 x half> [[TMP5]], <2 x half>* [[TMP6]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half* %c, i64 1
  store half %mul5, half* %arrayidx5, align 2
  ret void
}

; Loads from the default address space, stores to LDS.
define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
; GCN-LABEL: @test1_as_0_0_3_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half* [[A:%.*]] to <2 x half>*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half>* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half* [[B:%.*]] to <2 x half>*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half>* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
; GCN-NEXT:    [[TMP6:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP5]], <2 x half> addrspace(3)* [[TMP6]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half* %a, i64 1
  %i3 = load half, half* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; Two scalar @llvm.fma.f16 calls combine into one @llvm.fma.v2f16.
define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
; GCN-LABEL: @test1_fma_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half addrspace(3)* [[B:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP6:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP5]], align 2
; GCN-NEXT:    [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP4]], <2 x half> [[TMP6]])
; GCN-NEXT:    [[TMP8:%.*]] = bitcast half addrspace(3)* [[D:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP7]], <2 x half> addrspace(3)* [[TMP8]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; The uniform scalar operand is broadcast with insertelements.
define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) {
; GCN-LABEL: @mul_scalar_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0
; GCN-NEXT:    [[TMP4:%.*]] = insertelement <2 x half> [[TMP3]], half [[SCALAR]], i32 1
; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
; GCN-NEXT:    [[TMP6:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP5]], <2 x half> addrspace(3)* [[TMP6]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %mul = fmul half %i0, %scalar
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %mul5 = fmul half %i3, %scalar
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; Unary intrinsic: two @llvm.fabs.f16 calls become one @llvm.fabs.v2f16.
define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
; GCN-LABEL: @fabs_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]])
; GCN-NEXT:    [[TMP4:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP3]], <2 x half> addrspace(3)* [[TMP4]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %fabs0 = call half @llvm.fabs.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %fabs1 = call half @llvm.fabs.f16(half %i3)
  store half %fabs0, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %fabs1, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; fabs feeding fma: both intrinsics vectorize.
define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
; GCN-LABEL: @test1_fabs_fma_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half addrspace(3)* [[B:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP6:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP5]], align 2
; GCN-NEXT:    [[TMP7:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]])
; GCN-NEXT:    [[TMP8:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP7]], <2 x half> [[TMP4]], <2 x half> [[TMP6]])
; GCN-NEXT:    [[TMP9:%.*]] = bitcast half addrspace(3)* [[D:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP8]], <2 x half> addrspace(3)* [[TMP9]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i0.fabs = call half @llvm.fabs.f16(half %i0)

  %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %i3.fabs = call half @llvm.fabs.f16(half %i3)

  %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; Only the first lane of %b goes through fabs, so the %b elements stay scalar
; and are gathered with insertelements before the vector fma.
define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
; GCN-NEXT:    [[I1:%.*]] = load half, half addrspace(3)* [[B:%.*]], align 2
; GCN-NEXT:    [[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]])
; GCN-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds half, half addrspace(3)* [[B]], i64 1
; GCN-NEXT:    [[I4:%.*]] = load half, half addrspace(3)* [[ARRAYIDX4]], align 2
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0
; GCN-NEXT:    [[TMP6:%.*]] = insertelement <2 x half> [[TMP5]], half [[I4]], i32 1
; GCN-NEXT:    [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP6]], <2 x half> [[TMP4]])
; GCN-NEXT:    [[TMP8:%.*]] = bitcast half addrspace(3)* [[D:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP7]], <2 x half> addrspace(3)* [[TMP8]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i1.fabs = call half @llvm.fabs.f16(half %i1)

  %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; canonicalize is vectorized on GFX9 but stays scalar on VI.
define amdgpu_kernel void @canonicalize_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
; GFX9-LABEL: @canonicalize_v2f16(
; GFX9-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GFX9-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GFX9-NEXT:    [[TMP3:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP2]])
; GFX9-NEXT:    [[TMP4:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GFX9-NEXT:    store <2 x half> [[TMP3]], <2 x half> addrspace(3)* [[TMP4]], align 2
; GFX9-NEXT:    ret void
;
; VI-LABEL: @canonicalize_v2f16(
; VI-NEXT:    [[I0:%.*]] = load half, half addrspace(3)* [[A:%.*]], align 2
; VI-NEXT:    [[CANONICALIZE0:%.*]] = call half @llvm.canonicalize.f16(half [[I0]])
; VI-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds half, half addrspace(3)* [[A]], i64 1
; VI-NEXT:    [[I3:%.*]] = load half, half addrspace(3)* [[ARRAYIDX3]], align 2
; VI-NEXT:    [[CANONICALIZE1:%.*]] = call half @llvm.canonicalize.f16(half [[I3]])
; VI-NEXT:    store half [[CANONICALIZE0]], half addrspace(3)* [[C:%.*]], align 2
; VI-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds half, half addrspace(3)* [[C]], i64 1
; VI-NEXT:    store half [[CANONICALIZE1]], half addrspace(3)* [[ARRAYIDX5]], align 2
; VI-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %canonicalize0 = call half @llvm.canonicalize.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %canonicalize1 = call half @llvm.canonicalize.f16(half %i3)
  store half %canonicalize0, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %canonicalize1, half addrspace(3)* %arrayidx5, align 2
  ret void
}

declare half @llvm.fabs.f16(half) #1
declare half @llvm.fma.f16(half, half, half) #1
declare half @llvm.canonicalize.f16(half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }