; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -instcombine -S | FileCheck %s

; Every function below is written in fully scalar form (one instruction per
; lane). The autogenerated assertions inside each function record the IR that
; is expected after running the SLP vectorizer followed by instcombine for the
; AArch64 target configured here.
;
; The scalar instruction order is part of the test input; do not reorder it
; without regenerating the assertions with utils/update_test_checks.py.

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; Two i64 lanes taken from vector arguments. Result lane 0 is the sum of both
; lane-wise adds and result lane 1 is the sum of both lane-wise subs, so each
; result lane mixes lane 0 and lane 1 of the intermediate values. The expected
; output is one vector add, one vector sub, two shufflevectors that regroup
; their lanes, and a final vector add.
define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
; CHECK-LABEL: @build_vec_v2i64(
; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
;
  %v0.0 = extractelement <2 x i64> %v0, i32 0
  %v0.1 = extractelement <2 x i64> %v0, i32 1
  %v1.0 = extractelement <2 x i64> %v1, i32 0
  %v1.1 = extractelement <2 x i64> %v1, i32 1
  %tmp0.0 = add i64 %v0.0, %v1.0
  %tmp0.1 = add i64 %v0.1, %v1.1
  %tmp1.0 = sub i64 %v0.0, %v1.0
  %tmp1.1 = sub i64 %v0.1, %v1.1
  %tmp2.0 = add i64 %tmp0.0, %tmp0.1
  %tmp2.1 = add i64 %tmp1.0, %tmp1.1
  %tmp3.0 = insertelement <2 x i64> undef, i64 %tmp2.0, i32 0
  %tmp3.1 = insertelement <2 x i64> %tmp3.0, i64 %tmp2.1, i32 1
  ret <2 x i64> %tmp3.1
}

; Same arithmetic as @build_vec_v2i64, but the operands are loaded from
; consecutive i64 slots of %a and %b and the two results are stored to
; consecutive slots of %c. The expected output uses <2 x i64> loads and a
; single <2 x i64> store around the same add/sub/shufflevector/add sequence.
define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
; CHECK-LABEL: @store_chain_v2i64(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>*
; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]]
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
; CHECK-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 8
; CHECK-NEXT:    ret void
;
  %a.0 = getelementptr i64, i64* %a, i64 0
  %a.1 = getelementptr i64, i64* %a, i64 1
  %b.0 = getelementptr i64, i64* %b, i64 0
  %b.1 = getelementptr i64, i64* %b, i64 1
  %c.0 = getelementptr i64, i64* %c, i64 0
  %c.1 = getelementptr i64, i64* %c, i64 1
  %v0.0 = load i64, i64* %a.0, align 8
  %v0.1 = load i64, i64* %a.1, align 8
  %v1.0 = load i64, i64* %b.0, align 8
  %v1.1 = load i64, i64* %b.1, align 8
  %tmp0.0 = add i64 %v0.0, %v1.0
  %tmp0.1 = add i64 %v0.1, %v1.1
  %tmp1.0 = sub i64 %v0.0, %v1.0
  %tmp1.1 = sub i64 %v0.1, %v1.1
  %tmp2.0 = add i64 %tmp0.0, %tmp0.1
  %tmp2.1 = add i64 %tmp1.0, %tmp1.1
  store i64 %tmp2.0, i64* %c.0, align 8
  store i64 %tmp2.1, i64* %c.1, align 8
  ret void
}

; Four-lane i32 variant of @build_vec_v2i64: result lanes 0 and 2 sum adjacent
; pairs of the lane-wise adds, result lanes 1 and 3 sum adjacent pairs of the
; lane-wise subs. The expected output is a <4 x i32> add and sub, two
; four-element shufflevectors, and a final <4 x i32> add.
define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
;
  %v0.0 = extractelement <4 x i32> %v0, i32 0
  %v0.1 = extractelement <4 x i32> %v0, i32 1
  %v0.2 = extractelement <4 x i32> %v0, i32 2
  %v0.3 = extractelement <4 x i32> %v0, i32 3
  %v1.0 = extractelement <4 x i32> %v1, i32 0
  %v1.1 = extractelement <4 x i32> %v1, i32 1
  %v1.2 = extractelement <4 x i32> %v1, i32 2
  %v1.3 = extractelement <4 x i32> %v1, i32 3
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = add i32 %v0.2, %v1.2
  %tmp0.3 = add i32 %v0.3, %v1.3
  %tmp1.0 = sub i32 %v0.0, %v1.0
  %tmp1.1 = sub i32 %v0.1, %v1.1
  %tmp1.2 = sub i32 %v0.2, %v1.2
  %tmp1.3 = sub i32 %v0.3, %v1.3
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
  ret <4 x i32> %tmp3.3
}

; Only two distinct scalar results (%tmp2.0, %tmp2.1) are computed, and each is
; inserted twice into the <4 x i32> result (lanes 0/2 and lanes 1/3). The
; expected output computes them once as a <2 x i32> vector and widens it with a
; shufflevector whose mask is <0, 1, 0, 1>.
define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32_reuse_0(
; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE]]
;
  %v0.0 = extractelement <2 x i32> %v0, i32 0
  %v0.1 = extractelement <2 x i32> %v0, i32 1
  %v1.0 = extractelement <2 x i32> %v1, i32 0
  %v1.1 = extractelement <2 x i32> %v1, i32 1
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp1.0 = sub i32 %v0.0, %v1.0
  %tmp1.1 = sub i32 %v0.1, %v1.1
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.0, i32 2
  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.1, i32 3
  ret <4 x i32> %tmp3.3
}

; Result lanes 0 and 1 are the same expression (%tmp0.0 - %tmp0.1) written
; twice; lanes 2 and 3 subtract the two xor values in opposite operand order.
; In the expected output the add feeding lanes 0/1 stays scalar
; (extractelement + scalar add, re-inserted into <2 x i32> values), while the
; xor half is computed on <2 x i32> vectors; the halves are then joined by
; shufflevectors.
define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32_reuse_1(
; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1
; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i64 0
; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0
; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT:    ret <4 x i32> [[TMP2_31]]
;
  %v0.0 = extractelement <2 x i32> %v0, i32 0
  %v0.1 = extractelement <2 x i32> %v0, i32 1
  %v1.0 = extractelement <2 x i32> %v1, i32 0
  %v1.1 = extractelement <2 x i32> %v1, i32 1
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = xor i32 %v0.0, %v1.0
  %tmp0.3 = xor i32 %v0.1, %v1.1
  %tmp1.0 = sub i32 %tmp0.0, %tmp0.1
  %tmp1.1 = sub i32 %tmp0.0, %tmp0.1
  %tmp1.2 = sub i32 %tmp0.2, %tmp0.3
  %tmp1.3 = sub i32 %tmp0.3, %tmp0.2
  %tmp2.0 = insertelement <4 x i32> undef, i32 %tmp1.0, i32 0
  %tmp2.1 = insertelement <4 x i32> %tmp2.0, i32 %tmp1.1, i32 1
  %tmp2.2 = insertelement <4 x i32> %tmp2.1, i32 %tmp1.2, i32 2
  %tmp2.3 = insertelement <4 x i32> %tmp2.2, i32 %tmp1.3, i32 3
  ret <4 x i32> %tmp2.3
}

; Three different opcodes feed the final adds: lanes 0/1 combine add and mul
; results, while lanes 2/3 combine xor results (the same xor pair is written
; out twice as %tmp0.2/%tmp0.3 and %tmp1.2/%tmp1.3). The expected output forms
; two independent <2 x i32> computations and concatenates them with a
; shufflevector.
define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32_3_binops(
; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP7]]
; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT:    ret <4 x i32> [[TMP3_31]]
;
  %v0.0 = extractelement <2 x i32> %v0, i32 0
  %v0.1 = extractelement <2 x i32> %v0, i32 1
  %v1.0 = extractelement <2 x i32> %v1, i32 0
  %v1.1 = extractelement <2 x i32> %v1, i32 1
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = xor i32 %v0.0, %v1.0
  %tmp0.3 = xor i32 %v0.1, %v1.1
  %tmp1.0 = mul i32 %v0.0, %v1.0
  %tmp1.1 = mul i32 %v0.1, %v1.1
  %tmp1.2 = xor i32 %v0.0, %v1.0
  %tmp1.3 = xor i32 %v0.1, %v1.1
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
  ret <4 x i32> %tmp3.3
}

; Starts with the same add/sub pairing as @build_vec_v4i32, then applies the
; same per-lane chain to all four values (lshr by 15, and with 65537,
; mul nuw by 65535, add, xor) and sums the four lanes with a chain of scalar
; adds. The expected output keeps the whole chain on <4 x i32> vectors and ends
; in a call to @llvm.vector.reduce.add.v4i32.
define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @reduction_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>
; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]], <i32 65535, i32 65535, i32 65535, i32 65535>
; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT:    ret i32 [[TMP11]]
;
  %v0.0 = extractelement <4 x i32> %v0, i32 0
  %v0.1 = extractelement <4 x i32> %v0, i32 1
  %v0.2 = extractelement <4 x i32> %v0, i32 2
  %v0.3 = extractelement <4 x i32> %v0, i32 3
  %v1.0 = extractelement <4 x i32> %v1, i32 0
  %v1.1 = extractelement <4 x i32> %v1, i32 1
  %v1.2 = extractelement <4 x i32> %v1, i32 2
  %v1.3 = extractelement <4 x i32> %v1, i32 3
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = add i32 %v0.2, %v1.2
  %tmp0.3 = add i32 %v0.3, %v1.3
  %tmp1.0 = sub i32 %v0.0, %v1.0
  %tmp1.1 = sub i32 %v0.1, %v1.1
  %tmp1.2 = sub i32 %v0.2, %v1.2
  %tmp1.3 = sub i32 %v0.3, %v1.3
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  %tmp3.0 = lshr i32 %tmp2.0, 15
  %tmp3.1 = lshr i32 %tmp2.1, 15
  %tmp3.2 = lshr i32 %tmp2.2, 15
  %tmp3.3 = lshr i32 %tmp2.3, 15
  %tmp4.0 = and i32 %tmp3.0, 65537
  %tmp4.1 = and i32 %tmp3.1, 65537
  %tmp4.2 = and i32 %tmp3.2, 65537
  %tmp4.3 = and i32 %tmp3.3, 65537
  %tmp5.0 = mul nuw i32 %tmp4.0, 65535
  %tmp5.1 = mul nuw i32 %tmp4.1, 65535
  %tmp5.2 = mul nuw i32 %tmp4.2, 65535
  %tmp5.3 = mul nuw i32 %tmp4.3, 65535
  %tmp6.0 = add i32 %tmp5.0, %tmp2.0
  %tmp6.1 = add i32 %tmp5.1, %tmp2.1
  %tmp6.2 = add i32 %tmp5.2, %tmp2.2
  %tmp6.3 = add i32 %tmp5.3, %tmp2.3
  %tmp7.0 = xor i32 %tmp6.0, %tmp5.0
  %tmp7.1 = xor i32 %tmp6.1, %tmp5.1
  %tmp7.2 = xor i32 %tmp6.2, %tmp5.2
  %tmp7.3 = xor i32 %tmp6.3, %tmp5.3
  %reduce.0 = add i32 %tmp7.1, %tmp7.0
  %reduce.1 = add i32 %reduce.0, %tmp7.2
  %reduce.2 = add i32 %reduce.1, %tmp7.3
  ret i32 %reduce.2
}