; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes='default<O3>' -enable-matrix -S %s | FileCheck %s

target triple = "arm64-apple-ios"

define void @matrix_extract_insert_scalar(i32 %i, i32 %k, i32 %j, [225 x double]* nonnull align 8 dereferenceable(1800) %A, [225 x double]* nonnull align 8 dereferenceable(1800) %B) #0 {
; CHECK-LABEL: @matrix_extract_insert_scalar(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[K:%.*]] to i64
; CHECK-NEXT: [[CONV1:%.*]] = zext i32 [[J:%.*]] to i64
; CHECK-NEXT: [[TMP0:%.*]] = mul nuw nsw i64 [[CONV1]], 15
; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], [[CONV]]
; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 225
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]])
; CHECK-NEXT: [[TMP3:%.*]] = bitcast [225 x double]* [[A:%.*]] to <225 x double>*
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP3]], i64 0, i64 [[TMP1]]
; CHECK-NEXT: [[MATRIXEXT:%.*]] = load double, double* [[TMP4]], align 8
; CHECK-NEXT: [[CONV2:%.*]] = zext i32 [[I:%.*]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP0]], [[CONV2]]
; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 225
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP6]])
; CHECK-NEXT: [[TMP7:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>*
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP5]]
; CHECK-NEXT: [[MATRIXEXT4:%.*]] = load double, double* [[TMP8]], align 8
; CHECK-NEXT: [[MUL:%.*]] = fmul double [[MATRIXEXT]], [[MATRIXEXT4]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]]
; CHECK-NEXT: [[MATRIXEXT7:%.*]] = load double, double* [[TMP9]], align 8
; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]]
; CHECK-NEXT: store double [[SUB]], double* [[TMP9]], align 8
; CHECK-NEXT: ret void
;
entry:
  %i.addr = alloca i32, align 4
  %k.addr = alloca i32, align 4
  %j.addr = alloca i32, align 4
  %A.addr = alloca [225 x double]*, align 8
  %B.addr = alloca [225 x double]*, align 8
  store i32 %i, i32* %i.addr, align 4
  store i32 %k, i32* %k.addr, align 4
  store i32 %j, i32* %j.addr, align 4
  store [225 x double]* %A, [225 x double]** %A.addr, align 8
  store [225 x double]* %B, [225 x double]** %B.addr, align 8
  %0 = load i32, i32* %k.addr, align 4
  %conv = zext i32 %0 to i64
  %1 = load i32, i32* %j.addr, align 4
  %conv1 = zext i32 %1 to i64
  %2 = mul i64 %conv1, 15
  %3 = add i64 %2, %conv
  %4 = icmp ult i64 %3, 225
  call void @llvm.assume(i1 %4)
  %5 = load [225 x double]*, [225 x double]** %A.addr, align 8
  %6 = bitcast [225 x double]* %5 to <225 x double>*
  %7 = load <225 x double>, <225 x double>* %6, align 8
  %matrixext = extractelement <225 x double> %7, i64 %3
  %8 = load i32, i32* %i.addr, align 4
  %conv2 = zext i32 %8 to i64
  %9 = load i32, i32* %j.addr, align 4
  %conv3 = zext i32 %9 to i64
  %10 = mul i64 %conv3, 15
  %11 = add i64 %10, %conv2
  %12 = icmp ult i64 %11, 225
  call void @llvm.assume(i1 %12)
  %13 = load [225 x double]*, [225 x double]** %B.addr, align 8
  %14 = bitcast [225 x double]* %13 to <225 x double>*
  %15 = load <225 x double>, <225 x double>* %14, align 8
  %matrixext4 = extractelement <225 x double> %15, i64 %11
  %mul = fmul double %matrixext, %matrixext4
  %16 = load [225 x double]*, [225 x double]** %B.addr, align 8
  %17 = load i32, i32* %k.addr, align 4
  %conv5 = zext i32 %17 to i64
  %18 = load i32, i32* %j.addr, align 4
  %conv6 = zext i32 %18 to i64
  %19 = mul i64 %conv6, 15
  %20 = add i64 %19, %conv5
  %21 = bitcast [225 x double]* %16 to <225 x double>*
  %22 = icmp ult i64 %20, 225
  call void @llvm.assume(i1 %22)
  %23 = load <225 x double>, <225 x double>* %21, align 8
  %matrixext7 = extractelement <225 x double> %23, i64 %20
  %sub = fsub double %matrixext7, %mul
  %24 = icmp ult i64 %20, 225
  call void @llvm.assume(i1 %24)
  %25 = load <225 x double>, <225 x double>* %21, align 8
  %matins = insertelement <225 x double> %25, double %sub, i64 %20
  store <225 x double> %matins, <225 x double>* %21, align 8
  ret void
}

; Same computation inside a j/k loop nest: the outer loop over %j has a
; constant trip count of 4 and is expected to be fully unrolled, with each
; unrolled body using scalar element loads/stores instead of full vector ops.
define void @matrix_extract_insert_loop(i32 %i, [225 x double]* nonnull align 8 dereferenceable(1800) %A, [225 x double]* nonnull align 8 dereferenceable(1800) %B) {
; CHECK-LABEL: @matrix_extract_insert_loop(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP212_NOT:%.*]] = icmp eq i32 [[I:%.*]], 0
; CHECK-NEXT: [[TMP0:%.*]] = bitcast [225 x double]* [[A:%.*]] to <225 x double>*
; CHECK-NEXT: [[CONV6:%.*]] = zext i32 [[I]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>*
; CHECK-NEXT: br i1 [[CMP212_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US:%.*]]
; CHECK: for.cond1.preheader.us:
; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[I]], 225
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]])
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV6]]
; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]]
; CHECK: for.body4.us:
; CHECK-NEXT: [[K_013_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY4_US]] ]
; CHECK-NEXT: [[CONV_US:%.*]] = zext i32 [[K_013_US]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[K_013_US]], 225
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP4]])
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[CONV_US]]
; CHECK-NEXT: [[MATRIXEXT_US:%.*]] = load double, double* [[TMP5]], align 8
; CHECK-NEXT: [[MATRIXEXT8_US:%.*]] = load double, double* [[TMP3]], align 8
; CHECK-NEXT: [[MUL_US:%.*]] = fmul double [[MATRIXEXT_US]], [[MATRIXEXT8_US]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV_US]]
; CHECK-NEXT: [[MATRIXEXT11_US:%.*]] = load double, double* [[TMP6]], align 8
; CHECK-NEXT: [[SUB_US:%.*]] = fsub double [[MATRIXEXT11_US]], [[MUL_US]]
; CHECK-NEXT: store double [[SUB_US]], double* [[TMP6]], align 8
; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[K_013_US]], 1
; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ult i32 [[INC_US]], [[I]]
; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]]
; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us:
; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[CONV6]], 15
; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[I]], 210
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP8]])
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP7]]
; CHECK-NEXT: br label [[FOR_BODY4_US_1:%.*]]
; CHECK: for.body4.us.1:
; CHECK-NEXT: [[K_013_US_1:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[INC_US_1:%.*]], [[FOR_BODY4_US_1]] ]
; CHECK-NEXT: [[NARROW:%.*]] = add nuw nsw i32 [[K_013_US_1]], 15
; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[NARROW]] to i64
; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i32 [[K_013_US_1]], 210
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP10]]
; CHECK-NEXT: [[MATRIXEXT_US_1:%.*]] = load double, double* [[TMP12]], align 8
; CHECK-NEXT: [[MATRIXEXT8_US_1:%.*]] = load double, double* [[TMP9]], align 8
; CHECK-NEXT: [[MUL_US_1:%.*]] = fmul double [[MATRIXEXT_US_1]], [[MATRIXEXT8_US_1]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP10]]
; CHECK-NEXT: [[MATRIXEXT11_US_1:%.*]] = load double, double* [[TMP13]], align 8
; CHECK-NEXT: [[SUB_US_1:%.*]] = fsub double [[MATRIXEXT11_US_1]], [[MUL_US_1]]
; CHECK-NEXT: store double [[SUB_US_1]], double* [[TMP13]], align 8
; CHECK-NEXT: [[INC_US_1]] = add nuw nsw i32 [[K_013_US_1]], 1
; CHECK-NEXT: [[CMP2_US_1:%.*]] = icmp ult i32 [[INC_US_1]], [[I]]
; CHECK-NEXT: br i1 [[CMP2_US_1]], label [[FOR_BODY4_US_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1:%.*]]
; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.1:
; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[CONV6]], 30
; CHECK-NEXT: [[TMP15:%.*]] = icmp ult i32 [[I]], 195
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP15]])
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP14]]
; CHECK-NEXT: br label [[FOR_BODY4_US_2:%.*]]
; CHECK: for.body4.us.2:
; CHECK-NEXT: [[K_013_US_2:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[INC_US_2:%.*]], [[FOR_BODY4_US_2]] ]
; CHECK-NEXT: [[NARROW17:%.*]] = add nuw nsw i32 [[K_013_US_2]], 30
; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[NARROW17]] to i64
; CHECK-NEXT: [[TMP18:%.*]] = icmp ult i32 [[K_013_US_2]], 195
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP18]])
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP17]]
; CHECK-NEXT: [[MATRIXEXT_US_2:%.*]] = load double, double* [[TMP19]], align 8
; CHECK-NEXT: [[MATRIXEXT8_US_2:%.*]] = load double, double* [[TMP16]], align 8
; CHECK-NEXT: [[MUL_US_2:%.*]] = fmul double [[MATRIXEXT_US_2]], [[MATRIXEXT8_US_2]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP17]]
; CHECK-NEXT: [[MATRIXEXT11_US_2:%.*]] = load double, double* [[TMP20]], align 8
; CHECK-NEXT: [[SUB_US_2:%.*]] = fsub double [[MATRIXEXT11_US_2]], [[MUL_US_2]]
; CHECK-NEXT: store double [[SUB_US_2]], double* [[TMP20]], align 8
; CHECK-NEXT: [[INC_US_2]] = add nuw nsw i32 [[K_013_US_2]], 1
; CHECK-NEXT: [[CMP2_US_2:%.*]] = icmp ult i32 [[INC_US_2]], [[I]]
; CHECK-NEXT: br i1 [[CMP2_US_2]], label [[FOR_BODY4_US_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2:%.*]]
; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.2:
; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[CONV6]], 45
; CHECK-NEXT: [[TMP22:%.*]] = icmp ult i32 [[I]], 180
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]])
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP21]]
; CHECK-NEXT: br label [[FOR_BODY4_US_3:%.*]]
; CHECK: for.body4.us.3:
; CHECK-NEXT: [[K_013_US_3:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[INC_US_3:%.*]], [[FOR_BODY4_US_3]] ]
; CHECK-NEXT: [[NARROW18:%.*]] = add nuw nsw i32 [[K_013_US_3]], 45
; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[NARROW18]] to i64
; CHECK-NEXT: [[TMP25:%.*]] = icmp ult i32 [[K_013_US_3]], 180
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP25]])
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP24]]
; CHECK-NEXT: [[MATRIXEXT_US_3:%.*]] = load double, double* [[TMP26]], align 8
; CHECK-NEXT: [[MATRIXEXT8_US_3:%.*]] = load double, double* [[TMP23]], align 8
; CHECK-NEXT: [[MUL_US_3:%.*]] = fmul double [[MATRIXEXT_US_3]], [[MATRIXEXT8_US_3]]
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP24]]
; CHECK-NEXT: [[MATRIXEXT11_US_3:%.*]] = load double, double* [[TMP27]], align 8
; CHECK-NEXT: [[SUB_US_3:%.*]] = fsub double [[MATRIXEXT11_US_3]], [[MUL_US_3]]
; CHECK-NEXT: store double [[SUB_US_3]], double* [[TMP27]], align 8
; CHECK-NEXT: [[INC_US_3]] = add nuw nsw i32 [[K_013_US_3]], 1
; CHECK-NEXT: [[CMP2_US_3:%.*]] = icmp ult i32 [[INC_US_3]], [[I]]
; CHECK-NEXT: br i1 [[CMP2_US_3]], label [[FOR_BODY4_US_3]], label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
;
entry:
  %i.addr = alloca i32, align 4
  %A.addr = alloca [225 x double]*, align 8
  %B.addr = alloca [225 x double]*, align 8
  %j = alloca i32, align 4
  %cleanup.dest.slot = alloca i32, align 4
  %k = alloca i32, align 4
  store i32 %i, i32* %i.addr, align 4
  store [225 x double]* %A, [225 x double]** %A.addr, align 8
  store [225 x double]* %B, [225 x double]** %B.addr, align 8
  %0 = bitcast i32* %j to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3
  store i32 0, i32* %j, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc12, %entry
  %1 = load i32, i32* %j, align 4
  %cmp = icmp ult i32 %1, 4
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond
  store i32 2, i32* %cleanup.dest.slot, align 4
  %2 = bitcast i32* %j to i8*
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #3
  br label %for.end14

for.body:                                         ; preds = %for.cond
  %3 = bitcast i32* %k to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3
  store i32 0, i32* %k, align 4
  br label %for.cond1

for.cond1:                                        ; preds = %for.inc, %for.body
  %4 = load i32, i32* %k, align 4
  %5 = load i32, i32* %i.addr, align 4
  %cmp2 = icmp ult i32 %4, %5
  br i1 %cmp2, label %for.body4, label %for.cond.cleanup3

for.cond.cleanup3:                                ; preds = %for.cond1
  store i32 5, i32* %cleanup.dest.slot, align 4
  %6 = bitcast i32* %k to i8*
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %6) #3
  br label %for.end

for.body4:                                        ; preds = %for.cond1
  %7 = load i32, i32* %k, align 4
  %conv = zext i32 %7 to i64
  %8 = load i32, i32* %j, align 4
  %conv5 = zext i32 %8 to i64
  %9 = mul i64 %conv5, 15
  %10 = add i64 %9, %conv
  %11 = icmp ult i64 %10, 225
  call void @llvm.assume(i1 %11)
  %12 = load [225 x double]*, [225 x double]** %A.addr, align 8
  %13 = bitcast [225 x double]* %12 to <225 x double>*
  %14 = load <225 x double>, <225 x double>* %13, align 8
  %matrixext = extractelement <225 x double> %14, i64 %10
  %15 = load i32, i32* %i.addr, align 4
  %conv6 = zext i32 %15 to i64
  %16 = load i32, i32* %j, align 4
  %conv7 = zext i32 %16 to i64
  %17 = mul i64 %conv7, 15
  %18 = add i64 %17, %conv6
  %19 = icmp ult i64 %18, 225
  call void @llvm.assume(i1 %19)
  %20 = load [225 x double]*, [225 x double]** %B.addr, align 8
  %21 = bitcast [225 x double]* %20 to <225 x double>*
  %22 = load <225 x double>, <225 x double>* %21, align 8
  %matrixext8 = extractelement <225 x double> %22, i64 %18
  %mul = fmul double %matrixext, %matrixext8
  %23 = load [225 x double]*, [225 x double]** %B.addr, align 8
  %24 = load i32, i32* %k, align 4
  %conv9 = zext i32 %24 to i64
  %25 = load i32, i32* %j, align 4
  %conv10 = zext i32 %25 to i64
  %26 = mul i64 %conv10, 15
  %27 = add i64 %26, %conv9
  %28 = bitcast [225 x double]* %23 to <225 x double>*
  %29 = icmp ult i64 %27, 225
  call void @llvm.assume(i1 %29)
  %30 = load <225 x double>, <225 x double>* %28, align 8
  %matrixext11 = extractelement <225 x double> %30, i64 %27
  %sub = fsub double %matrixext11, %mul
  %31 = icmp ult i64 %27, 225
  call void @llvm.assume(i1 %31)
  %32 = load <225 x double>, <225 x double>* %28, align 8
  %matins = insertelement <225 x double> %32, double %sub, i64 %27
  store <225 x double> %matins, <225 x double>* %28, align 8
  br label %for.inc

for.inc:                                          ; preds = %for.body4
  %33 = load i32, i32* %k, align 4
  %inc = add i32 %33, 1
  store i32 %inc, i32* %k, align 4
  br label %for.cond1

for.end:                                          ; preds = %for.cond.cleanup3
  br label %for.inc12

for.inc12:                                        ; preds = %for.end
  %34 = load i32, i32* %j, align 4
  %inc13 = add i32 %34, 1
  store i32 %inc13, i32* %j, align 4
  br label %for.cond

for.end14:                                        ; preds = %for.cond.cleanup
  ret void
}

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1

; Function Attrs: inaccessiblememonly nofree nosync nounwind willreturn
declare void @llvm.assume(i1 noundef) #2

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1

; Function Attrs: nounwind ssp uwtable mustprogress

; The scalar horizontal-add pattern (extractelement/fadd/insertelement)
; followed by a reverse shuffle should be folded into two shuffles feeding a
; single vector fadd.
define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @reverse_hadd_v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 6, i32 4>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> [[A]], <4 x i32> <i32 3, i32 1, i32 7, i32 5>
; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <4 x float> [[TMP3]]
;
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  %shuffle = shufflevector <4 x float> %vecinit13, <4 x float> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %shuffle
}