; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes='default<O3>' -enable-matrix -S %s | FileCheck %s

target triple = "arm64-apple-ios"

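; The two matrix tests below operate on 15 x 15 column-major matrices of double,
; stored in memory as [225 x double]. The checks verify that, after the O3 pipeline
; with -enable-matrix, the wide <225 x double> loads/stores feeding the
; extractelement/insertelement element accesses are replaced by single scalar
; loads and stores of the addressed element.
;
; Rough C-level equivalent of @matrix_extract_insert_scalar, using the Clang matrix
; extension (a reconstruction from the IR below, not the original source; the names
; and signature are assumptions):
;
;   typedef double m15x15_t __attribute__((matrix_type(15, 15)));
;
;   void matrix_extract_insert_scalar(unsigned i, unsigned k, unsigned j,
;                                     m15x15_t *A, m15x15_t *B) {
;     (*B)[k][j] -= (*A)[k][j] * (*B)[i][j];
;   }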
define void @matrix_extract_insert_scalar(i32 %i, i32 %k, i32 %j, [225 x double]* nonnull align 8 dereferenceable(1800) %A, [225 x double]* nonnull align 8 dereferenceable(1800) %B) #0 {
; CHECK-LABEL: @matrix_extract_insert_scalar(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CONV:%.*]] = zext i32 [[K:%.*]] to i64
; CHECK-NEXT:    [[CONV1:%.*]] = zext i32 [[J:%.*]] to i64
; CHECK-NEXT:    [[TMP0:%.*]] = mul nuw nsw i64 [[CONV1]], 15
; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], [[CONV]]
; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 225
; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP2]])
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast [225 x double]* [[A:%.*]] to <225 x double>*
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP3]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[MATRIXEXT:%.*]] = load double, double* [[TMP4]], align 8
; CHECK-NEXT:    [[CONV2:%.*]] = zext i32 [[I:%.*]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[TMP0]], [[CONV2]]
; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 225
; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP6]])
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>*
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP5]]
; CHECK-NEXT:    [[MATRIXEXT4:%.*]] = load double, double* [[TMP8]], align 8
; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[MATRIXEXT]], [[MATRIXEXT4]]
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[MATRIXEXT7:%.*]] = load double, double* [[TMP9]], align 8
; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]]
; CHECK-NEXT:    store double [[SUB]], double* [[TMP9]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %i.addr = alloca i32, align 4
  %k.addr = alloca i32, align 4
  %j.addr = alloca i32, align 4
  %A.addr = alloca [225 x double]*, align 8
  %B.addr = alloca [225 x double]*, align 8
  store i32 %i, i32* %i.addr, align 4
  store i32 %k, i32* %k.addr, align 4
  store i32 %j, i32* %j.addr, align 4
  store [225 x double]* %A, [225 x double]** %A.addr, align 8
  store [225 x double]* %B, [225 x double]** %B.addr, align 8
  %0 = load i32, i32* %k.addr, align 4
  %conv = zext i32 %0 to i64
  %1 = load i32, i32* %j.addr, align 4
  %conv1 = zext i32 %1 to i64
  %2 = mul i64 %conv1, 15
  %3 = add i64 %2, %conv
  %4 = icmp ult i64 %3, 225
  call void @llvm.assume(i1 %4)
  %5 = load [225 x double]*, [225 x double]** %A.addr, align 8
  %6 = bitcast [225 x double]* %5 to <225 x double>*
  %7 = load <225 x double>, <225 x double>* %6, align 8
  %matrixext = extractelement <225 x double> %7, i64 %3
  %8 = load i32, i32* %i.addr, align 4
  %conv2 = zext i32 %8 to i64
  %9 = load i32, i32* %j.addr, align 4
  %conv3 = zext i32 %9 to i64
  %10 = mul i64 %conv3, 15
  %11 = add i64 %10, %conv2
  %12 = icmp ult i64 %11, 225
  call void @llvm.assume(i1 %12)
  %13 = load [225 x double]*, [225 x double]** %B.addr, align 8
  %14 = bitcast [225 x double]* %13 to <225 x double>*
  %15 = load <225 x double>, <225 x double>* %14, align 8
  %matrixext4 = extractelement <225 x double> %15, i64 %11
  %mul = fmul double %matrixext, %matrixext4
  %16 = load [225 x double]*, [225 x double]** %B.addr, align 8
  %17 = load i32, i32* %k.addr, align 4
  %conv5 = zext i32 %17 to i64
  %18 = load i32, i32* %j.addr, align 4
  %conv6 = zext i32 %18 to i64
  %19 = mul i64 %conv6, 15
  %20 = add i64 %19, %conv5
  %21 = bitcast [225 x double]* %16 to <225 x double>*
  %22 = icmp ult i64 %20, 225
  call void @llvm.assume(i1 %22)
  %23 = load <225 x double>, <225 x double>* %21, align 8
  %matrixext7 = extractelement <225 x double> %23, i64 %20
  %sub = fsub double %matrixext7, %mul
  %24 = icmp ult i64 %20, 225
  call void @llvm.assume(i1 %24)
  %25 = load <225 x double>, <225 x double>* %21, align 8
  %matins = insertelement <225 x double> %25, double %sub, i64 %20
  store <225 x double> %matins, <225 x double>* %21, align 8
  ret void
}
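
; Rough C-level equivalent of @matrix_extract_insert_loop (a reconstruction from the
; IR; names are assumptions): the same update as above, applied for j = 0..3 and
; k = 0..i-1. The checks verify that the element accesses stay scalar and that the
; four-iteration outer loop is unrolled completely, leaving four copies of the inner
; loop.
;
;   void matrix_extract_insert_loop(unsigned i, m15x15_t *A, m15x15_t *B) {
;     for (unsigned j = 0; j < 4; ++j)
;       for (unsigned k = 0; k < i; ++k)
;         (*B)[k][j] -= (*A)[k][j] * (*B)[i][j];
;   }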
define void @matrix_extract_insert_loop(i32 %i, [225 x double]* nonnull align 8 dereferenceable(1800) %A, [225 x double]* nonnull align 8 dereferenceable(1800) %B) {
; CHECK-LABEL: @matrix_extract_insert_loop(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP212_NOT:%.*]] = icmp eq i32 [[I:%.*]], 0
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [225 x double]* [[A:%.*]] to <225 x double>*
; CHECK-NEXT:    [[CONV6:%.*]] = zext i32 [[I]] to i64
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>*
; CHECK-NEXT:    br i1 [[CMP212_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US:%.*]]
; CHECK:       for.cond1.preheader.us:
; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 [[I]], 225
; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP2]])
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV6]]
; CHECK-NEXT:    br label [[FOR_BODY4_US:%.*]]
; CHECK:       for.body4.us:
; CHECK-NEXT:    [[K_013_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY4_US]] ]
; CHECK-NEXT:    [[CONV_US:%.*]] = zext i32 [[K_013_US]] to i64
; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[K_013_US]], 225
; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP4]])
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[CONV_US]]
; CHECK-NEXT:    [[MATRIXEXT_US:%.*]] = load double, double* [[TMP5]], align 8
; CHECK-NEXT:    [[MATRIXEXT8_US:%.*]] = load double, double* [[TMP3]], align 8
; CHECK-NEXT:    [[MUL_US:%.*]] = fmul double [[MATRIXEXT_US]], [[MATRIXEXT8_US]]
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV_US]]
; CHECK-NEXT:    [[MATRIXEXT11_US:%.*]] = load double, double* [[TMP6]], align 8
; CHECK-NEXT:    [[SUB_US:%.*]] = fsub double [[MATRIXEXT11_US]], [[MUL_US]]
; CHECK-NEXT:    store double [[SUB_US]], double* [[TMP6]], align 8
; CHECK-NEXT:    [[INC_US]] = add nuw nsw i32 [[K_013_US]], 1
; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ult i32 [[INC_US]], [[I]]
; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]]
; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us:
; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[CONV6]], 15
; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[I]], 210
; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP8]])
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP7]]
; CHECK-NEXT:    br label [[FOR_BODY4_US_1:%.*]]
; CHECK:       for.body4.us.1:
; CHECK-NEXT:    [[K_013_US_1:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[INC_US_1:%.*]], [[FOR_BODY4_US_1]] ]
; CHECK-NEXT:    [[NARROW:%.*]] = add nuw nsw i32 [[K_013_US_1]], 15
; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[NARROW]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult i32 [[K_013_US_1]], 210
; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP10]]
; CHECK-NEXT:    [[MATRIXEXT_US_1:%.*]] = load double, double* [[TMP12]], align 8
; CHECK-NEXT:    [[MATRIXEXT8_US_1:%.*]] = load double, double* [[TMP9]], align 8
; CHECK-NEXT:    [[MUL_US_1:%.*]] = fmul double [[MATRIXEXT_US_1]], [[MATRIXEXT8_US_1]]
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP10]]
; CHECK-NEXT:    [[MATRIXEXT11_US_1:%.*]] = load double, double* [[TMP13]], align 8
; CHECK-NEXT:    [[SUB_US_1:%.*]] = fsub double [[MATRIXEXT11_US_1]], [[MUL_US_1]]
; CHECK-NEXT:    store double [[SUB_US_1]], double* [[TMP13]], align 8
; CHECK-NEXT:    [[INC_US_1]] = add nuw nsw i32 [[K_013_US_1]], 1
; CHECK-NEXT:    [[CMP2_US_1:%.*]] = icmp ult i32 [[INC_US_1]], [[I]]
; CHECK-NEXT:    br i1 [[CMP2_US_1]], label [[FOR_BODY4_US_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1:%.*]]
; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us.1:
; CHECK-NEXT:    [[TMP14:%.*]] = add nuw nsw i64 [[CONV6]], 30
; CHECK-NEXT:    [[TMP15:%.*]] = icmp ult i32 [[I]], 195
; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP15]])
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP14]]
; CHECK-NEXT:    br label [[FOR_BODY4_US_2:%.*]]
; CHECK:       for.body4.us.2:
; CHECK-NEXT:    [[K_013_US_2:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[INC_US_2:%.*]], [[FOR_BODY4_US_2]] ]
; CHECK-NEXT:    [[NARROW17:%.*]] = add nuw nsw i32 [[K_013_US_2]], 30
; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[NARROW17]] to i64
; CHECK-NEXT:    [[TMP18:%.*]] = icmp ult i32 [[K_013_US_2]], 195
; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP18]])
; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP17]]
; CHECK-NEXT:    [[MATRIXEXT_US_2:%.*]] = load double, double* [[TMP19]], align 8
; CHECK-NEXT:    [[MATRIXEXT8_US_2:%.*]] = load double, double* [[TMP16]], align 8
; CHECK-NEXT:    [[MUL_US_2:%.*]] = fmul double [[MATRIXEXT_US_2]], [[MATRIXEXT8_US_2]]
; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP17]]
; CHECK-NEXT:    [[MATRIXEXT11_US_2:%.*]] = load double, double* [[TMP20]], align 8
; CHECK-NEXT:    [[SUB_US_2:%.*]] = fsub double [[MATRIXEXT11_US_2]], [[MUL_US_2]]
; CHECK-NEXT:    store double [[SUB_US_2]], double* [[TMP20]], align 8
; CHECK-NEXT:    [[INC_US_2]] = add nuw nsw i32 [[K_013_US_2]], 1
; CHECK-NEXT:    [[CMP2_US_2:%.*]] = icmp ult i32 [[INC_US_2]], [[I]]
; CHECK-NEXT:    br i1 [[CMP2_US_2]], label [[FOR_BODY4_US_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2:%.*]]
; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us.2:
; CHECK-NEXT:    [[TMP21:%.*]] = add nuw nsw i64 [[CONV6]], 45
; CHECK-NEXT:    [[TMP22:%.*]] = icmp ult i32 [[I]], 180
; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP22]])
; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP21]]
; CHECK-NEXT:    br label [[FOR_BODY4_US_3:%.*]]
; CHECK:       for.body4.us.3:
; CHECK-NEXT:    [[K_013_US_3:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[INC_US_3:%.*]], [[FOR_BODY4_US_3]] ]
; CHECK-NEXT:    [[NARROW18:%.*]] = add nuw nsw i32 [[K_013_US_3]], 45
; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[NARROW18]] to i64
; CHECK-NEXT:    [[TMP25:%.*]] = icmp ult i32 [[K_013_US_3]], 180
; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP25]])
; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP24]]
; CHECK-NEXT:    [[MATRIXEXT_US_3:%.*]] = load double, double* [[TMP26]], align 8
; CHECK-NEXT:    [[MATRIXEXT8_US_3:%.*]] = load double, double* [[TMP23]], align 8
; CHECK-NEXT:    [[MUL_US_3:%.*]] = fmul double [[MATRIXEXT_US_3]], [[MATRIXEXT8_US_3]]
; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP24]]
; CHECK-NEXT:    [[MATRIXEXT11_US_3:%.*]] = load double, double* [[TMP27]], align 8
; CHECK-NEXT:    [[SUB_US_3:%.*]] = fsub double [[MATRIXEXT11_US_3]], [[MUL_US_3]]
; CHECK-NEXT:    store double [[SUB_US_3]], double* [[TMP27]], align 8
; CHECK-NEXT:    [[INC_US_3]] = add nuw nsw i32 [[K_013_US_3]], 1
; CHECK-NEXT:    [[CMP2_US_3:%.*]] = icmp ult i32 [[INC_US_3]], [[I]]
; CHECK-NEXT:    br i1 [[CMP2_US_3]], label [[FOR_BODY4_US_3]], label [[FOR_COND_CLEANUP]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %i.addr = alloca i32, align 4
  %A.addr = alloca [225 x double]*, align 8
  %B.addr = alloca [225 x double]*, align 8
  %j = alloca i32, align 4
  %cleanup.dest.slot = alloca i32, align 4
  %k = alloca i32, align 4
  store i32 %i, i32* %i.addr, align 4
  store [225 x double]* %A, [225 x double]** %A.addr, align 8
  store [225 x double]* %B, [225 x double]** %B.addr, align 8
  %0 = bitcast i32* %j to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3
  store i32 0, i32* %j, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc12, %entry
  %1 = load i32, i32* %j, align 4
  %cmp = icmp ult i32 %1, 4
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond
  store i32 2, i32* %cleanup.dest.slot, align 4
  %2 = bitcast i32* %j to i8*
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #3
  br label %for.end14

for.body:                                         ; preds = %for.cond
  %3 = bitcast i32* %k to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3
  store i32 0, i32* %k, align 4
  br label %for.cond1

for.cond1:                                        ; preds = %for.inc, %for.body
  %4 = load i32, i32* %k, align 4
  %5 = load i32, i32* %i.addr, align 4
  %cmp2 = icmp ult i32 %4, %5
  br i1 %cmp2, label %for.body4, label %for.cond.cleanup3

for.cond.cleanup3:                                ; preds = %for.cond1
  store i32 5, i32* %cleanup.dest.slot, align 4
  %6 = bitcast i32* %k to i8*
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %6) #3
  br label %for.end

for.body4:                                        ; preds = %for.cond1
  %7 = load i32, i32* %k, align 4
  %conv = zext i32 %7 to i64
  %8 = load i32, i32* %j, align 4
  %conv5 = zext i32 %8 to i64
  %9 = mul i64 %conv5, 15
  %10 = add i64 %9, %conv
  %11 = icmp ult i64 %10, 225
  call void @llvm.assume(i1 %11)
  %12 = load [225 x double]*, [225 x double]** %A.addr, align 8
  %13 = bitcast [225 x double]* %12 to <225 x double>*
  %14 = load <225 x double>, <225 x double>* %13, align 8
  %matrixext = extractelement <225 x double> %14, i64 %10
  %15 = load i32, i32* %i.addr, align 4
  %conv6 = zext i32 %15 to i64
  %16 = load i32, i32* %j, align 4
  %conv7 = zext i32 %16 to i64
  %17 = mul i64 %conv7, 15
  %18 = add i64 %17, %conv6
  %19 = icmp ult i64 %18, 225
  call void @llvm.assume(i1 %19)
  %20 = load [225 x double]*, [225 x double]** %B.addr, align 8
  %21 = bitcast [225 x double]* %20 to <225 x double>*
  %22 = load <225 x double>, <225 x double>* %21, align 8
  %matrixext8 = extractelement <225 x double> %22, i64 %18
  %mul = fmul double %matrixext, %matrixext8
  %23 = load [225 x double]*, [225 x double]** %B.addr, align 8
  %24 = load i32, i32* %k, align 4
  %conv9 = zext i32 %24 to i64
  %25 = load i32, i32* %j, align 4
  %conv10 = zext i32 %25 to i64
  %26 = mul i64 %conv10, 15
  %27 = add i64 %26, %conv9
  %28 = bitcast [225 x double]* %23 to <225 x double>*
  %29 = icmp ult i64 %27, 225
  call void @llvm.assume(i1 %29)
  %30 = load <225 x double>, <225 x double>* %28, align 8
  %matrixext11 = extractelement <225 x double> %30, i64 %27
  %sub = fsub double %matrixext11, %mul
  %31 = icmp ult i64 %27, 225
  call void @llvm.assume(i1 %31)
  %32 = load <225 x double>, <225 x double>* %28, align 8
  %matins = insertelement <225 x double> %32, double %sub, i64 %27
  store <225 x double> %matins, <225 x double>* %28, align 8
  br label %for.inc

for.inc:                                          ; preds = %for.body4
  %33 = load i32, i32* %k, align 4
  %inc = add i32 %33, 1
  store i32 %inc, i32* %k, align 4
  br label %for.cond1

for.end:                                          ; preds = %for.cond.cleanup3
  br label %for.inc12

for.inc12:                                        ; preds = %for.end
  %34 = load i32, i32* %j, align 4
  %inc13 = add i32 %34, 1
  store i32 %inc13, i32* %j, align 4
  br label %for.cond

for.end14:                                        ; preds = %for.cond.cleanup
  ret void
}

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1

; Function Attrs: inaccessiblememonly nofree nosync nounwind willreturn
declare void @llvm.assume(i1 noundef) #2

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1

; Function Attrs: nounwind ssp uwtable mustprogress

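; The checks below verify that the scalar extract/fadd/insert chain over %a and %b,
; combined with the final reversing shufflevector, is recognized as a reversed
; horizontal add and rebuilt as two shuffles feeding a single vector fadd.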
define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @reverse_hadd_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 6, i32 4>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> [[A]], <4 x i32> <i32 3, i32 1, i32 7, i32 5>
; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    ret <4 x float> [[TMP3]]
;
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  %shuffle = shufflevector <4 x float> %vecinit13, <4 x float> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %shuffle
}

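; The attribute groups referenced above (#0, #1, #2, #3) need definitions for this
; module to parse. This is a minimal, hedged reconstruction based on the
; "Function Attrs" comments left by the update script; the exact original attribute
; sets for #0 and #3 are assumptions.
attributes #0 = { nounwind ssp uwtable mustprogress }
attributes #1 = { argmemonly nofree nosync nounwind willreturn }
attributes #2 = { inaccessiblememonly nofree nosync nounwind willreturn }
attributes #3 = { nounwind }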