; RUN: opt < %s -loop-vectorize -S | FileCheck %s

; Exercises the per-loop "llvm.loop.vectorize.predicate.enable" hint.
; Both functions below contain the same 430-iteration loop; they differ only
; in the loop metadata attached to the back-edge branch. With the hint set to
; true the vector body is expected to use masked loads/stores; with the hint
; set to false it is expected not to.

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; A[i] = C[i] + B[i] for i in [0, 430), with loop metadata !6 attached
; (predicate.enable = true, vectorize.enable = true).
;
; Expected output: a vector body of width 8 that uses masked loads and a
; masked store. 430 is not a multiple of 8, so the vector induction variable
; is compared against 432, the next multiple of 8.
;
; Auto-numbered SSA names are matched with patterns rather than spelled out,
; so unrelated renumbering in the vectorizer output does not break the test.
define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
; CHECK-LABEL: tail_folding_enabled(
; CHECK:  vector.body:
; CHECK:  %wide.masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
; CHECK:  %wide.masked.load1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
; CHECK:  {{%[^ ]+}} = add nsw <8 x i32> %wide.masked.load1, %wide.masked.load
; CHECK:  call void @llvm.masked.store.v8i32.p0v8i32(
; CHECK:  %index.next = add i64 %index, 8
; CHECK:  [[EXIT_COND:%[^ ]+]] = icmp eq i64 %index.next, 432
; CHECK:  br i1 [[EXIT_COND]], label %middle.block, label %vector.body, !llvm.loop !0

entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %1, %0
  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  store i32 %add, i32* %arrayidx4, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 430
  ; !6 carries the hints under test for this function.
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !6
}

; Same loop as @tail_folding_enabled, but with loop metadata !10 attached
; (predicate.enable = false, vectorize.enable = true).
;
; Expected output: a vector.body block that contains neither the masked load
; nor the masked store intrinsic before its back-edge branch.
;
; The branch condition is an auto-numbered SSA value, so it is matched with a
; pattern rather than a literal name.
define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
; CHECK-LABEL: tail_folding_disabled(
; CHECK:      vector.body:
; CHECK-NOT:  @llvm.masked.load.v8i32.p0v8i32(
; CHECK-NOT:  @llvm.masked.store.v8i32.p0v8i32(
; CHECK:      br i1 {{%[^,]+}}, label {{.*}}, label %vector.body

entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %1, %0
  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  store i32 %add, i32* %arrayidx4, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 430
  ; !10 carries the hints under test for this function.
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}

; Loop metadata expected in the vectorizer output, in emission order.
; CHECK:      !0 = distinct !{!0, !1}
; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-NEXT: !2 = distinct !{!2, !3, !1}
; CHECK-NEXT: !3 = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK-NEXT: !4 = distinct !{!4, !1}
; CHECK-NEXT: !5 = distinct !{!5, !3, !1}

; Attribute group shared by both test functions.
attributes #0 = { nounwind optsize uwtable "target-cpu"="core-avx2" "target-features"="+avx,+avx2" }

; Input hints for the loop in @tail_folding_enabled: predication on,
; vectorization on.
!6 = distinct !{!6, !7, !8}
!7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!8 = !{!"llvm.loop.vectorize.enable", i1 true}

; Input hints for the loop in @tail_folding_disabled: predication off,
; vectorization on.
!10 = distinct !{!10, !11, !12}
!11 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}
!12 = !{!"llvm.loop.vectorize.enable", i1 true}