; RUN: opt -aa-pipeline=basic-aa -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \
; RUN:   < %s | FileCheck %s

; RUN: opt -aa-pipeline=basic-aa -passes='loop-distribute,loop(print-access-info)' -enable-loop-distribute \
; RUN:   -verify-loop-info -verify-dom-info -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS

; RUN: opt -aa-pipeline=basic-aa -passes=loop-distribute,loop-vectorize -enable-loop-distribute -force-vector-width=4 -S \
; RUN:   < %s | FileCheck %s --check-prefix=VECTORIZE

; We should distribute this loop into a safe (2nd statement) and unsafe loop
; (1st statement):
;   for (i = 0; i < n; i++) {
;     A[i + 1] = A[i] * B[i];
;     =======================
;     C[i] = D[i] * E[i];
;   }

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"

; CHECK-LABEL: @f(
define void @f(i32* noalias %a,
               i32* noalias %b,
               i32* noalias %c,
               i32* noalias %d,
               i32* noalias %e) {
entry:
  br label %for.body

; Verify the two distributed loops.

; CHECK: entry.split.ldist1:
; CHECK: br label %for.body.ldist1
; CHECK: for.body.ldist1:
; CHECK: %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1
; CHECK: br i1 %exitcond.ldist1, label %entry.split, label %for.body.ldist1

; CHECK: entry.split:
; CHECK: br label %for.body
; CHECK: for.body:
; CHECK: %mulC = mul i32 %loadD, %loadE
; CHECK: for.end:


; ANALYSIS: for.body.ldist1:
; ANALYSIS-NEXT: Report: unsafe dependent memory operations in loop
; ANALYSIS: for.body:
; ANALYSIS-NEXT: Memory dependences are safe{{$}}


; VECTORIZE: mul <4 x i32>

for.body:                                         ; preds = %for.body, %entry
  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
  %loadA = load i32, i32* %arrayidxA, align 4

  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
  %loadB = load i32, i32* %arrayidxB, align 4

  %mulA = mul i32 %loadB, %loadA

  %add = add nuw nsw i64 %ind, 1
  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
  store i32 %mulA, i32* %arrayidxA_plus_4, align 4

  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
  %loadD = load i32, i32* %arrayidxD, align 4

  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
  %loadE = load i32, i32* %arrayidxE, align 4

  %mulC = mul i32 %loadD, %loadE

  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
  store i32 %mulC, i32* %arrayidxC, align 4

  %exitcond = icmp eq i64 %add, 20
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

declare i32 @llvm.convergent(i32) #0

; It is OK to distribute with a convergent operation, since in each
; new loop the convergent operation has the same control dependency.
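; For reference, a hypothetical C-level sketch of the loop below (same shape as
; the loop in @f, but with D[i] fed through a convergent call; "convergent_op"
; is only an illustrative name for the @llvm.convergent test intrinsic):
;   for (i = 0; i < n; i++) {
;     A[i + 1] = A[i] * B[i];
;     =======================
;     C[i] = convergent_op(D[i]) * E[i];
;   }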
; CHECK-LABEL: @f_with_convergent(
define void @f_with_convergent(i32* noalias %a,
                               i32* noalias %b,
                               i32* noalias %c,
                               i32* noalias %d,
                               i32* noalias %e) {
entry:
  br label %for.body

; Verify the two distributed loops.

; CHECK: entry.split.ldist1:
; CHECK: br label %for.body.ldist1
; CHECK: for.body.ldist1:
; CHECK: %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1
; CHECK: br i1 %exitcond.ldist1, label %entry.split, label %for.body.ldist1

; CHECK: entry.split:
; CHECK: br label %for.body
; CHECK: for.body:
; CHECK: %convergentD = call i32 @llvm.convergent(i32 %loadD)
; CHECK: %mulC = mul i32 %convergentD, %loadE
; CHECK: for.end:


; ANALYSIS: for.body.ldist1:
; ANALYSIS-NEXT: Report: unsafe dependent memory operations in loop
; ANALYSIS: for.body:
; ANALYSIS-NEXT: Has convergent operation in loop
; ANALYSIS-NEXT: Report: cannot add control dependency to convergent operation

; The convergent instruction happens to block vectorization.
; VECTORIZE: call i32 @llvm.convergent
; VECTORIZE: mul i32

for.body:                                         ; preds = %for.body, %entry
  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
  %loadA = load i32, i32* %arrayidxA, align 4

  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
  %loadB = load i32, i32* %arrayidxB, align 4

  %mulA = mul i32 %loadB, %loadA

  %add = add nuw nsw i64 %ind, 1
  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
  store i32 %mulA, i32* %arrayidxA_plus_4, align 4

  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
  %loadD = load i32, i32* %arrayidxD, align 4

  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
  %loadE = load i32, i32* %arrayidxE, align 4

  %convergentD = call i32 @llvm.convergent(i32 %loadD)
  %mulC = mul i32 %convergentD, %loadE

  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
  store i32 %mulC, i32* %arrayidxC, align 4

  %exitcond = icmp eq i64 %add, 20
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

attributes #0 = { nounwind readnone convergent }