; Tests scalable (SVE) loop vectorization of calls that carry
; "vector-function-abi-variant" mappings (see attributes at end of file).
; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -instcombine -mattr=+sve -mtriple aarch64-unknown-linux-gnu \
; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS
; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -instcombine -mattr=+sve -mtriple aarch64-unknown-linux-gnu \
; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS

; A call on a loaded value should vectorize to a call of the mapped
; scalable vector variant (@foo -> @foo_vec via attribute #0).
define void @vec_load(i64 %N, double* nocapture %a, double* nocapture readonly %b) {
; CHECK-LABEL: @vec_load
; CHECK: vector.body:
; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
; CHECK: call <vscale x 2 x double> @foo_vec(<vscale x 2 x double> %[[LOAD]])
entry:
  %cmp7 = icmp sgt i64 %N, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %for.body.preheader, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %arrayidx = getelementptr inbounds double, double* %b, i64 %iv
  %0 = load double, double* %arrayidx, align 8
  %1 = call double @foo(double %0) #0
  %add = fadd double %1, 1.000000e+00
  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %iv
  store double %add, double* %arrayidx2, align 8
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; A call on a loop-invariant constant should vectorize to a call of the
; mapped variant on a splat of that constant.
define void @vec_scalar(i64 %N, double* nocapture %a) {
; CHECK-LABEL: @vec_scalar
; CHECK: vector.body:
; CHECK: call <vscale x 2 x double> @foo_vec(<vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+01, i32 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer))
entry:
  %cmp7 = icmp sgt i64 %N, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %for.body.preheader, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %0 = call double @foo(double 10.0) #0
  %sub = fsub double %0, 1.000000e+00
  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
  store double %sub, double* %arrayidx, align 8
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; A call taking a pointer argument should vectorize to a call of the
; mapped variant with a scalable vector of pointers (@bar -> @bar_vec).
define void @vec_ptr(i64 %N, i64* noalias %a, i64** readnone %b) {
; CHECK-LABEL: @vec_ptr
; CHECK: vector.body:
; CHECK: %[[LOAD:.*]] = load <vscale x 2 x i64*>, <vscale x 2 x i64*>*
; CHECK: call <vscale x 2 x i64> @bar_vec(<vscale x 2 x i64*> %[[LOAD]])
entry:
  %cmp7 = icmp sgt i64 %N, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep = getelementptr i64*, i64** %b, i64 %iv
  %load = load i64*, i64** %gep
  %call = call i64 @bar(i64* %load) #1
  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %iv
  store i64 %call, i64* %arrayidx
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret void
}

; An intrinsic call with a scalable-vector variant mapping should use the
; mapped function rather than being scalarized (attribute #2).
define void @vec_intrinsic(i64 %N, double* nocapture readonly %a) {
; CHECK-LABEL: @vec_intrinsic
; CHECK: vector.body:
; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
; CHECK: call fast <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double> %[[LOAD]])
entry:
  %cmp7 = icmp sgt i64 %N, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
  %0 = load double, double* %arrayidx, align 8
%1 = call fast double @llvm.sin.f64(double %0) #2 93 %add = fadd fast double %1, 1.000000e+00 94 store double %add, double* %arrayidx, align 8 95 %iv.next = add nuw nsw i64 %iv, 1 96 %exitcond = icmp eq i64 %iv.next, %N 97 br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1 98 99for.end: 100 ret void 101} 102 103; CHECK-REMARKS: UserVF ignored because of invalid costs. 104; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load 105; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32 106; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store 107define void @vec_sin_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) { 108; CHECK: @vec_sin_no_mapping 109; CHECK: call fast <2 x float> @llvm.sin.v2f32 110; CHECK-NOT: <vscale x 111entry: 112 br label %for.body 113 114for.body: ; preds = %entry, %for.body 115 %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ] 116 %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07 117 %0 = load float, float* %arrayidx, align 4, !dbg !11 118 %1 = tail call fast float @llvm.sin.f32(float %0), !dbg !12 119 %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07 120 store float %1, float* %arrayidx1, align 4, !dbg !13 121 %inc = add nuw nsw i64 %i.07, 1 122 %exitcond.not = icmp eq i64 %inc, %n 123 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1 124 125for.cond.cleanup: ; preds = %for.body 126 ret void 127} 128 129; CHECK-REMARKS: UserVF ignored because of invalid costs. 
; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:40: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
; Two unmapped sin calls under control flow: no scalable vectorization at all.
define void @vec_sin_no_mapping_ite(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sin_no_mapping_ite
; CHECK-NOT: <vscale x
; CHECK: ret
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %if.end
  %i.07 = phi i64 [ %inc, %if.end ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
  %0 = load float, float* %arrayidx, align 4, !dbg !11
  %cmp = fcmp ugt float %0, 0.0000
  br i1 %cmp, label %if.then, label %if.else
if.then:
  %1 = tail call fast float @llvm.sin.f32(float %0), !dbg !12
  br label %if.end
if.else:
  %2 = tail call fast float @llvm.sin.f32(float 0.0), !dbg !13
  br label %if.end
if.end:
  %3 = phi float [%1, %if.then], [%2, %if.else]
  %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
  store float %3, float* %arrayidx1, align 4, !dbg !14
  %inc = add nuw nsw i64 %i.07, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1

for.cond.cleanup:                                 ; preds = %for.body
  ret void
}

; CHECK-REMARKS: UserVF ignored because of invalid costs.
; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
; The call has only a fixed-width (N2) variant mapping (attribute #3), so the
; scalable user VF is rejected and a fixed-width intrinsic call is emitted.
; NOTE(review): attribute #3 names llvm.sin.f64 while this call is
; llvm.sin.f32 — presumably a deliberate non-matching mapping; confirm.
define void @vec_sin_fixed_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sin_fixed_mapping
; CHECK: call fast <2 x float> @llvm.sin.v2f32
; CHECK-NOT: <vscale x
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
  %0 = load float, float* %arrayidx, align 4, !dbg !11
  %1 = tail call fast float @llvm.sin.f32(float %0) #3, !dbg !12
  %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
  store float %1, float* %arrayidx1, align 4, !dbg !13
  %inc = add nuw nsw i64 %i.07, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1

for.cond.cleanup:                                 ; preds = %for.body
  ret void
}

; Even though there are no function mappings attached to the call
; in the loop below we can still vectorize the loop because SVE has
; hardware support in the form of the 'fsqrt' instruction.
define void @vec_sqrt_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) #0 {
; CHECK: @vec_sqrt_no_mapping
; CHECK: call fast <vscale x 2 x float> @llvm.sqrt.nxv2f32
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
  %0 = load float, float* %arrayidx, align 4
  %1 = tail call fast float @llvm.sqrt.f32(float %0)
  %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
  store float %1, float* %arrayidx1, align 4
  %inc = add nuw nsw i64 %i.07, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1

for.cond.cleanup:                                 ; preds = %for.body
  ret void
}


declare double @foo(double)
declare i64 @bar(i64*)
declare double @llvm.sin.f64(double)
declare float @llvm.sin.f32(float)
declare float @llvm.sqrt.f32(float)

declare <vscale x 2 x double> @foo_vec(<vscale x 2 x double>)
declare <vscale x 2 x i64> @bar_vec(<vscale x 2 x i64*>)
declare <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double>)
declare <2 x double> @sin_vec_v2f64(<2 x double>)

; VFABI variant mappings consumed by the vectorizer ("Nx" = scalable,
; "N2" = fixed width 2, "v" = vector parameter).
attributes #0 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_foo(foo_vec)" }
attributes #1 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_bar(bar_vec)" }
attributes #2 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_llvm.sin.f64(sin_vec_nxv2f64)" }
attributes #3 = { "vector-function-abi-variant"="_ZGV_LLVM_N2v_llvm.sin.f64(sin_vec_v2f64)" }

; Loop metadata requesting scalable vectorization at width 2.
!1 = distinct !{!1, !2, !3}
!2 = !{!"llvm.loop.vectorize.width", i32 2}
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}

!llvm.dbg.cu = !{!4}
!llvm.module.flags = !{!7}
!llvm.ident = !{!8}

; Minimal debug info so the missed-vectorization remarks carry t.c locations.
!4 = distinct !DICompileUnit(language: DW_LANG_C99, file: !5, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !6, splitDebugInlining: false, nameTableKind: None)
!5 = !DIFile(filename: "t.c", directory: "somedir")
!6 = !{}
!7 = !{i32 2, !"Debug Info Version", i32 3}
!8 = !{!"clang"}
!9 = distinct !DISubprogram(name: "foo", scope: !5, file: !5, line: 2, type: !10, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6)
!10 = !DISubroutineType(types: !6)
!11 = !DILocation(line: 3, column: 10, scope: !9)
!12 = !DILocation(line: 3, column: 20, scope: !9)
!13 = !DILocation(line: 3, column: 30, scope: !9)
!14 = !DILocation(line: 3, column: 40, scope: !9)