; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -O3 -S | FileCheck %s
; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s

; Test that IR is optimal after vectorization/unrolling/CSE/canonicalization.
; In particular, there should be no fdivs inside loops because that is expensive.
;
; The input below is a simple counted loop that stores y[n] * (1.0 / a) into
; x[n] for n in [0, N). The reciprocal is computed once in the entry block, so
; the assertions expect every fdiv in the optimized output to sit in a
; preheader-style block (vector.ph, for.body.prol.preheader,
; for.body.preheader15.new) and every loop body to contain only fmuls.

; TODO: There is a CSE opportunity to reduce the hoisted fdivs after vectorization/unrolling.
; (The assertions currently accept four identical splats of %a and four
; identical reciprocal fdivs per preheader.)
; PR46115 - https://llvm.org/PR46115

target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.15.0"

; void vdiv(double *x, double *y, double a, int N): x[n] = y[n] * (1.0 / a)
define void @vdiv(double* %x, double* %y, double %a, i32 %N) #0 {
; CHECK-LABEL: @vdiv(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.preheader:
; CHECK-NEXT:    [[X4:%.*]] = ptrtoint double* [[X:%.*]] to i64
; CHECK-NEXT:    [[Y5:%.*]] = ptrtoint double* [[Y:%.*]] to i64
; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
; CHECK-NEXT:    [[PTR_DIFF:%.*]] = sub i64 [[X4]], [[Y5]]
; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[PTR_DIFF]], 128
; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[MIN_ITERS_CHECK]], i1 true, i1 [[DIFF_CHECK]]
; CHECK-NEXT:    br i1 [[OR_COND]], label [[FOR_BODY_PREHEADER15:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT12]], <4 x double> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT14]], <4 x double> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT17:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT16]], <4 x double> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP0:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
; CHECK-NEXT:    [[TMP1:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT13]]
; CHECK-NEXT:    [[TMP2:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT15]]
; CHECK-NEXT:    [[TMP3:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT17]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP5]], align 8, !tbaa [[TBAA3:![0-9]+]]
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 4
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <4 x double>*
; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x double>, <4 x double>* [[TMP7]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 8
; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[TMP8]] to <4 x double>*
; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x double>, <4 x double>* [[TMP9]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 12
; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[TMP10]] to <4 x double>*
; CHECK-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x double>, <4 x double>* [[TMP11]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP0]]
; CHECK-NEXT:    [[TMP13:%.*]] = fmul fast <4 x double> [[WIDE_LOAD9]], [[TMP1]]
; CHECK-NEXT:    [[TMP14:%.*]] = fmul fast <4 x double> [[WIDE_LOAD10]], [[TMP2]]
; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <4 x double> [[WIDE_LOAD11]], [[TMP3]]
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
; CHECK-NEXT:    store <4 x double> [[TMP12]], <4 x double>* [[TMP17]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP16]], i64 4
; CHECK-NEXT:    [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
; CHECK-NEXT:    store <4 x double> [[TMP13]], <4 x double>* [[TMP19]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP16]], i64 8
; CHECK-NEXT:    [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>*
; CHECK-NEXT:    store <4 x double> [[TMP14]], <4 x double>* [[TMP21]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP16]], i64 12
; CHECK-NEXT:    [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>*
; CHECK-NEXT:    store <4 x double> [[TMP15]], <4 x double>* [[TMP23]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER15]]
; CHECK:       for.body.preheader15:
; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    [[TMP25:%.*]] = xor i64 [[INDVARS_IV_PH]], -1
; CHECK-NEXT:    [[TMP26:%.*]] = add nsw i64 [[TMP25]], [[WIDE_TRIP_COUNT]]
; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 3
; CHECK-NEXT:    [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
; CHECK-NEXT:    br i1 [[LCMP_MOD_NOT]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL_PREHEADER:%.*]]
; CHECK:       for.body.prol.preheader:
; CHECK-NEXT:    [[TMP27:%.*]] = fdiv fast double 1.000000e+00, [[A]]
; CHECK-NEXT:    br label [[FOR_BODY_PROL:%.*]]
; CHECK:       for.body.prol:
; CHECK-NEXT:    [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_BODY_PROL]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PROL_PREHEADER]] ]
; CHECK-NEXT:    [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_NEXT:%.*]], [[FOR_BODY_PROL]] ], [ 0, [[FOR_BODY_PROL_PREHEADER]] ]
; CHECK-NEXT:    [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV_PROL]]
; CHECK-NEXT:    [[T0_PROL:%.*]] = load double, double* [[ARRAYIDX_PROL]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast double [[T0_PROL]], [[TMP27]]
; CHECK-NEXT:    [[ARRAYIDX2_PROL:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV_PROL]]
; CHECK-NEXT:    store double [[TMP28]], double* [[ARRAYIDX2_PROL]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1
; CHECK-NEXT:    [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
; CHECK-NEXT:    [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
; CHECK-NEXT:    br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK:       for.body.prol.loopexit:
; CHECK-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER15]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ]
; CHECK-NEXT:    [[TMP29:%.*]] = icmp ult i64 [[TMP26]], 3
; CHECK-NEXT:    br i1 [[TMP29]], label [[FOR_END]], label [[FOR_BODY_PREHEADER15_NEW:%.*]]
; CHECK:       for.body.preheader15.new:
; CHECK-NEXT:    [[TMP30:%.*]] = fdiv fast double 1.000000e+00, [[A]]
; CHECK-NEXT:    [[TMP31:%.*]] = fdiv fast double 1.000000e+00, [[A]]
; CHECK-NEXT:    [[TMP32:%.*]] = fdiv fast double 1.000000e+00, [[A]]
; CHECK-NEXT:    [[TMP33:%.*]] = fdiv fast double 1.000000e+00, [[A]]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER15_NEW]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[T0:%.*]] = load double, double* [[ARRAYIDX]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[TMP34:%.*]] = fmul fast double [[T0]], [[TMP30]]
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    store double [[TMP34]], double* [[ARRAYIDX2]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV_NEXT]]
; CHECK-NEXT:    [[T0_1:%.*]] = load double, double* [[ARRAYIDX_1]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast double [[T0_1]], [[TMP31]]
; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV_NEXT]]
; CHECK-NEXT:    store double [[TMP35]], double* [[ARRAYIDX2_1]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV_NEXT_1]]
; CHECK-NEXT:    [[T0_2:%.*]] = load double, double* [[ARRAYIDX_2]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[TMP36:%.*]] = fmul fast double [[T0_2]], [[TMP32]]
; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV_NEXT_1]]
; CHECK-NEXT:    store double [[TMP36]], double* [[ARRAYIDX2_2]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV_NEXT_2]]
; CHECK-NEXT:    [[T0_3:%.*]] = load double, double* [[ARRAYIDX_3]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast double [[T0_3]], [[TMP33]]
; CHECK-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV_NEXT_2]]
; CHECK-NEXT:    store double [[TMP37]], double* [[ARRAYIDX2_3]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
; CHECK-NEXT:    [[EXITCOND_NOT_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], [[WIDE_TRIP_COUNT]]
; CHECK-NEXT:    br i1 [[EXITCOND_NOT_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  ; Loop-invariant reciprocal of %a, computed once before the loop.
  %div = fdiv fast double 1.0, %a
  br label %for.cond

for.cond:
  ; %n.0 is the i32 induction variable; the loop runs while %n.0 < %N (signed).
  %n.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %n.0, %N
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:
  br label %for.end

for.body:
  ; x[n] = y[n] * %div -- a multiply by the reciprocal, no division here.
  %idxprom = sext i32 %n.0 to i64
  %arrayidx = getelementptr inbounds double, double* %y, i64 %idxprom
  %t0 = load double, double* %arrayidx, align 8, !tbaa !3
  %mul = fmul fast double %t0, %div
  %idxprom1 = sext i32 %n.0 to i64
  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %idxprom1
  store double %mul, double* %arrayidx2, align 8, !tbaa !3
  br label %for.inc

for.inc:
  %inc = add nsw i32 %n.0, 1
  br label %for.cond

for.end:
  ret void
}

attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="true" "use-soft-float"="false" }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git 45ebe38ffc40bb7221fc587bfb4481cf7f53ebbc)"}
!3 = !{!4, !4, i64 0}
!4 = !{!"double", !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C/C++ TBAA"}