; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s

; The functions below are fed through the loop vectorizer with the vector
; width forced to 4 and the interleave count forced to 1 (see the run line
; above). The FileCheck directives inside each function pin the IR that opt
; is expected to print for it.

@p = external local_unnamed_addr global [257 x i32], align 16
@q = external local_unnamed_addr global [257 x i32], align 16

; Test case for PR43398.
;
; The scalar loop carries %pre.phi, the value loaded from @p on the previous
; iteration, and adds %x to it (%add.1) before the current iteration's load
; (%pre.next) is issued. The directives expect a vectorized loop in which the
; add of the recurrence value and the splat of %x (%5) is emitted after the
; wide load from @p and after the shuffle that merges %vector.recur with
; %wide.load. The %ptr and %tc arguments are not used by the body.

define void @can_sink_after_store(i32 %x, i32* %ptr, i64 %tc) local_unnamed_addr #0 {
; CHECK-LABEL: vector.ph:
; CHECK: %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %x, i32 0
; CHECK-NEXT: %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
; CHECK-NEXT: br label %vector.body

; CHECK-LABEL: vector.body:
; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK-NEXT: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
; CHECK-NEXT: %offset.idx = add i64 1, %index
; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %offset.idx, i32 0
; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: %0 = add i64 %offset.idx, 0
; CHECK-NEXT: %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0
; CHECK-NEXT: %2 = getelementptr inbounds i32, i32* %1, i32 0
; CHECK-NEXT: %3 = bitcast i32* %2 to <4 x i32>*
; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* %3, align 4
; CHECK-NEXT: %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: %5 = add <4 x i32> %4, %broadcast.splat2
; CHECK-NEXT: %6 = add <4 x i32> %5, %wide.load
; CHECK-NEXT: %7 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0
; CHECK-NEXT: %8 = getelementptr inbounds i32, i32* %7, i32 0
; CHECK-NEXT: %9 = bitcast i32* %8 to <4 x i32>*
; CHECK-NEXT: store <4 x i32> %6, <4 x i32>* %9, align 4
; CHECK-NEXT: %index.next = add i64 %index, 4
; CHECK-NEXT: %10 = icmp eq i64 %index.next, 1996
; CHECK-NEXT: br i1 %10, label %middle.block, label %vector.body
;
entry:
  br label %preheader

preheader:
  ; Seed the recurrence with @p[1], loaded once before the loop is entered.
  %idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
  %.pre = load i32, i32* %idx.phi.trans, align 4
  br label %for

for:
  ; %pre.phi is %.pre on entry and the previous iteration's %pre.next afterwards.
  %pre.phi = phi i32 [ %.pre, %preheader ], [ %pre.next, %for ]
  ; %iv starts at 1; the loop exits once %iv.next equals 2000.
  %iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
  ; This user of the recurrence appears before the load that defines %pre.next.
  %add.1 = add i32 %pre.phi, %x
  %idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
  %pre.next = load i32, i32* %idx.1, align 4
  %add.2 = add i32 %add.1, %pre.next
  %idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
  store i32 %add.2, i32* %idx.2, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 2000
  br i1 %exitcond, label %exit, label %for

exit:
  ret void
}

; We can sink potential trapping instructions, as this will only delay the trap
; and not introduce traps on additional paths.
define void @sink_sdiv(i32 %x, i32* %ptr, i64 %tc) local_unnamed_addr #0 {
; Expected output: a vectorized loop in which the sdiv of the recurrence value
; by the splat of %x (%5) follows the wide load from @p and the shuffle that
; merges %vector.recur with %wide.load. %ptr and %tc are not used by the body.
; CHECK-LABEL: vector.ph:
; CHECK: %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %x, i32 0
; CHECK-NEXT: %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
; CHECK-NEXT: br label %vector.body

; CHECK-LABEL: vector.body:
; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK-NEXT: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
; CHECK-NEXT: %offset.idx = add i64 1, %index
; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %offset.idx, i32 0
; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: %0 = add i64 %offset.idx, 0
; CHECK-NEXT: %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0
; CHECK-NEXT: %2 = getelementptr inbounds i32, i32* %1, i32 0
; CHECK-NEXT: %3 = bitcast i32* %2 to <4 x i32>*
; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* %3, align 4
; CHECK-NEXT: %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: %5 = sdiv <4 x i32> %4, %broadcast.splat2
; CHECK-NEXT: %6 = add <4 x i32> %5, %wide.load
; CHECK-NEXT: %7 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0
; CHECK-NEXT: %8 = getelementptr inbounds i32, i32* %7, i32 0
; CHECK-NEXT: %9 = bitcast i32* %8 to <4 x i32>*
; CHECK-NEXT: store <4 x i32> %6, <4 x i32>* %9, align 4
; CHECK-NEXT: %index.next = add i64 %index, 4
; CHECK-NEXT: %10 = icmp eq i64 %index.next, 1996
; CHECK-NEXT: br i1 %10, label %middle.block, label %vector.body
;
entry:
  br label %preheader

preheader:
  ; Seed the recurrence with @p[1], loaded once before the loop is entered.
  %idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
  %.pre = load i32, i32* %idx.phi.trans, align 4
  br label %for

for:
  ; %pre.phi is %.pre on entry and the previous iteration's %pre.next afterwards.
  %pre.phi = phi i32 [ %.pre, %preheader ], [ %pre.next, %for ]
  %iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
  ; The division consumes the recurrence before this iteration's load.
  %div.1 = sdiv i32 %pre.phi, %x
  %idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
  %pre.next = load i32, i32* %idx.1, align 4
  %add.2 = add i32 %div.1, %pre.next
  %idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
  store i32 %add.2, i32* %idx.2, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 2000
  br i1 %exitcond, label %exit, label %for

exit:
  ret void
}

; FIXME: Currently we can only sink a single instruction. For the example below,
; we also have to sink users.
define void @cannot_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) {
; %add.1, the user of the recurrence %pre.phi, is itself used by both %add.2
; and %add.3. The directives below expect the original entry / preheader /
; for / exit block layout and the original loop branch to be printed unchanged.
; CHECK-LABEL: define void @cannot_sink_with_additional_user(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label %preheader

; CHECK-LABEL: preheader: ; preds = %entry
; CHECK: br label %for

; CHECK-LABEL: for: ; preds = %for, %preheader
; CHECK: br i1 %exitcond, label %exit, label %for

; CHECK-LABEL: exit:
; CHECK-NEXT: ret void

entry:
  br label %preheader

preheader:
  %idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
  %.pre = load i32, i32* %idx.phi.trans, align 4
  br label %for

for:
  %pre.phi = phi i32 [ %.pre, %preheader ], [ %pre.next, %for ]
  %iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
  %add.1 = add i32 %pre.phi, %x
  ; Second instruction depending on the recurrence (through %add.1) that also
  ; precedes the load of %pre.next.
  %add.2 = add i32 %add.1, %x
  %idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
  %pre.next = load i32, i32* %idx.1, align 4
  %add.3 = add i32 %add.1, %pre.next
  %add.4 = add i32 %add.2, %add.3
  %idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
  store i32 %add.4, i32* %idx.2, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 2000
  br i1 %exitcond, label %exit, label %for

exit:
  ret void
}

; FIXME: We can sink a store, if we can guarantee that it does not alias any
; loads/stores in between.
define void @cannot_sink_store(i32 %x, i32* %ptr, i64 %tc) {
; %add.1, the user of the recurrence %pre.phi, is stored through %ptr before
; the load of %pre.next. The directives below expect the original entry /
; preheader / for / exit block layout and the original loop branch to be
; printed unchanged.
; CHECK-LABEL: define void @cannot_sink_store(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label %preheader

; CHECK-LABEL: preheader: ; preds = %entry
; CHECK: br label %for

; CHECK-LABEL: for: ; preds = %for, %preheader
; CHECK: br i1 %exitcond, label %exit, label %for

; CHECK-LABEL: exit:
; CHECK-NEXT: ret void
;
entry:
  br label %preheader

preheader:
  %idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
  %.pre = load i32, i32* %idx.phi.trans, align 4
  br label %for

for:
  %pre.phi = phi i32 [ %.pre, %preheader ], [ %pre.next, %for ]
  %iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
  %add.1 = add i32 %pre.phi, %x
  ; Store through the unrelated pointer argument, ahead of the load from @p.
  store i32 %add.1, i32* %ptr
  %idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
  %pre.next = load i32, i32* %idx.1, align 4
  %add.2 = add i32 %add.1, %pre.next
  %idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
  store i32 %add.2, i32* %idx.2, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 2000
  br i1 %exitcond, label %exit, label %for

exit:
  ret void
}

; Some kinds of reductions are not detected by IVDescriptors. If we have a
; cycle, we cannot sink it.
define void @cannot_sink_reduction(i32 %x, i32* %ptr, i64 %tc) {
; The back-edge value of %pre.phi is %d, which is itself computed from
; %pre.phi, so the phi and its user form a cycle. The directives below expect
; the original entry / preheader / for / exit block layout, the original loop
; branch and the original return to be printed unchanged.
; CHECK-LABEL: define void @cannot_sink_reduction(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label %preheader

; CHECK-LABEL: preheader: ; preds = %entry
; CHECK: br label %for

; CHECK-LABEL: for: ; preds = %for, %preheader
; CHECK: br i1 %exitcond, label %exit, label %for

; CHECK-LABEL: exit: ; preds = %for
; CHECK-NEXT: ret void
;
entry:
  br label %preheader

preheader:
  %idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
  %.pre = load i32, i32* %idx.phi.trans, align 4
  br label %for

for:
  ; %pre.phi is %.pre on entry and the previous iteration's %d afterwards.
  %pre.phi = phi i32 [ %.pre, %preheader ], [ %d, %for ]
  %iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
  %d = sdiv i32 %pre.phi, %x
  %idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
  %pre.next = load i32, i32* %idx.1, align 4
  ; The stored value depends only on %x and the current load, not on %d.
  %add.2 = add i32 %x, %pre.next
  %idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
  store i32 %add.2, i32* %idx.2, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 2000
  br i1 %exitcond, label %exit, label %for

exit:
  ret void
}

; TODO: We should be able to sink %tmp38 after %tmp60.
define void @instruction_with_2_FOR_operands() {
; %tmp38 multiplies two loop-carried phis, %tmp37 and %tmp27, whose back-edge
; values are the loads %tmp60 and %tmp49 that appear later in the block. The
; directives below expect %bb to still branch straight to %bb13 and %bb13 to
; still end in the original conditional branch on %tmp12 (presumably meaning
; the loop is left unvectorized; confirm against opt's actual output).
; CHECK-LABEL: define void @instruction_with_2_FOR_operands(
; CHECK-NEXT: bb:
; CHECK-NEXT: br label %bb13

; CHECK-LABEL: bb13:
; CHECK: br i1 %tmp12, label %bb13, label %bb74

; CHECK-LABEL: bb74:
; CHECK-NEXT: ret void
;
bb:
  br label %bb13

bb13: ; preds = %bb13, %bb
  ; Both phis start as undef and take a loaded value on the back edge.
  %tmp37 = phi float [ %tmp60, %bb13 ], [ undef, %bb ]
  %tmp27 = phi float [ %tmp49, %bb13 ], [ undef, %bb ]
  %indvars.iv = phi i64 [ %indvars.iv.next, %bb13 ], [ 0, %bb ]
  ; The only user of the two phis; its result is not used afterwards.
  %tmp38 = fmul fast float %tmp37, %tmp27
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %tmp49 = load float, float* undef, align 4
  %tmp60 = load float, float* undef, align 4
  %tmp12 = icmp slt i64 %indvars.iv, undef
  br i1 %tmp12, label %bb13, label %bb74

bb74: ; preds = %bb13
  ret void
}