; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefixes=ALL,CHECK
; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefixes=ALL,STORE

; #include <stdint.h>
;
; int foo(float *A, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += 7*A[i*4 ] +
;            7*A[i*4+1] +
;            7*A[i*4+2] +
;            7*A[i*4+3];
;   }
;   return sum;
; }

define i32 @add_red(float* %A, i32 %n) {
; ALL-LABEL: @add_red(
; ALL-NEXT:  entry:
; ALL-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ALL-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; ALL:       for.body.lr.ph:
; ALL-NEXT:    [[TMP0:%.*]] = sext i32 [[N]] to i64
; ALL-NEXT:    br label [[FOR_BODY:%.*]]
; ALL:       for.body:
; ALL-NEXT:    [[I_033:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[SUM_032:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD17:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_033]], 2
; ALL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; ALL-NEXT:    [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
; ALL-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; ALL-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
; ALL-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
; ALL-NEXT:    [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]]
; ALL-NEXT:    [[INC]] = add nsw i64 [[I_033]], 1
; ALL-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; ALL-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; ALL:       for.cond.for.end_crit_edge:
; ALL-NEXT:    [[PHITMP:%.*]] = fptosi float [[ADD17]] to i32
; ALL-NEXT:    br label [[FOR_END]]
; ALL:       for.end:
; ALL-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; ALL-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp31 = icmp sgt i32 %n, 0
  br i1 %cmp31, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
  %mul = shl nsw i64 %i.033, 2
  %arrayidx = getelementptr inbounds float, float* %A, i64 %mul
  %1 = load float, float* %arrayidx, align 4
  %mul2 = fmul float %1, 7.000000e+00
  %add28 = or i64 %mul, 1
  %arrayidx4 = getelementptr inbounds float, float* %A, i64 %add28
  %2 = load float, float* %arrayidx4, align 4
  %mul5 = fmul float %2, 7.000000e+00
  %add6 = fadd fast float %mul2, %mul5
  %add829 = or i64 %mul, 2
  %arrayidx9 = getelementptr inbounds float, float* %A, i64 %add829
  %3 = load float, float* %arrayidx9, align 4
  %mul10 = fmul float %3, 7.000000e+00
  %add11 = fadd fast float %add6, %mul10
  %add1330 = or i64 %mul, 3
  %arrayidx14 = getelementptr inbounds float, float* %A, i64 %add1330
  %4 = load float, float* %arrayidx14, align 4
  %mul15 = fmul float %4, 7.000000e+00
  %add16 = fadd fast float %add11, %mul15
  %add17 = fadd fast float %sum.032, %add16
  %inc = add nsw i64 %i.033, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add17 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}
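; In @add_red above, each iteration sums four products with the constant
; 7.0; the ALL checks expect SLP to form a single <4 x float> load, a
; multiply by a splat of 7.0, and a fast @llvm.vector.reduce.fadd call,
; with only the accumulation into the loop-carried sum left scalar.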
; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum *= B[0]*A[i*4 ] +
;            B[1]*A[i*4+1] +
;            B[2]*A[i*4+2] +
;            B[3]*A[i*4+3];
;   }
;   return sum;
; }

define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
; ALL-LABEL: @mul_red(
; ALL-NEXT:  entry:
; ALL-NEXT:    [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ALL-NEXT:    br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; ALL:       for.body.lr.ph:
; ALL-NEXT:    [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
; ALL-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; ALL-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; ALL-NEXT:    br label [[FOR_BODY:%.*]]
; ALL:       for.body:
; ALL-NEXT:    [[I_040:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[SUM_039:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[MUL21:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_040]], 2
; ALL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; ALL-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; ALL-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; ALL-NEXT:    [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]]
; ALL-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; ALL-NEXT:    [[MUL21]] = fmul float [[SUM_039]], [[TMP6]]
; ALL-NEXT:    [[INC]] = add nsw i64 [[I_040]], 1
; ALL-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; ALL-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; ALL:       for.cond.for.end_crit_edge:
; ALL-NEXT:    [[PHITMP:%.*]] = fptosi float [[MUL21]] to i32
; ALL-NEXT:    br label [[FOR_END]]
; ALL:       for.end:
; ALL-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; ALL-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp38 = icmp sgt i32 %n, 0
  br i1 %cmp38, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx15, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
  %mul = shl nsw i64 %i.040, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %5 = load float, float* %arrayidx2, align 4
  %mul3 = fmul float %0, %5
  %add35 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add35
  %6 = load float, float* %arrayidx6, align 4
  %mul7 = fmul float %1, %6
  %add8 = fadd fast float %mul3, %mul7
  %add1136 = or i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1136
  %7 = load float, float* %arrayidx12, align 4
  %mul13 = fmul float %2, %7
  %add14 = fadd fast float %add8, %mul13
  %add1737 = or i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1737
  %8 = load float, float* %arrayidx18, align 4
  %mul19 = fmul float %3, %8
  %add20 = fadd fast float %add14, %mul19
  %mul21 = fmul float %sum.039, %add20
  %inc = add nsw i64 %i.040, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %mul21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}
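; @mul_red has the same shape, but the loop-invariant B[0..3] operand is
; expected to be hoisted as one <4 x float> load in the preheader; the
; reduced value feeds an fmul, so the recurrence itself stays scalar.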
; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*6 ] +
;            B[1]*A[i*6+1] +
;            B[2]*A[i*6+2] +
;            B[3]*A[i*6+3] +
;            B[4]*A[i*6+4] +
;            B[5]*A[i*6+5] +
;            B[6]*A[i*6+6] +
;            B[7]*A[i*6+7] +
;            B[8]*A[i*6+8];
;   }
;   return sum;
; }

define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
; ALL-LABEL: @long_red(
; ALL-NEXT:  entry:
; ALL-NEXT:    [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ALL-NEXT:    br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; ALL:       for.body.lr.ph:
; ALL-NEXT:    [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <8 x float>*
; ALL-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; ALL-NEXT:    [[ARRAYIDX45:%.*]] = getelementptr inbounds float, float* [[B]], i64 8
; ALL-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX45]], align 4
; ALL-NEXT:    [[TMP3:%.*]] = sext i32 [[N]] to i64
; ALL-NEXT:    br label [[FOR_BODY:%.*]]
; ALL:       for.body:
; ALL-NEXT:    [[I_083:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[SUM_082:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD51:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[MUL:%.*]] = mul nsw i64 [[I_083]], 6
; ALL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; ALL-NEXT:    [[TMP4:%.*]] = bitcast float* [[ARRAYIDX2]] to <8 x float>*
; ALL-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
; ALL-NEXT:    [[TMP6:%.*]] = fmul fast <8 x float> [[TMP1]], [[TMP5]]
; ALL-NEXT:    [[ADD47:%.*]] = add nsw i64 [[MUL]], 8
; ALL-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]]
; ALL-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4
; ALL-NEXT:    [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]]
; ALL-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP6]])
; ALL-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[MUL49]]
; ALL-NEXT:    [[ADD51]] = fadd fast float [[SUM_082]], [[OP_RDX]]
; ALL-NEXT:    [[INC]] = add nsw i64 [[I_083]], 1
; ALL-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP3]]
; ALL-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; ALL:       for.cond.for.end_crit_edge:
; ALL-NEXT:    [[PHITMP:%.*]] = fptosi float [[ADD51]] to i32
; ALL-NEXT:    br label [[FOR_END]]
; ALL:       for.end:
; ALL-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; ALL-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp81 = icmp sgt i32 %n, 0
  br i1 %cmp81, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx15, align 4
  %arrayidx21 = getelementptr inbounds float, float* %B, i64 4
  %4 = load float, float* %arrayidx21, align 4
  %arrayidx27 = getelementptr inbounds float, float* %B, i64 5
  %5 = load float, float* %arrayidx27, align 4
  %arrayidx33 = getelementptr inbounds float, float* %B, i64 6
  %6 = load float, float* %arrayidx33, align 4
  %arrayidx39 = getelementptr inbounds float, float* %B, i64 7
  %7 = load float, float* %arrayidx39, align 4
  %arrayidx45 = getelementptr inbounds float, float* %B, i64 8
  %8 = load float, float* %arrayidx45, align 4
  %9 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
  %mul = mul nsw i64 %i.083, 6
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %10 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %10
  %add80 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add80
  %11 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %1, %11
  %add8 = fadd fast float %mul3, %mul7
  %add11 = add nsw i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add11
  %12 = load float, float* %arrayidx12, align 4
  %mul13 = fmul fast float %2, %12
  %add14 = fadd fast float %add8, %mul13
  %add17 = add nsw i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add17
  %13 = load float, float* %arrayidx18, align 4
  %mul19 = fmul fast float %3, %13
  %add20 = fadd fast float %add14, %mul19
  %add23 = add nsw i64 %mul, 4
  %arrayidx24 = getelementptr inbounds float, float* %A, i64 %add23
  %14 = load float, float* %arrayidx24, align 4
  %mul25 = fmul fast float %4, %14
  %add26 = fadd fast float %add20, %mul25
  %add29 = add nsw i64 %mul, 5
  %arrayidx30 = getelementptr inbounds float, float* %A, i64 %add29
  %15 = load float, float* %arrayidx30, align 4
  %mul31 = fmul fast float %5, %15
  %add32 = fadd fast float %add26, %mul31
  %add35 = add nsw i64 %mul, 6
  %arrayidx36 = getelementptr inbounds float, float* %A, i64 %add35
  %16 = load float, float* %arrayidx36, align 4
  %mul37 = fmul fast float %6, %16
  %add38 = fadd fast float %add32, %mul37
  %add41 = add nsw i64 %mul, 7
  %arrayidx42 = getelementptr inbounds float, float* %A, i64 %add41
  %17 = load float, float* %arrayidx42, align 4
  %mul43 = fmul fast float %7, %17
  %add44 = fadd fast float %add38, %mul43
  %add47 = add nsw i64 %mul, 8
  %arrayidx48 = getelementptr inbounds float, float* %A, i64 %add47
  %18 = load float, float* %arrayidx48, align 4
  %mul49 = fmul fast float %8, %18
  %add50 = fadd fast float %add44, %mul49
  %add51 = fadd fast float %sum.082, %add50
  %inc = add nsw i64 %i.083, 1
  %exitcond = icmp eq i64 %inc, %9
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add51 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}
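; @long_red sums nine products, which is not a power of two: the checks
; expect an <8 x float> vector body plus one scalar tail product
; ([[MUL49]]), folded in after the vector reduction via [[OP_RDX]].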
; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*4 ];
;     sum += B[1]*A[i*4+1];
;     sum += B[2]*A[i*4+2];
;     sum += B[3]*A[i*4+3];
;   }
;   return sum;
; }

define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
; ALL-LABEL: @chain_red(
; ALL-NEXT:  entry:
; ALL-NEXT:    [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ALL-NEXT:    br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; ALL:       for.body.lr.ph:
; ALL-NEXT:    [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
; ALL-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; ALL-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; ALL-NEXT:    br label [[FOR_BODY:%.*]]
; ALL:       for.body:
; ALL-NEXT:    [[I_043:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[SUM_042:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_043]], 2
; ALL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; ALL-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; ALL-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; ALL-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]]
; ALL-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; ALL-NEXT:    [[OP_RDX]] = fadd fast float [[TMP6]], [[SUM_042]]
; ALL-NEXT:    [[INC]] = add nsw i64 [[I_043]], 1
; ALL-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; ALL-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; ALL:       for.cond.for.end_crit_edge:
; ALL-NEXT:    [[PHITMP:%.*]] = fptosi float [[OP_RDX]] to i32
; ALL-NEXT:    br label [[FOR_END]]
; ALL:       for.end:
; ALL-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; ALL-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp41 = icmp sgt i32 %n, 0
  br i1 %cmp41, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx10 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx10, align 4
  %arrayidx16 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx16, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
  %mul = shl nsw i64 %i.043, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %5 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %5
  %add = fadd fast float %sum.042, %mul3
  %add638 = or i64 %mul, 1
  %arrayidx7 = getelementptr inbounds float, float* %A, i64 %add638
  %6 = load float, float* %arrayidx7, align 4
  %mul8 = fmul fast float %1, %6
  %add9 = fadd fast float %add, %mul8
  %add1239 = or i64 %mul, 2
  %arrayidx13 = getelementptr inbounds float, float* %A, i64 %add1239
  %7 = load float, float* %arrayidx13, align 4
  %mul14 = fmul fast float %2, %7
  %add15 = fadd fast float %add9, %mul14
  %add1840 = or i64 %mul, 3
  %arrayidx19 = getelementptr inbounds float, float* %A, i64 %add1840
  %8 = load float, float* %arrayidx19, align 4
  %mul20 = fmul fast float %3, %8
  %add21 = fadd fast float %add15, %mul20
  %inc = add nsw i64 %i.043, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}
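; In @chain_red the running sum takes part in every add of the chain; the
; checks expect the whole chain, phi included, to collapse into one vector
; reduction followed by a single fadd with [[SUM_042]].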
; void foo(const float *arg_A, unsigned arg_B, float *array) {
;   for (uint32_t i = 0; i < 6; ++i) {
;     const float *ptr = arg_A + i;
;     float w0 = array[i * 4 + 0];
;     float w1 = array[i * 4 + 1];
;     float w2 = array[i * 4 + 2];
;     float w3 = array[i * 4 + 3];
;
;     for (unsigned j = 0; j < arg_B; ++j) {
;       const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
;       const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
;       const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
;       const float x4 = x3 + (-4.0f * w2) + w3;
;       w1 = w0;
;       w0 = x1;
;       w3 = w2;
;       w2 = x3;
;     }
;
;     array[i * 4 + 0] = w0;
;     array[i * 4 + 1] = w1;
;     array[i * 4 + 2] = w2;
;     array[i * 4 + 3] = w3;
;   }
; }

define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture %array) {
; ALL-LABEL: @foo(
; ALL-NEXT:  entry:
; ALL-NEXT:    [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0
; ALL-NEXT:    br label [[FOR_BODY:%.*]]
; ALL:       for.cond.cleanup:
; ALL-NEXT:    ret void
; ALL:       for.body:
; ALL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
; ALL-NEXT:    [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
; ALL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
; ALL-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
; ALL-NEXT:    [[TMP2:%.*]] = or i64 [[TMP0]], 1
; ALL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
; ALL-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
; ALL-NEXT:    [[TMP4:%.*]] = or i64 [[TMP0]], 2
; ALL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
; ALL-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
; ALL-NEXT:    [[TMP6:%.*]] = or i64 [[TMP0]], 3
; ALL-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
; ALL-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
; ALL-NEXT:    br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
; ALL:       for.body16.lr.ph:
; ALL-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
; ALL-NEXT:    [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
; ALL-NEXT:    br label [[FOR_BODY16:%.*]]
; ALL:       for.cond.cleanup15:
; ALL-NEXT:    [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
; ALL-NEXT:    store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
; ALL-NEXT:    store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
; ALL-NEXT:    store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
; ALL-NEXT:    store float [[W3_0_LCSSA]], float* [[ARRAYIDX12]], align 4
; ALL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; ALL-NEXT:    [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
; ALL-NEXT:    br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; ALL:       for.body16:
; ALL-NEXT:    [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
; ALL-NEXT:    [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
; ALL-NEXT:    [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
; ALL-NEXT:    [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
; ALL-NEXT:    [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
; ALL-NEXT:    [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
; ALL-NEXT:    [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
; ALL-NEXT:    [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
; ALL-NEXT:    [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
; ALL-NEXT:    [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
; ALL-NEXT:    [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
; ALL-NEXT:    [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
; ALL-NEXT:    [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
; ALL-NEXT:    [[INC]] = add nuw i32 [[J_098]], 1
; ALL-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
; ALL-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
;
entry:
  %cmp1495 = icmp eq i32 %arg_B, 0
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.cond.cleanup15
  ret void

for.body:                                         ; preds = %for.cond.cleanup15, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
  %0 = shl i64 %indvars.iv, 2
  %arrayidx = getelementptr inbounds float, float* %array, i64 %0
  %1 = load float, float* %arrayidx, align 4
  %2 = or i64 %0, 1
  %arrayidx4 = getelementptr inbounds float, float* %array, i64 %2
  %3 = load float, float* %arrayidx4, align 4
  %4 = or i64 %0, 2
  %arrayidx8 = getelementptr inbounds float, float* %array, i64 %4
  %5 = load float, float* %arrayidx8, align 4
  %6 = or i64 %0, 3
  %arrayidx12 = getelementptr inbounds float, float* %array, i64 %6
  %7 = load float, float* %arrayidx12, align 4
  br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph

for.body16.lr.ph:                                 ; preds = %for.body
  %add.ptr = getelementptr inbounds float, float* %arg_A, i64 %indvars.iv
  %8 = load float, float* %add.ptr, align 4
  br label %for.body16

for.cond.cleanup15:                               ; preds = %for.body16, %for.body
  %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
  %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
  %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
  %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
  store float %w0.0.lcssa, float* %arrayidx, align 4
  store float %w1.0.lcssa, float* %arrayidx4, align 4
  store float %w2.0.lcssa, float* %arrayidx8, align 4
  store float %w3.0.lcssa, float* %arrayidx12, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond109 = icmp eq i64 %indvars.iv.next, 6
  br i1 %exitcond109, label %for.cond.cleanup, label %for.body

for.body16:                                       ; preds = %for.body16, %for.body16.lr.ph
  %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
  %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
  %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
  %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
  %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
  %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
  %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
  %sub92 = fadd fast float %mul17, %mul18.neg
  %sub19 = fadd fast float %sub92, %8
  %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
  %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
  %mul23 = fmul fast float %w1.099, 0x4002666660000000
  %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
  %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
  %add2293 = fadd fast float %mul27.neg, %mul25
  %add24 = fadd fast float %add2293, %mul23
  %sub2694 = fadd fast float %add24, %mul21.neg
  %sub28 = fadd fast float %sub2694, %mul20
  %inc = add nuw i32 %j.098, 1
  %exitcond = icmp eq i32 %inc, %arg_B
  br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
}
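; @foo carries a cross-iteration recurrence (w0..w3 rotate through the
; inner loop), so there is no profitable horizontal reduction here; both
; run lines are expected to keep the scalar form unchanged.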
; void foo(double * restrict A, double * restrict B, double * restrict C,
;          int n) {
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4 ] + B[1] *A[i*4+1];
;   }
; }

define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
; CHECK-LABEL: @store_red_double(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[B:%.*]], align 8
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[ARRAYIDX4]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP3:%.*]] = load double, double* [[ARRAYIDX2]], align 8
; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast double [[TMP0]], [[TMP3]]
; CHECK-NEXT:    [[ADD16:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[ADD16]]
; CHECK-NEXT:    [[TMP4:%.*]] = load double, double* [[ARRAYIDX6]], align 8
; CHECK-NEXT:    [[MUL7:%.*]] = fmul fast double [[TMP1]], [[TMP4]]
; CHECK-NEXT:    [[ADD8:%.*]] = fadd fast double [[MUL3]], [[MUL7]]
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]]
; CHECK-NEXT:    store double [[ADD8]], double* [[ARRAYIDX9]], align 8
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_018]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @store_red_double(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT:    br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE:       for.body.lr.ph:
; STORE-NEXT:    [[TMP0:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
; STORE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; STORE-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT:    br label [[FOR_BODY:%.*]]
; STORE:       for.body:
; STORE-NEXT:    [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
; STORE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT:    [[TMP3:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>*
; STORE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
; STORE-NEXT:    [[TMP5:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP4]]
; STORE-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
; STORE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
; STORE-NEXT:    [[ADD8:%.*]] = fadd fast double [[TMP6]], [[TMP7]]
; STORE-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]]
; STORE-NEXT:    store double [[ADD8]], double* [[ARRAYIDX9]], align 8
; STORE-NEXT:    [[INC]] = add nsw i64 [[I_018]], 1
; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; STORE-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; STORE:       for.end:
; STORE-NEXT:    ret void
;
entry:
  %cmp17 = icmp sgt i32 %n, 0
  br i1 %cmp17, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load double, double* %B, align 8
  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
  %1 = load double, double* %arrayidx4, align 8
  %2 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %mul = shl nsw i64 %i.018, 2
  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
  %3 = load double, double* %arrayidx2, align 8
  %mul3 = fmul fast double %0, %3
  %add16 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
  %4 = load double, double* %arrayidx6, align 8
  %mul7 = fmul fast double %1, %4
  %add8 = fadd fast double %mul3, %mul7
  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
  store double %add8, double* %arrayidx9, align 8
  %inc = add nsw i64 %i.018, 1
  %exitcond = icmp eq i64 %inc, %2
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}
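; @store_red_double reduces only two products and the result is stored
; rather than returned: the plain run (CHECK) stays scalar, while the run
; with -slp-vectorize-hor-store (STORE) builds a <2 x double> product and
; sums its two lanes with extractelement/fadd.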
; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4 ] +
;            B[1] *A[i*4+1] +
;            B[2] *A[i*4+2] +
;            B[3] *A[i*4+3];
;   }
;   return sum;
; }

define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
; CHECK-LABEL: @store_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[C_ADDR_038:%.*]] = phi float* [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[B]], align 4
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast float [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
; CHECK-NEXT:    [[ADD34:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD34]]
; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[MUL7:%.*]] = fmul fast float [[TMP3]], [[TMP4]]
; CHECK-NEXT:    [[ADD8:%.*]] = fadd fast float [[MUL3]], [[MUL7]]
; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[ADD1135:%.*]] = or i64 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1135]]
; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX12]], align 4
; CHECK-NEXT:    [[MUL13:%.*]] = fmul fast float [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[ADD14:%.*]] = fadd fast float [[ADD8]], [[MUL13]]
; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX15]], align 4
; CHECK-NEXT:    [[ADD1736:%.*]] = or i64 [[MUL]], 3
; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1736]]
; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX18]], align 4
; CHECK-NEXT:    [[MUL19:%.*]] = fmul fast float [[TMP7]], [[TMP8]]
; CHECK-NEXT:    [[ADD20:%.*]] = fadd fast float [[ADD14]], [[MUL19]]
; CHECK-NEXT:    store float [[ADD20]], float* [[C_ADDR_038]], align 4
; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_039]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK:       for.end:
; CHECK-NEXT:    ret i32 0
;
; STORE-LABEL: @store_red(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT:    br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE:       for.body.lr.ph:
; STORE-NEXT:    [[TMP0:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT:    br label [[FOR_BODY:%.*]]
; STORE:       for.body:
; STORE-NEXT:    [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[C_ADDR_038:%.*]] = phi float* [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
; STORE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT:    [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
; STORE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; STORE-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; STORE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; STORE-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP4]]
; STORE-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; STORE-NEXT:    store float [[TMP6]], float* [[C_ADDR_038]], align 4
; STORE-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1
; STORE-NEXT:    [[INC]] = add nsw i64 [[I_039]], 1
; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; STORE-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; STORE:       for.end:
; STORE-NEXT:    ret i32 0
;
entry:
  %cmp37 = icmp sgt i32 %n, 0
  br i1 %cmp37, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
  %1 = load float, float* %B, align 4
  %mul = shl nsw i64 %i.039, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %2 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %1, %2
  %3 = load float, float* %arrayidx4, align 4
  %add34 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add34
  %4 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %3, %4
  %add8 = fadd fast float %mul3, %mul7
  %5 = load float, float* %arrayidx9, align 4
  %add1135 = or i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1135
  %6 = load float, float* %arrayidx12, align 4
  %mul13 = fmul fast float %5, %6
  %add14 = fadd fast float %add8, %mul13
  %7 = load float, float* %arrayidx15, align 4
  %add1736 = or i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1736
  %8 = load float, float* %arrayidx18, align 4
  %mul19 = fmul fast float %7, %8
  %add20 = fadd fast float %add14, %mul19
  store float %add20, float* %C.addr.038, align 4
  %incdec.ptr = getelementptr inbounds float, float* %C.addr.038, i64 1
  %inc = add nsw i64 %i.039, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret i32 0
}
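; @store_red is the four-wide variant: only the STORE run, which enables
; -slp-vectorize-hor-store, turns the chain rooted at the store into a
; <4 x float> multiply feeding @llvm.vector.reduce.fadd.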
@arr_i32 = global [32 x i32] zeroinitializer, align 16
@arr_float = global [32 x float] zeroinitializer, align 16

define void @float_red_example4(float* %res) {
; CHECK-LABEL: @float_red_example4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
; CHECK-NEXT:    store float [[ADD_2]], float* [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @float_red_example4(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16
; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP0]])
; STORE-NEXT:    store float [[TMP1]], float* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
entry:
  %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
  %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
  %add = fadd fast float %1, %0
  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
  %add.1 = fadd fast float %2, %add
  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
  %add.2 = fadd fast float %3, %add.1
  store float %add.2, float* %res, align 16
  ret void
}
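; The *_red_example* functions reduce the leading elements of a global
; array straight into a store, so only the STORE run vectorizes them; the
; expected reduction width follows the function name (v4f32 above, v8f32
; and v16f32 below).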
define void @float_red_example8(float* %res) {
; CHECK-LABEL: @float_red_example8(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP4]], [[ADD_2]]
; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
; CHECK-NEXT:    [[ADD_4:%.*]] = fadd fast float [[TMP5]], [[ADD_3]]
; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
; CHECK-NEXT:    [[ADD_5:%.*]] = fadd fast float [[TMP6]], [[ADD_4]]
; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
; CHECK-NEXT:    [[ADD_6:%.*]] = fadd fast float [[TMP7]], [[ADD_5]]
; CHECK-NEXT:    store float [[ADD_6]], float* [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @float_red_example8(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 16
; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]])
; STORE-NEXT:    store float [[TMP1]], float* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
entry:
  %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
  %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
  %add = fadd fast float %1, %0
  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
  %add.1 = fadd fast float %2, %add
  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
  %add.2 = fadd fast float %3, %add.1
  %4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
  %add.3 = fadd fast float %4, %add.2
  %5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
  %add.4 = fadd fast float %5, %add.3
  %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
  %add.5 = fadd fast float %6, %add.4
  %7 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
  %add.6 = fadd fast float %7, %add.5
  store float %add.6, float* %res, align 16
  ret void
}

define void @float_red_example16(float* %res) {
; CHECK-LABEL: @float_red_example16(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP4]], [[ADD_2]]
; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
; CHECK-NEXT:    [[ADD_4:%.*]] = fadd fast float [[TMP5]], [[ADD_3]]
; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
; CHECK-NEXT:    [[ADD_5:%.*]] = fadd fast float [[TMP6]], [[ADD_4]]
; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
; CHECK-NEXT:    [[ADD_6:%.*]] = fadd fast float [[TMP7]], [[ADD_5]]
; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 8), align 16
; CHECK-NEXT:    [[ADD_7:%.*]] = fadd fast float [[TMP8]], [[ADD_6]]
; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 9), align 4
; CHECK-NEXT:    [[ADD_8:%.*]] = fadd fast float [[TMP9]], [[ADD_7]]
; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 10), align 8
; CHECK-NEXT:    [[ADD_9:%.*]] = fadd fast float [[TMP10]], [[ADD_8]]
; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 11), align 4
; CHECK-NEXT:    [[ADD_10:%.*]] = fadd fast float [[TMP11]], [[ADD_9]]
; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 12), align 16
; CHECK-NEXT:    [[ADD_11:%.*]] = fadd fast float [[TMP12]], [[ADD_10]]
; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 13), align 4
; CHECK-NEXT:    [[ADD_12:%.*]] = fadd fast float [[TMP13]], [[ADD_11]]
; CHECK-NEXT:    [[TMP14:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 14), align 8
; CHECK-NEXT:    [[ADD_13:%.*]] = fadd fast float [[TMP14]], [[ADD_12]]
; CHECK-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 15), align 4
; CHECK-NEXT:    [[ADD_14:%.*]] = fadd fast float [[TMP15]], [[ADD_13]]
; CHECK-NEXT:    store float [[ADD_14]], float* [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @float_red_example16(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 16
; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP0]])
; STORE-NEXT:    store float [[TMP1]], float* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
entry:
  %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
  %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
  %add = fadd fast float %1, %0
  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
  %add.1 = fadd fast float %2, %add
  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
  %add.2 = fadd fast float %3, %add.1
  %4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
  %add.3 = fadd fast float %4, %add.2
  %5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
  %add.4 = fadd fast float %5, %add.3
  %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
  %add.5 = fadd fast float %6, %add.4
  %7 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
  %add.6 = fadd fast float %7, %add.5
  %8 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 8), align 16
  %add.7 = fadd fast float %8, %add.6
  %9 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 9), align 4
  %add.8 = fadd fast float %9, %add.7
  %10 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 10), align 8
  %add.9 = fadd fast float %10, %add.8
  %11 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 11), align 4
  %add.10 = fadd fast float %11, %add.9
  %12 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 12), align 16
  %add.11 = fadd fast float %12, %add.10
  %13 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 13), align 4
  %add.12 = fadd fast float %13, %add.11
  %14 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 14), align 8
  %add.13 = fadd fast float %14, %add.12
  %15 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 15), align 4
  %add.14 = fadd fast float %15, %add.13
  store float %add.14, float* %res, align 16
  ret void
}

define void @i32_red_example4(i32* %res) {
; CHECK-LABEL: @i32_red_example4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
; CHECK-NEXT:    store i32 [[ADD_2]], i32* [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @i32_red_example4(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  store i32 %add.2, i32* %res, align 16
  ret void
}
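; The i32 variants reduce through @llvm.vector.reduce.add; note that the
; nsw flag on the scalar adds is not carried over to the vectorized form.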
define void @i32_red_example8(i32* %res) {
; CHECK-LABEL: @i32_red_example8(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
; CHECK-NEXT:    [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
; CHECK-NEXT:    [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
; CHECK-NEXT:    [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
; CHECK-NEXT:    store i32 [[ADD_6]], i32* [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @i32_red_example8(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  store i32 %add.6, i32* %res, align 16
  ret void
}

define void @i32_red_example16(i32* %res) {
; CHECK-LABEL: @i32_red_example16(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
; CHECK-NEXT:    [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
; CHECK-NEXT:    [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
; CHECK-NEXT:    [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
; CHECK-NEXT:    [[ADD_7:%.*]] = add nsw i32 [[TMP8]], [[ADD_6]]
; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
; CHECK-NEXT:    [[ADD_8:%.*]] = add nsw i32 [[TMP9]], [[ADD_7]]
; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
; CHECK-NEXT:    [[ADD_9:%.*]] = add nsw i32 [[TMP10]], [[ADD_8]]
; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
; CHECK-NEXT:    [[ADD_10:%.*]] = add nsw i32 [[TMP11]], [[ADD_9]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
; CHECK-NEXT:    [[ADD_11:%.*]] = add nsw i32 [[TMP12]], [[ADD_10]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
; CHECK-NEXT:    [[ADD_12:%.*]] = add nsw i32 [[TMP13]], [[ADD_11]]
; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
; CHECK-NEXT:    [[ADD_13:%.*]] = add nsw i32 [[TMP14]], [[ADD_12]]
; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
; CHECK-NEXT:    [[ADD_14:%.*]] = add nsw i32 [[TMP15]], [[ADD_13]]
; CHECK-NEXT:    store i32 [[ADD_14]], i32* [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @i32_red_example16(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16
; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP0]])
; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %8 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
  %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
  %add.8 = add nsw i32 %9, %add.7
  %10 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
  %add.9 = add nsw i32 %10, %add.8
  %11 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
  %add.10 = add nsw i32 %11, %add.9
  %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
  %add.11 = add nsw i32 %12, %add.10
  %13 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
  %add.12 = add nsw i32 %13, %add.11
  %14 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
  %add.13 = add nsw i32 %14, %add.12
  %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
  %add.14 = add nsw i32 %15, %add.13
  store i32 %add.14, i32* %res, align 16
  ret void
}

define void @i32_red_example32(i32* %res) {
; CHECK-LABEL: @i32_red_example32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
; CHECK-NEXT:    [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
; CHECK-NEXT:    [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
; CHECK-NEXT:    [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
; CHECK-NEXT:    [[ADD_7:%.*]] = add nsw i32 [[TMP8]], [[ADD_6]]
; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
; CHECK-NEXT:    [[ADD_8:%.*]] = add nsw i32 [[TMP9]], [[ADD_7]]
; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
; CHECK-NEXT:    [[ADD_9:%.*]] = add nsw i32 [[TMP10]], [[ADD_8]]
; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
; CHECK-NEXT:    [[ADD_10:%.*]] = add nsw i32 [[TMP11]], [[ADD_9]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
; CHECK-NEXT:    [[ADD_11:%.*]] = add nsw i32 [[TMP12]], [[ADD_10]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
; CHECK-NEXT:    [[ADD_12:%.*]] = add nsw i32 [[TMP13]], [[ADD_11]]
; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
; CHECK-NEXT:    [[ADD_13:%.*]] = add nsw i32 [[TMP14]], [[ADD_12]]
; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
; CHECK-NEXT:    [[ADD_14:%.*]] = add nsw i32 [[TMP15]], [[ADD_13]]
; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 16), align 16
; CHECK-NEXT:    [[ADD_15:%.*]] = add nsw i32 [[TMP16]], [[ADD_14]]
; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 17), align 4
; CHECK-NEXT:    [[ADD_16:%.*]] = add nsw i32 [[TMP17]], [[ADD_15]]
; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 18), align 8
; CHECK-NEXT:    [[ADD_17:%.*]] = add nsw i32 [[TMP18]], [[ADD_16]]
; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 19), align 4
; CHECK-NEXT:    [[ADD_18:%.*]] = add nsw i32 [[TMP19]], [[ADD_17]]
; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 20), align 16
; CHECK-NEXT:    [[ADD_19:%.*]] = add nsw i32 [[TMP20]], [[ADD_18]]
; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 21), align 4
; CHECK-NEXT:    [[ADD_20:%.*]] = add nsw i32 [[TMP21]], [[ADD_19]]
; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 22), align 8
; CHECK-NEXT:    [[ADD_21:%.*]] = add nsw i32 [[TMP22]], [[ADD_20]]
; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 23), align 4
; CHECK-NEXT:    [[ADD_22:%.*]] = add nsw i32 [[TMP23]], [[ADD_21]]
; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 24), align 16
; CHECK-NEXT:    [[ADD_23:%.*]] = add nsw i32 [[TMP24]], [[ADD_22]]
; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 25), align 4
; CHECK-NEXT:    [[ADD_24:%.*]] = add nsw i32 [[TMP25]], [[ADD_23]]
; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 26), align 8
; CHECK-NEXT:    [[ADD_25:%.*]] = add nsw i32 [[TMP26]], [[ADD_24]]
; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 27), align 4
; CHECK-NEXT:    [[ADD_26:%.*]] = add nsw i32 [[TMP27]], [[ADD_25]]
; CHECK-NEXT:    [[TMP28:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 28), align 16
; CHECK-NEXT:    [[ADD_27:%.*]] = add nsw i32 [[TMP28]], [[ADD_26]]
; CHECK-NEXT:    [[TMP29:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 29), align 4
; CHECK-NEXT:    [[ADD_28:%.*]] = add nsw i32 [[TMP29]], [[ADD_27]]
; CHECK-NEXT:    [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 30), align 8
; CHECK-NEXT:    [[ADD_29:%.*]] = add nsw i32 [[TMP30]], [[ADD_28]]
; CHECK-NEXT:    [[TMP31:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 31), align 4
; CHECK-NEXT:    [[ADD_30:%.*]] = add nsw i32 [[TMP31]], [[ADD_29]]
; CHECK-NEXT:    store i32 [[ADD_30]], i32* [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @i32_red_example32(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16
; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP0]])
; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %8 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
  %add.7 = add nsw i32 %8, %add.6
  %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
  %add.8 = add nsw i32 %9, %add.7
  %10 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
  %add.9 = add nsw i32 %10, %add.8
  %11 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
  %add.10 = add nsw i32 %11, %add.9
  %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
  %add.11 = add nsw i32 %12, %add.10
  %13 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
  %add.12 = add nsw i32 %13, %add.11
  %14 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
  %add.13 = add nsw i32 %14, %add.12
  %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
  %add.14 = add nsw i32 %15, %add.13
  %16 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 16), align 16
  %add.15 = add nsw i32 %16, %add.14
  %17 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 17), align 4
  %add.16 = add nsw i32 %17, %add.15
  %18 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 18), align 8
  %add.17 = add nsw i32 %18, %add.16
  %19 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 19), align 4
  %add.18 = add nsw i32 %19, %add.17
  %20 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 20), align 16
  %add.19 = add nsw i32 %20, %add.18
  %21 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 21), align 4
  %add.20 = add nsw i32 %21, %add.19
  %22 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 22), align 8
  %add.21 = add nsw i32 %22, %add.20
  %23 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 23), align 4
  %add.22 = add nsw i32 %23, %add.21
  %24 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 24), align 16
  %add.23 = add nsw i32 %24, %add.22
  %25 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 25), align 4
  %add.24 = add nsw i32 %25, %add.23
  %26 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 26), align 8
  %add.25 = add nsw i32 %26, %add.24
  %27 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 27), align 4
  %add.26 = add nsw i32 %27, %add.25
  %28 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 28), align 16
  %add.27 = add nsw i32 %28, %add.26
  %29 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 29), align 4
  %add.28 = add nsw i32 %29, %add.27
  %30 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 30), align 8
  %add.29 = add nsw i32 %30, %add.28
  %31 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 31), align 4
  %add.30 = add nsw i32 %31, %add.29
  store i32 %add.30, i32* %res, align 16
  ret void
}

declare i32 @foobar(i32)

define void @i32_red_call(i32 %val) {
; ALL-LABEL: @i32_red_call(
; ALL-NEXT:  entry:
; ALL-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
; ALL-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; ALL-NEXT:    [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
; ALL-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %res = call i32 @foobar(i32 %add.6)
  ret void
}

define void @i32_red_invoke(i32 %val) personality i32 (...)* @__gxx_personality_v0 {
; ALL-LABEL: @i32_red_invoke(
; ALL-NEXT:  entry:
; ALL-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
; ALL-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; ALL-NEXT:    [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
; ALL-NEXT:    to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
; ALL:       exception:
; ALL-NEXT:    [[CLEANUP:%.*]] = landingpad i8
; ALL-NEXT:    cleanup
; ALL-NEXT:    br label [[NORMAL]]
; ALL:       normal:
; ALL-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %res = invoke i32 @foobar(i32 %add.6) to label %normal unwind label %exception
exception:
  %cleanup = landingpad i8 cleanup
  br label %normal
normal:
  ret void
}

; Test case from PR47670. Reduction result is used as incoming value in phi.
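; A rough C equivalent of the test below (an illustrative sketch only; the
; actual PR47670 reproducer may differ):
;
; int reduction_result_used_in_phi(const int *data, bool b) {
;   int sum = 0;
;   if (b)
;     sum = data[0] + data[1] + data[2] + data[3];
;   return sum;
; }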
define i32 @reduction_result_used_in_phi(i32* nocapture readonly %data, i1 zeroext %b) {
; ALL-LABEL: @reduction_result_used_in_phi(
; ALL-NEXT:  entry:
; ALL-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
; ALL:       bb:
; ALL-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA:%.*]] to <4 x i32>*
; ALL-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; ALL-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; ALL-NEXT:    br label [[EXIT]]
; ALL:       exit:
; ALL-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
; ALL-NEXT:    ret i32 [[SUM_1]]
;
entry:
  br i1 %b, label %bb, label %exit

bb:
  %l.0 = load i32, i32* %data, align 4
  %idx.1 = getelementptr inbounds i32, i32* %data, i64 1
  %l.1 = load i32, i32* %idx.1, align 4
  %add.1 = add i32 %l.1, %l.0
  %idx.2 = getelementptr inbounds i32, i32* %data, i64 2
  %l.2 = load i32, i32* %idx.2, align 4
  %add.2 = add i32 %l.2, %add.1
  %idx.3 = getelementptr inbounds i32, i32* %data, i64 3
  %l.3 = load i32, i32* %idx.3, align 4
  %add.3 = add i32 %l.3, %add.2
  br label %exit

exit:
  %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb ]
  ret i32 %sum.1
}

define i32 @reduction_result_used_in_phi_loop(i32* nocapture readonly %data, i1 zeroext %b) {
; ALL-LABEL: @reduction_result_used_in_phi_loop(
; ALL-NEXT:  entry:
; ALL-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
; ALL:       bb:
; ALL-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA:%.*]] to <4 x i32>*
; ALL-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; ALL-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; ALL-NEXT:    br label [[EXIT]]
; ALL:       exit:
; ALL-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
; ALL-NEXT:    ret i32 [[SUM_1]]
;
entry:
  br i1 %b, label %bb, label %exit

bb:
  %l.0 = load i32, i32* %data, align 4
  %idx.1 = getelementptr inbounds i32, i32* %data, i64 1
  %l.1 = load i32, i32* %idx.1, align 4
  %add.1 = add i32 %l.1, %l.0
  %idx.2 = getelementptr inbounds i32, i32* %data, i64 2
  %l.2 = load i32, i32* %idx.2, align 4
  %add.2 = add i32 %l.2, %add.1
  %idx.3 = getelementptr inbounds i32, i32* %data, i64 3
  %l.3 = load i32, i32* %idx.3, align 4
  %add.3 = add i32 %l.3, %add.2
  br label %exit

exit:
  %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb ]
  ret i32 %sum.1
}

; Make sure we do not crash or infinite loop on ill-formed IR.

define void @unreachable_block() {
; ALL-LABEL: @unreachable_block(
; ALL-NEXT:  bb.0:
; ALL-NEXT:    br label [[BB_1:%.*]]
; ALL:       dead:
; ALL-NEXT:    [[T0:%.*]] = add i16 [[T0]], undef
; ALL-NEXT:    br label [[BB_1]]
; ALL:       bb.1:
; ALL-NEXT:    [[T1:%.*]] = phi i16 [ undef, [[BB_0:%.*]] ], [ [[T0]], [[DEAD:%.*]] ]
; ALL-NEXT:    ret void
;
bb.0:
  br label %bb.1

dead:
  %t0 = add i16 %t0, undef ; unreachable IR may depend on itself
  br label %bb.1

bb.1:
  %t1 = phi i16 [ undef, %bb.0 ], [ %t0, %dead ]
  ret void
}

; The FMF on the reduction should match the incoming insts.
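; In @fadd_v4f32_fmf below, all three scalar fadds carry exactly "reassoc nsz",
; so the vectorized llvm.vector.reduce.fadd call is expected to carry the same
; two flags. A rough C equivalent (an illustrative sketch; the flags would come
; from fast-math compile options, not from the C source itself):
;
; float fadd_v4f32_fmf(float *p) {
;   return p[0] + p[1] + p[2] + p[3];
; }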

define float @fadd_v4f32_fmf(float* %p) {
; ALL-LABEL: @fadd_v4f32_fmf(
; ALL-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; ALL-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; ALL-NEXT:    [[TMP3:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
; ALL-NEXT:    ret float [[TMP3]]
;
  %p1 = getelementptr inbounds float, float* %p, i64 1
  %p2 = getelementptr inbounds float, float* %p, i64 2
  %p3 = getelementptr inbounds float, float* %p, i64 3
  %t0 = load float, float* %p, align 4
  %t1 = load float, float* %p1, align 4
  %t2 = load float, float* %p2, align 4
  %t3 = load float, float* %p3, align 4
  %add1 = fadd reassoc nsz float %t1, %t0
  %add2 = fadd reassoc nsz float %t2, %add1
  %add3 = fadd reassoc nsz float %t3, %add2
  ret float %add3
}

; The minimal FMF for a fadd reduction are "reassoc nsz".
; Only the common FMF of all operations in the reduction propagate to the result.
; In this example, "contract nnan arcp" are dropped, but "ninf" transfers along with the required flags.

define float @fadd_v4f32_fmf_intersect(float* %p) {
; ALL-LABEL: @fadd_v4f32_fmf_intersect(
; ALL-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; ALL-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; ALL-NEXT:    [[TMP3:%.*]] = call reassoc ninf nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
; ALL-NEXT:    ret float [[TMP3]]
;
  %p1 = getelementptr inbounds float, float* %p, i64 1
  %p2 = getelementptr inbounds float, float* %p, i64 2
  %p3 = getelementptr inbounds float, float* %p, i64 3
  %t0 = load float, float* %p, align 4
  %t1 = load float, float* %p1, align 4
  %t2 = load float, float* %p2, align 4
  %t3 = load float, float* %p3, align 4
  %add1 = fadd ninf reassoc nsz nnan float %t1, %t0
  %add2 = fadd ninf reassoc nsz nnan arcp float %t2, %add1
  %add3 = fadd ninf reassoc nsz contract float %t3, %add2
  ret float %add3
}

; This must not propagate 'nsw' to a new add instruction.
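; The scalar chain in @nsw_propagation_v4i32 below uses 'add nsw' throughout,
; but reassociating it into a vector reduction can change where an intermediate
; sum overflows, so the vectorized STORE output is expected to use a plain 'add'
; (without nsw) for the final [[OP_RDX]] add. A rough C equivalent (an
; illustrative sketch):
;
; void nsw_propagation_v4i32(int *res, int start) {
;   *res = start + arr_i32[0] + arr_i32[1] + arr_i32[2] + arr_i32[3];
; }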

define void @nsw_propagation_v4i32(i32* %res, i32 %start) {
; CHECK-LABEL: @nsw_propagation_v4i32(
; CHECK-NEXT:    [[T0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
; CHECK-NEXT:    [[T1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
; CHECK-NEXT:    [[T2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
; CHECK-NEXT:    [[T3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
; CHECK-NEXT:    [[S:%.*]] = add nsw i32 [[START:%.*]], [[T0]]
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[T1]], [[S]]
; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[T2]], [[ADD]]
; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[T3]], [[ADD_1]]
; CHECK-NEXT:    store i32 [[ADD_2]], i32* [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @nsw_propagation_v4i32(
; STORE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
; STORE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; STORE-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP2]], [[START:%.*]]
; STORE-NEXT:    store i32 [[OP_RDX]], i32* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
  %t0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %t1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %t2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %t3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %s = add nsw i32 %start, %t0
  %add = add nsw i32 %t1, %s
  %add.1 = add nsw i32 %t2, %add
  %add.2 = add nsw i32 %t3, %add.1
  store i32 %add.2, i32* %res, align 16
  ret void
}

declare i32 @__gxx_personality_v0(...)