1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 2; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s 3 4; #include <stdint.h> 5; 6; int foo(float *A, int n) { 7; float sum = 0; 8; for (intptr_t i=0; i < n; ++i) { 9; sum += 7*A[i*4 ] + 10; 7*A[i*4+1] + 11; 7*A[i*4+2] + 12; 7*A[i*4+3]; 13; } 14; return sum; 15; } 16 17; CHECK-LABEL: add_red 18; CHECK: fmul <4 x float> 19; CHECK: shufflevector <4 x float> 20 21define i32 @add_red(float* %A, i32 %n) { 22entry: 23 %cmp31 = icmp sgt i32 %n, 0 24 br i1 %cmp31, label %for.body.lr.ph, label %for.end 25 26for.body.lr.ph: 27 %0 = sext i32 %n to i64 28 br label %for.body 29 30for.body: 31 %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] 32 %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ] 33 %mul = shl nsw i64 %i.033, 2 34 %arrayidx = getelementptr inbounds float, float* %A, i64 %mul 35 %1 = load float, float* %arrayidx, align 4 36 %mul2 = fmul float %1, 7.000000e+00 37 %add28 = or i64 %mul, 1 38 %arrayidx4 = getelementptr inbounds float, float* %A, i64 %add28 39 %2 = load float, float* %arrayidx4, align 4 40 %mul5 = fmul float %2, 7.000000e+00 41 %add6 = fadd fast float %mul2, %mul5 42 %add829 = or i64 %mul, 2 43 %arrayidx9 = getelementptr inbounds float, float* %A, i64 %add829 44 %3 = load float, float* %arrayidx9, align 4 45 %mul10 = fmul float %3, 7.000000e+00 46 %add11 = fadd fast float %add6, %mul10 47 %add1330 = or i64 %mul, 3 48 %arrayidx14 = getelementptr inbounds float, float* %A, i64 %add1330 49 %4 = load float, float* %arrayidx14, align 4 50 %mul15 = fmul float %4, 7.000000e+00 51 %add16 = fadd fast float %add11, %mul15 52 %add17 = fadd fast float %sum.032, %add16 53 %inc = add nsw i64 %i.033, 1 54 %exitcond = icmp eq i64 %inc, %0 55 br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body 56 57for.cond.for.end_crit_edge: 58 %phitmp = fptosi float %add17 to i32 59 br label %for.end 60 61for.end: 
62 %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] 63 ret i32 %sum.0.lcssa 64} 65 66; int foo(float * restrict A, float * restrict B, int n) { 67; float sum = 0; 68; for (intptr_t i=0; i < n; ++i) { 69; sum *= B[0]*A[i*4 ] + 70; B[1]*A[i*4+1] + 71; B[2]*A[i*4+2] + 72; B[3]*A[i*4+3]; 73; } 74; return sum; 75; } 76 77; CHECK-LABEL: mul_red 78; CHECK: fmul <4 x float> 79; CHECK: shufflevector <4 x float> 80 81define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) { 82entry: 83 %cmp38 = icmp sgt i32 %n, 0 84 br i1 %cmp38, label %for.body.lr.ph, label %for.end 85 86for.body.lr.ph: 87 %0 = load float, float* %B, align 4 88 %arrayidx4 = getelementptr inbounds float, float* %B, i64 1 89 %1 = load float, float* %arrayidx4, align 4 90 %arrayidx9 = getelementptr inbounds float, float* %B, i64 2 91 %2 = load float, float* %arrayidx9, align 4 92 %arrayidx15 = getelementptr inbounds float, float* %B, i64 3 93 %3 = load float, float* %arrayidx15, align 4 94 %4 = sext i32 %n to i64 95 br label %for.body 96 97for.body: 98 %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] 99 %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ] 100 %mul = shl nsw i64 %i.040, 2 101 %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul 102 %5 = load float, float* %arrayidx2, align 4 103 %mul3 = fmul float %0, %5 104 %add35 = or i64 %mul, 1 105 %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add35 106 %6 = load float, float* %arrayidx6, align 4 107 %mul7 = fmul float %1, %6 108 %add8 = fadd fast float %mul3, %mul7 109 %add1136 = or i64 %mul, 2 110 %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1136 111 %7 = load float, float* %arrayidx12, align 4 112 %mul13 = fmul float %2, %7 113 %add14 = fadd fast float %add8, %mul13 114 %add1737 = or i64 %mul, 3 115 %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1737 116 %8 = load float, float* %arrayidx18, align 4 117 %mul19 = fmul float 
%3, %8 118 %add20 = fadd fast float %add14, %mul19 119 %mul21 = fmul float %sum.039, %add20 120 %inc = add nsw i64 %i.040, 1 121 %exitcond = icmp eq i64 %inc, %4 122 br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body 123 124for.cond.for.end_crit_edge: 125 %phitmp = fptosi float %mul21 to i32 126 br label %for.end 127 128for.end: 129 %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] 130 ret i32 %sum.0.lcssa 131} 132 133; int foo(float * restrict A, float * restrict B, int n) { 134; float sum = 0; 135; for (intptr_t i=0; i < n; ++i) { 136; sum += B[0]*A[i*6 ] + 137; B[1]*A[i*6+1] + 138; B[2]*A[i*6+2] + 139; B[3]*A[i*6+3] + 140; B[4]*A[i*6+4] + 141; B[5]*A[i*6+5] + 142; B[6]*A[i*6+6] + 143; B[7]*A[i*6+7] + 144; B[8]*A[i*6+8]; 145; } 146; return sum; 147; } 148 149; CHECK-LABEL: long_red 150; CHECK: fmul fast <8 x float> 151; CHECK: shufflevector <8 x float> 152 153define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) { 154entry: 155 %cmp81 = icmp sgt i32 %n, 0 156 br i1 %cmp81, label %for.body.lr.ph, label %for.end 157 158for.body.lr.ph: 159 %0 = load float, float* %B, align 4 160 %arrayidx4 = getelementptr inbounds float, float* %B, i64 1 161 %1 = load float, float* %arrayidx4, align 4 162 %arrayidx9 = getelementptr inbounds float, float* %B, i64 2 163 %2 = load float, float* %arrayidx9, align 4 164 %arrayidx15 = getelementptr inbounds float, float* %B, i64 3 165 %3 = load float, float* %arrayidx15, align 4 166 %arrayidx21 = getelementptr inbounds float, float* %B, i64 4 167 %4 = load float, float* %arrayidx21, align 4 168 %arrayidx27 = getelementptr inbounds float, float* %B, i64 5 169 %5 = load float, float* %arrayidx27, align 4 170 %arrayidx33 = getelementptr inbounds float, float* %B, i64 6 171 %6 = load float, float* %arrayidx33, align 4 172 %arrayidx39 = getelementptr inbounds float, float* %B, i64 7 173 %7 = load float, float* %arrayidx39, align 4 174 %arrayidx45 = getelementptr inbounds float, float* 
%B, i64 8 175 %8 = load float, float* %arrayidx45, align 4 176 %9 = sext i32 %n to i64 177 br label %for.body 178 179for.body: 180 %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] 181 %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ] 182 %mul = mul nsw i64 %i.083, 6 183 %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul 184 %10 = load float, float* %arrayidx2, align 4 185 %mul3 = fmul fast float %0, %10 186 %add80 = or i64 %mul, 1 187 %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add80 188 %11 = load float, float* %arrayidx6, align 4 189 %mul7 = fmul fast float %1, %11 190 %add8 = fadd fast float %mul3, %mul7 191 %add11 = add nsw i64 %mul, 2 192 %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add11 193 %12 = load float, float* %arrayidx12, align 4 194 %mul13 = fmul fast float %2, %12 195 %add14 = fadd fast float %add8, %mul13 196 %add17 = add nsw i64 %mul, 3 197 %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add17 198 %13 = load float, float* %arrayidx18, align 4 199 %mul19 = fmul fast float %3, %13 200 %add20 = fadd fast float %add14, %mul19 201 %add23 = add nsw i64 %mul, 4 202 %arrayidx24 = getelementptr inbounds float, float* %A, i64 %add23 203 %14 = load float, float* %arrayidx24, align 4 204 %mul25 = fmul fast float %4, %14 205 %add26 = fadd fast float %add20, %mul25 206 %add29 = add nsw i64 %mul, 5 207 %arrayidx30 = getelementptr inbounds float, float* %A, i64 %add29 208 %15 = load float, float* %arrayidx30, align 4 209 %mul31 = fmul fast float %5, %15 210 %add32 = fadd fast float %add26, %mul31 211 %add35 = add nsw i64 %mul, 6 212 %arrayidx36 = getelementptr inbounds float, float* %A, i64 %add35 213 %16 = load float, float* %arrayidx36, align 4 214 %mul37 = fmul fast float %6, %16 215 %add38 = fadd fast float %add32, %mul37 216 %add41 = add nsw i64 %mul, 7 217 %arrayidx42 = getelementptr inbounds float, float* %A, i64 %add41 218 %17 = load float, float* %arrayidx42, 
align 4 219 %mul43 = fmul fast float %7, %17 220 %add44 = fadd fast float %add38, %mul43 221 %add47 = add nsw i64 %mul, 8 222 %arrayidx48 = getelementptr inbounds float, float* %A, i64 %add47 223 %18 = load float, float* %arrayidx48, align 4 224 %mul49 = fmul fast float %8, %18 225 %add50 = fadd fast float %add44, %mul49 226 %add51 = fadd fast float %sum.082, %add50 227 %inc = add nsw i64 %i.083, 1 228 %exitcond = icmp eq i64 %inc, %9 229 br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body 230 231for.cond.for.end_crit_edge: 232 %phitmp = fptosi float %add51 to i32 233 br label %for.end 234 235for.end: 236 %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] 237 ret i32 %sum.0.lcssa 238} 239 240; int foo(float * restrict A, float * restrict B, int n) { 241; float sum = 0; 242; for (intptr_t i=0; i < n; ++i) { 243; sum += B[0]*A[i*4 ]; 244; sum += B[1]*A[i*4+1]; 245; sum += B[2]*A[i*4+2]; 246; sum += B[3]*A[i*4+3]; 247; } 248; return sum; 249; } 250 251; CHECK-LABEL: chain_red 252; CHECK: fmul fast <4 x float> 253; CHECK: shufflevector <4 x float> 254 255define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) { 256entry: 257 %cmp41 = icmp sgt i32 %n, 0 258 br i1 %cmp41, label %for.body.lr.ph, label %for.end 259 260for.body.lr.ph: 261 %0 = load float, float* %B, align 4 262 %arrayidx4 = getelementptr inbounds float, float* %B, i64 1 263 %1 = load float, float* %arrayidx4, align 4 264 %arrayidx10 = getelementptr inbounds float, float* %B, i64 2 265 %2 = load float, float* %arrayidx10, align 4 266 %arrayidx16 = getelementptr inbounds float, float* %B, i64 3 267 %3 = load float, float* %arrayidx16, align 4 268 %4 = sext i32 %n to i64 269 br label %for.body 270 271for.body: 272 %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] 273 %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ] 274 %mul = shl nsw i64 %i.043, 2 275 %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul 
276 %5 = load float, float* %arrayidx2, align 4 277 %mul3 = fmul fast float %0, %5 278 %add = fadd fast float %sum.042, %mul3 279 %add638 = or i64 %mul, 1 280 %arrayidx7 = getelementptr inbounds float, float* %A, i64 %add638 281 %6 = load float, float* %arrayidx7, align 4 282 %mul8 = fmul fast float %1, %6 283 %add9 = fadd fast float %add, %mul8 284 %add1239 = or i64 %mul, 2 285 %arrayidx13 = getelementptr inbounds float, float* %A, i64 %add1239 286 %7 = load float, float* %arrayidx13, align 4 287 %mul14 = fmul fast float %2, %7 288 %add15 = fadd fast float %add9, %mul14 289 %add1840 = or i64 %mul, 3 290 %arrayidx19 = getelementptr inbounds float, float* %A, i64 %add1840 291 %8 = load float, float* %arrayidx19, align 4 292 %mul20 = fmul fast float %3, %8 293 %add21 = fadd fast float %add15, %mul20 294 %inc = add nsw i64 %i.043, 1 295 %exitcond = icmp eq i64 %inc, %4 296 br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body 297 298for.cond.for.end_crit_edge: 299 %phitmp = fptosi float %add21 to i32 300 br label %for.end 301 302for.end: 303 %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] 304 ret i32 %sum.0.lcssa 305} 306 307; void foo(const float *arg_A, unsigned arg_B, float *array) { 308; for (uint32_t i = 0; i < 6; ++i) { 309; const float *ptr = arg_A + i; 310; float w0 = array[i * 4 + 0]; 311; float w1 = array[i * 4 + 1]; 312; float w2 = array[i * 4 + 2]; 313; float w3 = array[i * 4 + 3]; 314; 315; for (unsigned j = 0; j < arg_B; ++j) { 316; const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1); 317; const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1); 318; const float x3 = x2 - (-3.1f * w2) - (3.2f * w3); 319; const float x4 = x3 + (-4.0f * w2) + w3; 320; w1 = w0; 321; w0 = x1; 322; w3 = w2; 323; w2 = x3; 324; } 325; 326; array[i * 4 + 0] = w0; 327; array[i * 4 + 1] = w1; 328; array[i * 4 + 2] = w2; 329; array[i * 4 + 3] = w3; 330; } 331; } 332 333define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, 
float* nocapture %array) { 334; CHECK-LABEL: @foo( 335; CHECK: fmul fast <4 x float> 336; CHECK: shufflevector <4 x float> 337; 338entry: 339 %cmp1495 = icmp eq i32 %arg_B, 0 340 br label %for.body 341 342for.cond.cleanup: ; preds = %for.cond.cleanup15 343 ret void 344 345for.body: ; preds = %for.cond.cleanup15, %entry 346 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ] 347 %0 = shl i64 %indvars.iv, 2 348 %arrayidx = getelementptr inbounds float, float* %array, i64 %0 349 %1 = load float, float* %arrayidx, align 4 350 %2 = or i64 %0, 1 351 %arrayidx4 = getelementptr inbounds float, float* %array, i64 %2 352 %3 = load float, float* %arrayidx4, align 4 353 %4 = or i64 %0, 2 354 %arrayidx8 = getelementptr inbounds float, float* %array, i64 %4 355 %5 = load float, float* %arrayidx8, align 4 356 %6 = or i64 %0, 3 357 %arrayidx12 = getelementptr inbounds float, float* %array, i64 %6 358 %7 = load float, float* %arrayidx12, align 4 359 br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph 360 361for.body16.lr.ph: ; preds = %for.body 362 %add.ptr = getelementptr inbounds float, float* %arg_A, i64 %indvars.iv 363 %8 = load float, float* %add.ptr, align 4 364 br label %for.body16 365 366for.cond.cleanup15: ; preds = %for.body16, %for.body 367 %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ] 368 %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ] 369 %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ] 370 %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ] 371 store float %w0.0.lcssa, float* %arrayidx, align 4 372 store float %w1.0.lcssa, float* %arrayidx4, align 4 373 store float %w2.0.lcssa, float* %arrayidx8, align 4 374 store float %w3.0.lcssa, float* %arrayidx12, align 4 375 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 376 %exitcond109 = icmp eq i64 %indvars.iv.next, 6 377 br i1 %exitcond109, label %for.cond.cleanup, label %for.body 378 379for.body16: ; 
preds = %for.body16, %for.body16.lr.ph 380 %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ] 381 %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ] 382 %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ] 383 %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ] 384 %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ] 385 %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000 386 %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000 387 %sub92 = fadd fast float %mul17, %mul18.neg 388 %sub19 = fadd fast float %sub92, %8 389 %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000 390 %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000 391 %mul23 = fmul fast float %w1.099, 0x4002666660000000 392 %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000 393 %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000 394 %add2293 = fadd fast float %mul27.neg, %mul25 395 %add24 = fadd fast float %add2293, %mul23 396 %sub2694 = fadd fast float %add24, %mul21.neg 397 %sub28 = fadd fast float %sub2694, %mul20 398 %inc = add nuw i32 %j.098, 1 399 %exitcond = icmp eq i32 %inc, %arg_B 400 br i1 %exitcond, label %for.cond.cleanup15, label %for.body16 401} 402 403; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE 404 405; void foo(double * restrict A, double * restrict B, double * restrict C, 406; int n) { 407; for (intptr_t i=0; i < n; ++i) { 408; C[i] = B[0] *A[i*4 ] + B[1] *A[i*4+1]; 409; } 410; } 411 412; STORE-LABEL: store_red_double 413; STORE: fmul fast <2 x double> 414; STORE: extractelement <2 x double> 415; STORE: extractelement <2 x double> 416 417define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) { 418entry: 419 %cmp17 = icmp sgt i32 %n, 0 420 br i1 %cmp17, label %for.body.lr.ph, label %for.end 421 422for.body.lr.ph: 423 %0 = 
load double, double* %B, align 8 424 %arrayidx4 = getelementptr inbounds double, double* %B, i64 1 425 %1 = load double, double* %arrayidx4, align 8 426 %2 = sext i32 %n to i64 427 br label %for.body 428 429for.body: 430 %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] 431 %mul = shl nsw i64 %i.018, 2 432 %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul 433 %3 = load double, double* %arrayidx2, align 8 434 %mul3 = fmul fast double %0, %3 435 %add16 = or i64 %mul, 1 436 %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16 437 %4 = load double, double* %arrayidx6, align 8 438 %mul7 = fmul fast double %1, %4 439 %add8 = fadd fast double %mul3, %mul7 440 %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018 441 store double %add8, double* %arrayidx9, align 8 442 %inc = add nsw i64 %i.018, 1 443 %exitcond = icmp eq i64 %inc, %2 444 br i1 %exitcond, label %for.end, label %for.body 445 446for.end: 447 ret void 448} 449 450; int foo(float * restrict A, float * restrict B, float * restrict C, int n) { 451; float sum = 0; 452; for (intptr_t i=0; i < n; ++i) { 453; C[i] = B[0] *A[i*4 ] + 454; B[1] *A[i*4+1] + 455; B[2] *A[i*4+2] + 456; B[3] *A[i*4+3]; 457; } 458; return sum; 459; } 460 461; STORE-LABEL: store_red 462; STORE: fmul fast <4 x float> 463; STORE: shufflevector <4 x float> 464 465define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) { 466entry: 467 %cmp37 = icmp sgt i32 %n, 0 468 br i1 %cmp37, label %for.body.lr.ph, label %for.end 469 470for.body.lr.ph: 471 %arrayidx4 = getelementptr inbounds float, float* %B, i64 1 472 %arrayidx9 = getelementptr inbounds float, float* %B, i64 2 473 %arrayidx15 = getelementptr inbounds float, float* %B, i64 3 474 %0 = sext i32 %n to i64 475 br label %for.body 476 477for.body: 478 %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] 479 %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ] 480 %1 = load float, 
float* %B, align 4 481 %mul = shl nsw i64 %i.039, 2 482 %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul 483 %2 = load float, float* %arrayidx2, align 4 484 %mul3 = fmul fast float %1, %2 485 %3 = load float, float* %arrayidx4, align 4 486 %add34 = or i64 %mul, 1 487 %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add34 488 %4 = load float, float* %arrayidx6, align 4 489 %mul7 = fmul fast float %3, %4 490 %add8 = fadd fast float %mul3, %mul7 491 %5 = load float, float* %arrayidx9, align 4 492 %add1135 = or i64 %mul, 2 493 %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1135 494 %6 = load float, float* %arrayidx12, align 4 495 %mul13 = fmul fast float %5, %6 496 %add14 = fadd fast float %add8, %mul13 497 %7 = load float, float* %arrayidx15, align 4 498 %add1736 = or i64 %mul, 3 499 %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1736 500 %8 = load float, float* %arrayidx18, align 4 501 %mul19 = fmul fast float %7, %8 502 %add20 = fadd fast float %add14, %mul19 503 store float %add20, float* %C.addr.038, align 4 504 %incdec.ptr = getelementptr inbounds float, float* %C.addr.038, i64 1 505 %inc = add nsw i64 %i.039, 1 506 %exitcond = icmp eq i64 %inc, %0 507 br i1 %exitcond, label %for.end, label %for.body 508 509for.end: 510 ret i32 0 511} 512 513@arr_i32 = global [32 x i32] zeroinitializer, align 16 514@arr_float = global [32 x float] zeroinitializer, align 16 515 516define void @float_red_example4(float* %res) { 517; STORE-LABEL: @float_red_example4( 518; STORE: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16 519; STORE: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> 520; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP0]], [[RDX_SHUF]] 521; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, 
i32 undef> 522; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] 523; STORE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 524; STORE: store float [[TMP1]], float* %res, align 16 525; STORE-NEXT: ret void 526; 527entry: 528 %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16 529 %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4 530 %add = fadd fast float %1, %0 531 %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8 532 %add.1 = fadd fast float %2, %add 533 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4 534 %add.2 = fadd fast float %3, %add.1 535 store float %add.2, float* %res, align 16 536 ret void 537} 538 539define void @float_red_example8(float* %res) { 540; STORE-LABEL: @float_red_example8( 541; STORE: [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 16 542; STORE: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 543; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP0]], [[RDX_SHUF]] 544; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 545; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]] 546; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 547; STORE-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] 548; STORE-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 
549; STORE: store float [[TMP1]], float* %res, align 16 550; STORE-NEXT: ret void 551; 552entry: 553 %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16 554 %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4 555 %add = fadd fast float %1, %0 556 %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8 557 %add.1 = fadd fast float %2, %add 558 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4 559 %add.2 = fadd fast float %3, %add.1 560 %4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16 561 %add.3 = fadd fast float %4, %add.2 562 %5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4 563 %add.4 = fadd fast float %5, %add.3 564 %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8 565 %add.5 = fadd fast float %6, %add.4 566 %7 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4 567 %add.6 = fadd fast float %7, %add.5 568 store float %add.6, float* %res, align 16 569 ret void 570} 571 572define void @float_red_example16(float* %res) { 573; STORE-LABEL: @float_red_example16( 574; STORE: [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 16 575; STORE: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 576; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP0]], [[RDX_SHUF]] 577; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, 
<16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 578; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] 579; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 580; STORE-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] 581; STORE-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 582; STORE-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] 583; STORE-NEXT: [[TMP1:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 584; STORE: store float [[TMP1]], float* %res, align 16 585; STORE-NEXT: ret void 586; 587entry: 588 %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16 589 %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4 590 %add = fadd fast float %1, %0 591 %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8 592 %add.1 = fadd fast float %2, %add 593 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4 594 %add.2 = fadd fast float %3, %add.1 595 %4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16 596 %add.3 = fadd fast float %4, %add.2 597 %5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, 
i64 5), align 4 598 %add.4 = fadd fast float %5, %add.3 599 %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8 600 %add.5 = fadd fast float %6, %add.4 601 %7 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4 602 %add.6 = fadd fast float %7, %add.5 603 %8 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 8), align 16 604 %add.7 = fadd fast float %8, %add.6 605 %9 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 9), align 4 606 %add.8 = fadd fast float %9, %add.7 607 %10 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 10), align 8 608 %add.9 = fadd fast float %10, %add.8 609 %11 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 11), align 4 610 %add.10 = fadd fast float %11, %add.9 611 %12 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 12), align 16 612 %add.11 = fadd fast float %12, %add.10 613 %13 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 13), align 4 614 %add.12 = fadd fast float %13, %add.11 615 %14 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 14), align 8 616 %add.13 = fadd fast float %14, %add.12 617 %15 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 15), align 4 618 %add.14 = fadd fast float %15, %add.13 619 store float %add.14, float* %res, align 16 620 ret void 621} 622 623define void @i32_red_example4(i32* %res) { 624; STORE-LABEL: @i32_red_example4( 625; STORE: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16 626; STORE: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 2, i32 
3, i32 undef, i32 undef> 627; STORE-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP0]], [[RDX_SHUF]] 628; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 629; STORE-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] 630; STORE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 631; STORE: store i32 [[TMP1]], i32* %res, align 16 632; STORE-NEXT: ret void 633; 634entry: 635 %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 636 %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 637 %add = add nsw i32 %1, %0 638 %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8 639 %add.1 = add nsw i32 %2, %add 640 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4 641 %add.2 = add nsw i32 %3, %add.1 642 store i32 %add.2, i32* %res, align 16 643 ret void 644} 645 646define void @i32_red_example8(i32* %res) { 647; STORE-LABEL: @i32_red_example8( 648; STORE: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 649; STORE: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 650; STORE-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP0]], [[RDX_SHUF]] 651; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 652; STORE-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] 653; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 654; STORE-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> 
[[BIN_RDX2]], [[RDX_SHUF3]] 655; STORE-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 656; STORE: store i32 [[TMP1]], i32* %res, align 16 657; STORE-NEXT: ret void 658; 659entry: 660 %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 661 %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 662 %add = add nsw i32 %1, %0 663 %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8 664 %add.1 = add nsw i32 %2, %add 665 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4 666 %add.2 = add nsw i32 %3, %add.1 667 %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16 668 %add.3 = add nsw i32 %4, %add.2 669 %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4 670 %add.4 = add nsw i32 %5, %add.3 671 %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8 672 %add.5 = add nsw i32 %6, %add.4 673 %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4 674 %add.6 = add nsw i32 %7, %add.5 675 store i32 %add.6, i32* %res, align 16 676 ret void 677} 678 679define void @i32_red_example16(i32* %res) { 680; STORE-LABEL: @i32_red_example16( 681; STORE: [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16 682; STORE: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 683; STORE-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP0]], [[RDX_SHUF]] 684; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, 
i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 685; STORE-NEXT: [[BIN_RDX2:%.*]] = add <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]] 686; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 687; STORE-NEXT: [[BIN_RDX4:%.*]] = add <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] 688; STORE-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 689; STORE-NEXT: [[BIN_RDX6:%.*]] = add <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] 690; STORE-NEXT: [[TMP1:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 691; STORE: store i32 [[TMP1]], i32* %res, align 16 692; STORE-NEXT: ret void 693; 694entry: 695 %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 696 %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 697 %add = add nsw i32 %1, %0 698 %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8 699 %add.1 = add nsw i32 %2, %add 700 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4 701 %add.2 = add nsw i32 %3, %add.1 702 %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16 703 %add.3 = add nsw i32 %4, %add.2 704 %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4 705 %add.4 = add nsw i32 %5, %add.3 706 %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8 
707 %add.5 = add nsw i32 %6, %add.4 708 %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4 709 %add.6 = add nsw i32 %7, %add.5 710 %8 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16 711 %add.7 = add nsw i32 %8, %add.6 712 %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4 713 %add.8 = add nsw i32 %9, %add.7 714 %10 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8 715 %add.9 = add nsw i32 %10, %add.8 716 %11 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4 717 %add.10 = add nsw i32 %11, %add.9 718 %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16 719 %add.11 = add nsw i32 %12, %add.10 720 %13 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4 721 %add.12 = add nsw i32 %13, %add.11 722 %14 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8 723 %add.13 = add nsw i32 %14, %add.12 724 %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4 725 %add.14 = add nsw i32 %15, %add.13 726 store i32 %add.14, i32* %res, align 16 727 ret void 728} 729
; NOTE(review): CHECK lines here are autogenerated (utils/update_test_checks.py,
; see the RUN line at the top of the file) — do not hand-edit the STORE
; assertions or rename/reorder the scalar IR below, or the test will fail.
; @i32_red_example32: a fully-unrolled sequential sum of the 32 i32 elements
; of @arr_i32 (indices 0..31), stored to %res. The STORE lines expect the SLP
; vectorizer to replace the 32 scalar loads + adds with a single <32 x i32>
; load followed by a log2(32)-step shufflevector/add horizontal reduction and
; a final extractelement of lane 0.
730define void @i32_red_example32(i32* %res) { 731; STORE-LABEL: @i32_red_example32( 732; STORE: [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16 733; STORE: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
734; STORE-NEXT: [[BIN_RDX:%.*]] = add <32 x i32> [[TMP0]], [[RDX_SHUF]] 735; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 736; STORE-NEXT: [[BIN_RDX2:%.*]] = add <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]] 737; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 738; STORE-NEXT: [[BIN_RDX4:%.*]] = add <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] 739; STORE-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 740; STORE-NEXT: [[BIN_RDX6:%.*]] = add <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] 741; STORE-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 742; STORE-NEXT: [[BIN_RDX8:%.*]] = add <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]] 743; STORE-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0 744; STORE: store i32 [[TMP1]], i32* %res, align 16 745; STORE-NEXT: ret void 746; 747entry: 748 %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 749 %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 750 %add = add nsw i32 %1, %0 751 %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8 752 %add.1 = add nsw i32 %2, %add 753 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4 754 %add.2 = add nsw i32 %3, %add.1 755 %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16 756 %add.3 = add nsw i32 %4, %add.2 757 %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4 758 %add.4 = add nsw i32 %5, %add.3 759 %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8 760 %add.5 = add nsw i32 %6, %add.4 761 %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4 762 %add.6 = add nsw i32 %7, %add.5 763 %8 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16 764 %add.7 = add nsw i32 %8, %add.6 765 %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4 766 %add.8 = add nsw i32 %9, %add.7 767 %10 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8 768 %add.9 = add nsw i32 %10, %add.8 769 %11 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4 770 %add.10 = add nsw i32 %11, %add.9 771
%12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16 772 %add.11 = add nsw i32 %12, %add.10 773 %13 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4 774 %add.12 = add nsw i32 %13, %add.11 775 %14 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8 776 %add.13 = add nsw i32 %14, %add.12 777 %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4 778 %add.14 = add nsw i32 %15, %add.13 779 %16 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 16), align 16 780 %add.15 = add nsw i32 %16, %add.14 781 %17 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 17), align 4 782 %add.16 = add nsw i32 %17, %add.15 783 %18 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 18), align 8 784 %add.17 = add nsw i32 %18, %add.16 785 %19 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 19), align 4 786 %add.18 = add nsw i32 %19, %add.17 787 %20 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 20), align 16 788 %add.19 = add nsw i32 %20, %add.18 789 %21 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 21), align 4 790 %add.20 = add nsw i32 %21, %add.19 791 %22 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 22), align 8 792 %add.21 = add nsw i32 %22, %add.20 793 %23 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 23), align 4 794 %add.22 = add nsw i32 %23, %add.21 795 %24 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 24), align 16 796 %add.23 = add nsw i32 %24, %add.22 797 %25 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 25),
align 4 798 %add.24 = add nsw i32 %25, %add.23 799 %26 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 26), align 8 800 %add.25 = add nsw i32 %26, %add.24 801 %27 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 27), align 4 802 %add.26 = add nsw i32 %27, %add.25 803 %28 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 28), align 16 804 %add.27 = add nsw i32 %28, %add.26 805 %29 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 29), align 4 806 %add.28 = add nsw i32 %29, %add.27 807 %30 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 30), align 8 808 %add.29 = add nsw i32 %30, %add.28 809 %31 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 31), align 4 810 %add.30 = add nsw i32 %31, %add.29 811 store i32 %add.30, i32* %res, align 16 812 ret void 813} 814 815