; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -verify | FileCheck %s -check-prefix=ENABLED
;
; Without supernode operand reordering, this does not get fully vectorized.
; S[0] = (A[0] + B[0]) + C[0]
; S[1] = (B[1] + C[1]) + A[1]
define void @test_supernode_add(double* %Aarray, double* %Barray, double* %Carray, double* %Sarray) {
; ENABLED-LABEL: @test_supernode_add(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1
; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
; ENABLED-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; ENABLED-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; ENABLED-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; ENABLED-NEXT:    [[C1:%.*]] = load double, double* [[IDXC1]], align 8
; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
; ENABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
; ENABLED-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; ENABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; ENABLED-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1
; ENABLED-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]]
; ENABLED-NEXT:    [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; ENABLED-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
  %idxC0 = getelementptr inbounds double, double* %Carray, i64 0
  %idxC1 = getelementptr inbounds double, double* %Carray, i64 1
  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1

  %A0 = load double, double* %idxA0, align 8
  %A1 = load double, double* %idxA1, align 8

  %B0 = load double, double* %idxB0, align 8
  %B1 = load double, double* %idxB1, align 8

  %C0 = load double, double* %idxC0, align 8
  %C1 = load double, double* %idxC1, align 8

  %addA0B0 = fadd fast double %A0, %B0
  %addB1C1 = fadd fast double %B1, %C1
  %add0 = fadd fast double %addA0B0, %C0
  %add1 = fadd fast double %addB1C1, %A1
  store double %add0, double* %idxS0, align 8
  store double %add1, double* %idxS1, align 8
  ret void
}

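; In the checks above, operand reordering lets both levels of fadds form
; <2 x double> bundles: the consecutive B[0:1] loads become a single vector
; load, while the non-consecutive operands {A0, C1} and {C0, A1} are gathered
; with insertelement.
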
; Without supernode operand reordering, this does not get fully vectorized.
; S[0] = (A[0] - B[0]) + C[0]
; S[1] = (C[1] - B[1]) + A[1]
define void @test_supernode_addsub(double* %Aarray, double* %Barray, double* %Carray, double* %Sarray) {
; ENABLED-LABEL: @test_supernode_addsub(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1
; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
; ENABLED-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; ENABLED-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; ENABLED-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; ENABLED-NEXT:    [[C1:%.*]] = load double, double* [[IDXC1]], align 8
; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
; ENABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
; ENABLED-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]]
; ENABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; ENABLED-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1
; ENABLED-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]]
; ENABLED-NEXT:    [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; ENABLED-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
  %idxC0 = getelementptr inbounds double, double* %Carray, i64 0
  %idxC1 = getelementptr inbounds double, double* %Carray, i64 1
  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1

  %A0 = load double, double* %idxA0, align 8
  %A1 = load double, double* %idxA1, align 8

  %B0 = load double, double* %idxB0, align 8
  %B1 = load double, double* %idxB1, align 8

  %C0 = load double, double* %idxC0, align 8
  %C1 = load double, double* %idxC1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC1B1 = fsub fast double %C1, %B1
  %add0 = fadd fast double %subA0B0, %C0
  %add1 = fadd fast double %subC1B1, %A1
  store double %add0, double* %idxS0, align 8
  store double %add1, double* %idxS1, align 8
  ret void
}

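; Likewise in the checks above: because fsub is not commutative, the two
; minuends {A0, C1} are gathered into one operand vector and the consecutive
; B[0:1] loads feed the subtrahend, giving one vector fsub followed by one
; vector fadd.
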
; Without supernode operand reordering, this does not get fully vectorized.
; This checks that the super-node works with alternate sequences.
;
; S[0] = (A[0] - B[0]) - C[0]
; S[1] = (B[1] + C[1]) + A[1]
define void @test_supernode_addsub_alt(double* %Aarray, double* %Barray, double* %Carray, double* %Sarray) {
; ENABLED-LABEL: @test_supernode_addsub_alt(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
; ENABLED-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1
; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1
; ENABLED-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
; ENABLED-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; ENABLED-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
; ENABLED-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
; ENABLED-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; ENABLED-NEXT:    [[C1:%.*]] = load double, double* [[IDXC1]], align 8
; ENABLED-NEXT:    [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]]
; ENABLED-NEXT:    [[ADDB1C1:%.*]] = fadd fast double [[B1]], [[C1]]
; ENABLED-NEXT:    [[SUB0:%.*]] = fsub fast double [[SUBA0B0]], [[C0]]
; ENABLED-NEXT:    [[ADD1:%.*]] = fadd fast double [[ADDB1C1]], [[A1]]
; ENABLED-NEXT:    store double [[SUB0]], double* [[IDXS0]], align 8
; ENABLED-NEXT:    store double [[ADD1]], double* [[IDXS1]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
  %idxC0 = getelementptr inbounds double, double* %Carray, i64 0
  %idxC1 = getelementptr inbounds double, double* %Carray, i64 1
  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1

  %A0 = load double, double* %idxA0, align 8
  %A1 = load double, double* %idxA1, align 8

  %B0 = load double, double* %idxB0, align 8
  %B1 = load double, double* %idxB1, align 8

  %C0 = load double, double* %idxC0, align 8
  %C1 = load double, double* %idxC1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %addB1C1 = fadd fast double %B1, %C1
  %sub0 = fsub fast double %subA0B0, %C0
  %add1 = fadd fast double %addB1C1, %A1
  store double %sub0, double* %idxS0, align 8
  store double %add1, double* %idxS1, align 8
  ret void
}

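; Note that the generated checks above are fully scalar: with an fsub in
; lane 0 and an fadd in lane 1 at both levels, vectorization would need an
; alternate-opcode (mixed fadd/fsub) supernode, which is not formed for this
; pattern today.
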
; This checks that vectorizeTree() works correctly with the supernode
; and does not generate uses before defs.
; If all of the operands of the supernode are vectorizable, then the scheduler
; will fix their position in the program. If not, then the scheduler may not
; touch them, leading to uses before defs.
;
; A0 = ...
; C  = ...
; t1 = A0 + C
; B0 = ...
; t2 = t1 + B0
; A1 = ...
; B1 = ...
; t3 = A1 + B1
; D  = ...
; t4 = t3 + D
;
;
;   A0 C      A1 B1              A0 C      A1 D             A0:1 C,D
;    \ /       \ /   Reorder      \ /       \ /    Bundles     \  /
; t1  +  B0 t3  + D  ------->  t1  +  B0 t3  + B1  ------>  t1:3 + B0:1
;     |/        |/                 |/        |/                  |/
; t2  +     t4  +              t2  +     t4  +              t2:4  +
;
; After reordering, 'D' conceptually becomes an operand of t3:
;   t3 = A1 + D
; But in the original instruction order, D is defined *after* this new use,
; so the scheduler has to move D's definition up, or we would emit a use
; before its def.
;
define void @supernode_scheduling(double* %Aarray, double* %Barray, double* %Carray, double* %Darray, double* %Sarray) {
; ENABLED-LABEL: @supernode_scheduling(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
; ENABLED-NEXT:    [[IDXC:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXD:%.*]] = getelementptr inbounds double, double* [[DARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[C:%.*]] = load double, double* [[IDXC]], align 8
; ENABLED-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
; ENABLED-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; ENABLED-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
; ENABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B1]], i32 1
; ENABLED-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; ENABLED-NEXT:    [[D:%.*]] = load double, double* [[IDXD]], align 8
; ENABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
; ENABLED-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[D]], i32 1
; ENABLED-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]]
; ENABLED-NEXT:    [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; ENABLED-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
  %idxC = getelementptr inbounds double, double* %Carray, i64 0
  %idxD = getelementptr inbounds double, double* %Darray, i64 0
  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1

  %A0 = load double, double* %idxA0, align 8
  %C = load double, double* %idxC, align 8
  %t1 = fadd fast double %A0, %C
  %B0 = load double, double* %idxB0, align 8
  %t2 = fadd fast double %t1, %B0
  %A1 = load double, double* %idxA1, align 8
  %B1 = load double, double* %idxB1, align 8
  %t3 = fadd fast double %A1, %B1
  %D = load double, double* %idxD, align 8
  %t4 = fadd fast double %t3, %D

  store double %t2, double* %idxS0, align 8
  store double %t4, double* %idxS1, align 8
  ret void
}

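; In the checks above the scheduler gets this right: the load of D is emitted
; before the second fadd bundle that uses it, so no use appears before its
; def.
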
; The SLP scheduler has trouble moving instructions across blocks.
; Even though we could build a SuperNode for this example, we should not,
; because the scheduler cannot handle the cross-block instruction motion that
; would be required once the operands of the SuperNode are reordered.
;
; bb1:
;  A0 = ...
;  B1 = ...
;  Tmp0 = A0 + 2.0
;  Tmp1 = B1 + 2.0
;
; bb2:
;  A1 = ...
;  B0 = ...
;  S[0] = Tmp0 + B0
;  S[1] = Tmp1 + A1
define void @supernode_scheduling_cross_block(double* %Aarray, double* %Barray, double* %Sarray) {
; ENABLED-LABEL: @supernode_scheduling_cross_block(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
; ENABLED-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
; ENABLED-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
; ENABLED-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[B1]], i32 1
; ENABLED-NEXT:    [[TMP2:%.*]] = fadd fast <2 x double> [[TMP1]], <double 2.000000e+00, double 2.000000e+00>
; ENABLED-NEXT:    br label [[BB:%.*]]
; ENABLED:       bb:
; ENABLED-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; ENABLED-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
; ENABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
; ENABLED-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[A1]], i32 1
; ENABLED-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]]
; ENABLED-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; ENABLED-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1

  %A0 = load double, double* %idxA0, align 8
  %B1 = load double, double* %idxB1, align 8
  %Tmp0 = fadd fast double %A0, 2.0
  %Tmp1 = fadd fast double %B1, 2.0
  br label %bb

bb:
  %A1 = load double, double* %idxA1, align 8
  %B0 = load double, double* %idxB0, align 8

  %Sum0 = fadd fast double %Tmp0, %B0
  %Sum1 = fadd fast double %Tmp1, %A1

  store double %Sum0, double* %idxS0, align 8
  store double %Sum1, double* %idxS1, align 8
  ret void
}
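
; In the checks above each bundle stays in its original block: <A0, B1> + 2.0
; is computed in 'entry', and the gathered <B0, A1> is added to it in 'bb', so
; no instruction has to move across the block boundary.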