1; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf \ 2; RUN: -enable-arm-maskedgatscat=false \ 3; RUN: -tail-predication=enabled -loop-vectorize -S < %s | \ 4; RUN: FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING 5 6; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=-mve \ 7; RUN: -tail-predication=enabled -loop-vectorize \ 8; RUN: -enable-arm-maskedgatscat=false \ 9; RUN: -enable-arm-maskedldst=true -S < %s | \ 10; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING 11 12; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \ 13; RUN: -tail-predication=enabled -loop-vectorize \ 14; RUN: -enable-arm-maskedgatscat=false \ 15; RUN: -enable-arm-maskedldst=false -S < %s | \ 16; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING 17 18; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \ 19; RUN: -tail-predication=disabled -loop-vectorize \ 20; RUN: -enable-arm-maskedgatscat=false \ 21; RUN: -enable-arm-maskedldst=true -S < %s | \ 22; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING 23 24; Disabling the low-overhead branch extension will make 25; 'isHardwareLoopProfitable' return false, so that we test avoiding folding for 26; these cases. 
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob \
; RUN: -tail-predication=enabled -loop-vectorize \
; RUN: -enable-arm-maskedgatscat=false \
; RUN: -enable-arm-maskedldst=true -S < %s | \
; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; MVE with FP enabled and tail-predication on: folding is preferred.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN: -tail-predication=enabled -loop-vectorize \
; RUN: -enable-arm-maskedgatscat=false \
; RUN: -enable-arm-maskedldst=true -S < %s | \
; RUN: FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING

; Explicitly preferring a scalar epilogue overrides tail-predication,
; so no folding is expected.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -tail-predication=enabled -loop-vectorize \
; RUN: -enable-arm-maskedgatscat=false \
; RUN: -enable-arm-maskedldst=true -S < %s | \
; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; Forcing predication (predicate-dont-vectorize): only the common CHECK
; prefix is verified for this configuration.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
; RUN: -tail-predication=enabled -loop-vectorize \
; RUN: -enable-arm-maskedgatscat=false \
; RUN: -enable-arm-maskedldst=true -S < %s | \
; RUN: FileCheck %s -check-prefixes=CHECK

; Simple i32 add loop with a trip count of 431 (not a multiple of the VF=4):
; with folding preferred, the vector body must use an active-lane mask and
; masked loads/stores; otherwise only an ordinary exit branch is expected.
define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: prefer_folding(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 431)
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
;
; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
; NO-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %for.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

}

; A loop mixing i16 and i32 loads/stores: the checks expect masked operations
; for both element widths when folding is preferred.
define void @mixed_types(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32* noalias nocapture %D, i32* noalias nocapture readonly %E, i32* noalias nocapture readonly %F) #0 {
; CHECK-LABEL: mixed_types(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
; PREFER-FOLDING: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
; PREFER-FOLDING: call void @llvm.masked.store.v4i16.p0v4i16
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.018 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %B, i32 %i.018
  %0 = load i16, i16* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds i16, i16* %C, i32 %i.018
  %1 = load i16, i16* %arrayidx1, align 2
  %add = add i16 %1, %0
  %arrayidx4 = getelementptr inbounds i16, i16* %A, i32 %i.018
  store i16 %add, i16* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds i32, i32* %E, i32 %i.018
  %2 = load i32, i32* %arrayidx5, align 4
  %arrayidx6 = getelementptr inbounds i32, i32* %F, i32 %i.018
  %3 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 %3, %2
  %arrayidx8 = getelementptr inbounds i32, i32* %D, i32 %i.018
  store i32 %add7, i32* %arrayidx8, align 4
  %add9 = add nuw nsw i32 %i.018, 1
  %exitcond = icmp eq i32 %add9, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A zext of a loaded i8 does not block folding: masked v4i8/v4i32 ops expected.
define void @zero_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: zero_extending_load_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %conv
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Same as above but with a sext: also allowed to fold.
define void @sign_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: sign_extending_load_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
  %0 = load i8, i8* %arrayidx, align 1
  %conv = sext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %conv
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A trunc before the i8 store does not block folding: masked v4i8 store expected.
define void @narrowing_store_allowed(i8* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: narrowing_store_allowed(
; PREFER-FOLDING: call void @llvm.masked.store.v4i8.p0v4i8
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %conv = trunc i32 %add to i8
  %arrayidx2 = getelementptr inbounds i8, i8* %A, i32 %i.09
  store i8 %conv, i8* %arrayidx2, align 1
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

@tab = common global [32 x i8] zeroinitializer, align 1

; A loop whose body contains an icmp+select: the checks require that NO
; masked loads/stores appear even in the folding-preferred configurations.
define i32 @icmp_not_allowed() #0 {
; CHECK-LABEL: icmp_not_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.body:
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
  %0 = load i8, i8* %arrayidx, align 1
  %cmp1 = icmp eq i8 %0, 0
  %. = select i1 %cmp1, i8 2, i8 1
  store i8 %., i8* %arrayidx, align 1
  %inc = add nsw i32 %i.08, 1
  %exitcond = icmp slt i32 %inc, 1000
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret i32 0
}

@ftab = common global [32 x float] zeroinitializer, align 1

; Floating-point variant of the above: an fcmp+select likewise must not
; produce masked loads/stores.
define float @fcmp_not_allowed() #0 {
; CHECK-LABEL: fcmp_not_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.body:
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %arrayidx = getelementptr inbounds [32 x float], [32 x float]* @ftab, i32 0, i32 %i.08
  %0 = load float, float* %arrayidx, align 4
  %cmp1 = fcmp oeq float %0, 0.000000e+00
  %. = select i1 %cmp1, float 2.000000e+00, float 1.000000e+00
  store float %., float* %arrayidx, align 4
  %inc = add nsw i32 %i.08, 1
  %exitcond = icmp slt i32 %inc, 999
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret float 0.000000e+00
}

; Loop metadata !7 (llvm.loop.vectorize.predicate.enable = false, defined at
; the bottom of the file) disables predication for this loop, so no masked
; operations may be emitted even when folding is otherwise preferred.
define void @pragma_vect_predicate_disable(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: pragma_vect_predicate_disable(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !7
}

; Induction step of 4 (non-unit stride through memory): the checks require
; that no masked loads/stores are generated. Metadata !5 force-enables
; vectorization so the vector.body check still applies.
define void @stride_4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: stride_4(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 4
  %cmp = icmp ult i32 %add3, 731
  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !5
}

; Half-precision fadd loop: folds with VF=8 (masked v8f16 loads/stores).
define dso_local void @half(half* noalias nocapture %A, half* noalias nocapture readonly %B, half* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: half(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
; PREFER-FOLDING: call void @llvm.masked.store.v8f16.p0v8f16
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
  %0 = load half, half* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds half, half* %C, i32 %i.09
  %1 = load half, half* %arrayidx1, align 2
  %add = fadd fast half %1, %0
  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
  store half %add, half* %arrayidx2, align 2
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Single-precision fadd loop: metadata !10 pins the vectorization width to 4;
; the checks verify the full predicated structure (lane mask, masked memory
; ops, index update by 4).
define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: float(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 431)
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: %index.next = add i32 %index, 4
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
  store float %add, float* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}

; Despite the "allowed" name, the checks require that an fpext in the loop
; produces NO masked loads/stores in the folding-preferred configurations.
define void @fpext_allowed(float* noalias nocapture %A, half* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: fpext_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
  %0 = load half, half* %arrayidx, align 2
  %conv = fpext half %0 to float
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %conv
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
  store float %add, float* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Same expectation for fptrunc before the store: no masked loads/stores.
define void @fptrunc_allowed(half* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: fptrunc_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %conv = fptrunc float %add to half
  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
  store half %conv, half* %arrayidx2, align 2
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }

; !5/!6: force-enable vectorization (used by @stride_4).
!5 = distinct !{!5, !6}
!6 = !{!"llvm.loop.vectorize.enable", i1 true}

; !7/!8: disable predication (used by @pragma_vect_predicate_disable).
!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}

; !10/!11: fix the vectorization width at 4 (used by @float).
!10 = distinct !{!10, !11}
!11 = !{!"llvm.loop.vectorize.width", i32 4}