; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf \
; RUN:   -tail-predication=enabled -loop-vectorize -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=-mve \
; RUN:   -tail-predication=enabled -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
; RUN:   -tail-predication=enabled -loop-vectorize \
; RUN:   -enable-arm-maskedldst=false -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
; RUN:   -tail-predication=disabled -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; Disabling the low-overhead branch extension will make
; 'isHardwareLoopProfitable' return false, so that we test avoiding folding for
; these cases.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob \
; RUN:   -tail-predication=enabled -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -tail-predication=enabled -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -prefer-predicate-over-epilog=false \
; RUN:   -tail-predication=enabled -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -prefer-predicate-over-epilog=true \
; RUN:   -tail-predication=enabled -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,FOLDING-OPT
define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: prefer_folding(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
;
; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
; NO-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %for.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

define void @mixed_types(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32* noalias nocapture %D, i32* noalias nocapture readonly %E, i32* noalias nocapture readonly %F) #0 {
; CHECK-LABEL: mixed_types(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
; PREFER-FOLDING: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
; PREFER-FOLDING: call void @llvm.masked.store.v4i16.p0v4i16
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.018 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %B, i32 %i.018
  %0 = load i16, i16* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds i16, i16* %C, i32 %i.018
  %1 = load i16, i16* %arrayidx1, align 2
  %add = add i16 %1, %0
  %arrayidx4 = getelementptr inbounds i16, i16* %A, i32 %i.018
  store i16 %add, i16* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds i32, i32* %E, i32 %i.018
  %2 = load i32, i32* %arrayidx5, align 4
  %arrayidx6 = getelementptr inbounds i32, i32* %F, i32 %i.018
  %3 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 %3, %2
  %arrayidx8 = getelementptr inbounds i32, i32* %D, i32 %i.018
  store i32 %add7, i32* %arrayidx8, align 4
  %add9 = add nuw nsw i32 %i.018, 1
  %exitcond = icmp eq i32 %add9, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

define void @zero_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: zero_extending_load_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %conv
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

define void @sign_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: sign_extending_load_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
  %0 = load i8, i8* %arrayidx, align 1
  %conv = sext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %conv
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

define void @narrowing_store_allowed(i8* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: narrowing_store_allowed(
; PREFER-FOLDING: call void @llvm.masked.store.v4i8.p0v4i8
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %conv = trunc i32 %add to i8
  %arrayidx2 = getelementptr inbounds i8, i8* %A, i32 %i.09
  store i8 %conv, i8* %arrayidx2, align 1
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

@tab = common global [32 x i8] zeroinitializer, align 1

define i32 @icmp_not_allowed() #0 {
; CHECK-LABEL: icmp_not_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.body:
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
  %0 = load i8, i8* %arrayidx, align 1
  %cmp1 = icmp eq i8 %0, 0
  %. = select i1 %cmp1, i8 2, i8 1
  store i8 %., i8* %arrayidx, align 1
  %inc = add nsw i32 %i.08, 1
  %exitcond = icmp slt i32 %inc, 1000
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret i32 0
}

@ftab = common global [32 x float] zeroinitializer, align 1

define float @fcmp_not_allowed() #0 {
; CHECK-LABEL: fcmp_not_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.body:
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %arrayidx = getelementptr inbounds [32 x float], [32 x float]* @ftab, i32 0, i32 %i.08
  %0 = load float, float* %arrayidx, align 4
  %cmp1 = fcmp oeq float %0, 0.000000e+00
  %. = select i1 %cmp1, float 2.000000e+00, float 1.000000e+00
  store float %., float* %arrayidx, align 4
  %inc = add nsw i32 %i.08, 1
  %exitcond = icmp slt i32 %inc, 999
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret float 0.000000e+00
}

define void @pragma_vect_predicate_disable(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: pragma_vect_predicate_disable(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !7
}

define void @stride_4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: stride_4(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 4
  %cmp = icmp ult i32 %add3, 731
  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !5
}

define dso_local void @half(half* noalias nocapture %A, half* noalias nocapture readonly %B, half* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: half(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
; PREFER-FOLDING: call void @llvm.masked.store.v8f16.p0v8f16
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
  %0 = load half, half* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds half, half* %C, i32 %i.09
  %1 = load half, half* %arrayidx1, align 2
  %add = fadd fast half %1, %0
  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
  store half %add, half* %arrayidx2, align 2
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: float(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: %index.next = add i32 %index, 4
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
  store float %add, float* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}

define void @fpext_allowed(float* noalias nocapture %A, half* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: fpext_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
  %0 = load half, half* %arrayidx, align 2
  %conv = fpext half %0 to float
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %conv
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
  store float %add, float* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

define void @fptrunc_allowed(half* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: fptrunc_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %conv = fptrunc float %add to half
  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
  store half %conv, half* %arrayidx2, align 2
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }

!5 = distinct !{!5, !6}
!6 = !{!"llvm.loop.vectorize.enable", i1 true}

!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}

!10 = distinct !{!10, !11}
!11 = !{!"llvm.loop.vectorize.width", i32 4}