; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=-mve \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=false -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
; RUN:   -disable-mve-tail-predication=true -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; Disabling the low-overhead branch extension will make
; 'isHardwareLoopProfitable' return false, so that we test avoiding folding for
; these cases.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -prefer-predicate-over-epilog=false \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -prefer-predicate-over-epilog=true \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,FOLDING-OPT

; Baseline case: a single-block i32 loop with a constant trip count of 431.
; The folding runs expect a vector body whose masked loads and store are
; guarded by the active lane mask; the no-folding runs expect no masked memory
; intrinsics and a branch back to the scalar for.body.
define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: prefer_folding(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
;
; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
; NO-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %for.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; i16 and i32 element types in the same loop: the folding runs still expect
; masked loads and stores, all of them four lanes wide.
define void @mixed_types(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32* noalias nocapture %D, i32* noalias nocapture readonly %E, i32* noalias nocapture readonly %F) #0 {
; CHECK-LABEL: mixed_types(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
; PREFER-FOLDING: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
; PREFER-FOLDING: call void @llvm.masked.store.v4i16.p0v4i16
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.018 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %B, i32 %i.018
  %0 = load i16, i16* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds i16, i16* %C, i32 %i.018
  %1 = load i16, i16* %arrayidx1, align 2
  %add = add i16 %1, %0
  %arrayidx4 = getelementptr inbounds i16, i16* %A, i32 %i.018
  store i16 %add, i16* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds i32, i32* %E, i32 %i.018
  %2 = load i32, i32* %arrayidx5, align 4
  %arrayidx6 = getelementptr inbounds i32, i32* %F, i32 %i.018
  %3 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 %3, %2
  %arrayidx8 = getelementptr inbounds i32, i32* %D, i32 %i.018
  store i32 %add7, i32* %arrayidx8, align 4
  %add9 = add nuw nsw i32 %i.018, 1
  %exitcond = icmp eq i32 %add9, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; i64 elements: the folding runs expect neither a vector body nor masked
; intrinsics before the scalar for.body.
define void @unsupported_i64_type(i64* noalias nocapture %A, i64* noalias nocapture readonly %B, i64* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: unsupported_i64_type(
; PREFER-FOLDING-NOT: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: for.body:
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i64, i64* %B, i32 %i.09
  %0 = load i64, i64* %arrayidx, align 8
  %arrayidx1 = getelementptr inbounds i64, i64* %C, i32 %i.09
  %1 = load i64, i64* %arrayidx1, align 8
  %add = add nsw i64 %1, %0
  %arrayidx2 = getelementptr inbounds i64, i64* %A, i32 %i.09
  store i64 %add, i64* %arrayidx2, align 8
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; An i8 load that is zero-extended to i32 before the add: masked loads and a
; masked store are still expected in the folding runs.
define void @zero_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: zero_extending_load_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %conv
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Same loop as above, but the i8 load is sign-extended to i32.
define void @sign_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: sign_extending_load_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
  %0 = load i8, i8* %arrayidx, align 1
  %conv = sext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %conv
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; An i16 load whose value is truncated to i8. The default folding runs expect
; a vector body without masked intrinsics; the run that passes
; -prefer-predicate-over-epilog=true expects eight-lane masked loads and store.
define void @narrowing_load_not_allowed(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, i16* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: narrowing_load_not_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body

; FOLDING-OPT: vector.body:
; FOLDING-OPT: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16
; FOLDING-OPT: call <8 x i8> @llvm.masked.load.v8i8.p0v8i8
; FOLDING-OPT: call void @llvm.masked.store.v8i8.p0v8i8
; FOLDING-OPT: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %i.012 = phi i32 [ 0, %entry ], [ %add6, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %C, i32 %i.012
  %0 = load i16, i16* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.012
  %1 = load i8, i8* %arrayidx1, align 1
  %conv3 = trunc i16 %0 to i8
  %add = add i8 %1, %conv3
  %arrayidx5 = getelementptr inbounds i8, i8* %A, i32 %i.012
  store i8 %add, i8* %arrayidx5, align 1
  %add6 = add nuw nsw i32 %i.012, 1
  %exitcond = icmp eq i32 %add6, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; The i32 sum is truncated to i8 only to feed the store; a masked i8 store is
; expected in the folding runs.
define void @narrowing_store_allowed(i8* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: narrowing_store_allowed(
; PREFER-FOLDING: call void @llvm.masked.store.v4i8.p0v4i8
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %conv = trunc i32 %add to i8
  %arrayidx2 = getelementptr inbounds i8, i8* %A, i32 %i.09
  store i8 %conv, i8* %arrayidx2, align 1
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; This is a trunc not connected to a store, so we don't allow this.
; TODO: this is conservative, because the trunc is only used in the
; loop control statements, and thus not affecting element sizes, so
; we could allow this case.
define void @trunc_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: trunc_not_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1

  %add.iv = trunc i32 %add3 to i16

  %exitcond = icmp eq i16 %add.iv, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; The i32 sum is stored as-is and also truncated to i16, shifted, and stored to
; a second array, so the loop writes two different element widths. No masked
; intrinsics are expected in the folding runs.
define void @trunc_not_allowed_different_vec_elemns(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i16* noalias nocapture %D) #0 {
; CHECK-LABEL: trunc_not_allowed_different_vec_elemns(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.021 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.021
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.021
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.021
  store i32 %add, i32* %arrayidx2, align 4
  %add.tr = trunc i32 %add to i16
  %conv7 = shl i16 %add.tr, 1
  %arrayidx8 = getelementptr inbounds i16, i16* %D, i32 %i.021
  store i16 %conv7, i16* %arrayidx8, align 2
  %add9 = add nuw nsw i32 %i.021, 1
  %exitcond = icmp eq i32 %add9, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}


@tab = common global [32 x i8] zeroinitializer, align 1

; The loop body compares the loaded i8 against zero and selects the value to
; store back; no masked intrinsics are expected in the folding runs.
define i32 @icmp_not_allowed() #0 {
; CHECK-LABEL: icmp_not_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.body:
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
  %0 = load i8, i8* %arrayidx, align 1
  %cmp1 = icmp eq i8 %0, 0
  %. = select i1 %cmp1, i8 2, i8 1
  store i8 %., i8* %arrayidx, align 1
  %inc = add nsw i32 %i.08, 1
  %exitcond = icmp slt i32 %inc, 1000
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret i32 0
}

@ftab = common global [32 x float] zeroinitializer, align 1

; Floating-point counterpart of the loop above: an fcmp feeds the select.
define float @fcmp_not_allowed() #0 {
; CHECK-LABEL: fcmp_not_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.body:
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %arrayidx = getelementptr inbounds [32 x float], [32 x float]* @ftab, i32 0, i32 %i.08
  %0 = load float, float* %arrayidx, align 4
  %cmp1 = fcmp oeq float %0, 0.000000e+00
  %. = select i1 %cmp1, float 2.000000e+00, float 1.000000e+00
  store float %., float* %arrayidx, align 4
  %inc = add nsw i32 %i.08, 1
  %exitcond = icmp slt i32 %inc, 999
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret float 0.000000e+00
}

; Same loop as @prefer_folding, but its loop metadata (!7) sets
; llvm.loop.vectorize.predicate.enable to false, so no masked intrinsics are
; expected even in the folding runs.
define void @pragma_vect_predicate_disable(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: pragma_vect_predicate_disable(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !7
}

; Test directions for array indices i and N-i. I.e. check strides 1 and -1, and
; force vectorisation with a loop hint.
define void @strides_different_direction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) #0 {
; CHECK-LABEL: strides_different_direction(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %sub = sub nsw i32 %N, %i.09
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %sub
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}

; The induction variable advances by 4 per iteration; vectorisation is
; requested through loop metadata !5. No masked intrinsics are expected.
define void @stride_4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: stride_4(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 4
  %cmp = icmp ult i32 %add3, 731
  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !5
}

; The loop is split over two blocks (for.body and loopincr); no masked
; intrinsics are expected in the folding runs.
define void @too_many_loop_blocks(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: too_many_loop_blocks(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %loopincr ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  br label %loopincr

loopincr:
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; half elements: the folding runs expect eight-lane masked loads and store.
define dso_local void @half(half* noalias nocapture %A, half* noalias nocapture readonly %B, half* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: half(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
; PREFER-FOLDING: call void @llvm.masked.store.v8f16.p0v8f16
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
  %0 = load half, half* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds half, half* %C, i32 %i.09
  %1 = load half, half* %arrayidx1, align 2
  %add = fadd fast half %1, %0
  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
  store half %add, half* %arrayidx2, align 2
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; float elements with a vectorisation width of 4 requested through loop
; metadata !10: the folding runs expect four-lane masked loads and store
; guarded by the active lane mask, and an index step of 4.
define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: float(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: %index.next = add i32 %index, 4
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
  store float %add, float* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}

; double elements: no vector body is expected. The negative check is placed
; before the for.body match (as in @unsupported_i64_type) because a vector body,
; if one were generated, would be emitted ahead of the scalar loop; a negative
; check placed after the last positive match would never see it.
define void @double(double* noalias nocapture %A, double* noalias nocapture readonly %B, double* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: double(
; PREFER-FOLDING-NOT: vector.body:
; PREFER-FOLDING: for.body:
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds double, double* %B, i32 %i.09
  %0 = load double, double* %arrayidx, align 8
  %arrayidx1 = getelementptr inbounds double, double* %C, i32 %i.09
  %1 = load double, double* %arrayidx1, align 8
  %add = fadd fast double %1, %0
  %arrayidx2 = getelementptr inbounds double, double* %A, i32 %i.09
  store double %add, double* %arrayidx2, align 8
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; TODO: this fpext could be allowed, but we don't lower it very efficiently yet,
; so reject this for now.
; A half load is widened with fpext before the float add. The folding runs
; currently expect a vector body without masked loads or stores.
define void @fpext_allowed(float* noalias nocapture %A, half* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: fpext_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
  %0 = load half, half* %arrayidx, align 2
  %conv = fpext half %0 to float
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %conv
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
  store float %add, float* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; TODO: this fptrunc could be allowed, but we don't lower it very efficiently yet,
; so reject this for now.
; The float sum is narrowed with fptrunc only to feed the half store. The
; folding runs currently expect a vector body without masked loads or stores.
define void @fptrunc_allowed(half* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: fptrunc_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %conv = fptrunc float %add to half
  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
  store half %conv, half* %arrayidx2, align 2
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; The float sum is stored as-is and also narrowed with fptrunc, scaled, and
; stored as half to a second array, so the fptrunc result is used by arithmetic
; rather than feeding a store directly. No masked intrinsics are expected.
define void @fptrunc_not_allowed(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C, half* noalias nocapture %D) #0 {
; CHECK-LABEL: fptrunc_not_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.017 = phi i32 [ 0, %entry ], [ %add6, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.017
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.017
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.017
  store float %add, float* %arrayidx2, align 4
  %conv = fptrunc float %add to half
  %factor = fmul fast half %conv, 0xH4000
  %arrayidx5 = getelementptr inbounds half, half* %D, i32 %i.017
  store half %factor, half* %arrayidx5, align 2
  %add6 = add nuw nsw i32 %i.017, 1
  %exitcond = icmp eq i32 %add6, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Every function in this file carries the MVE floating-point target features.
attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }

; Loop hint: enable vectorisation.
!5 = distinct !{!5, !6}
!6 = !{!"llvm.loop.vectorize.enable", i1 true}

; Loop hint: disable predicated (tail-folded) vectorisation.
!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}

; Loop hint: vectorisation width of 4.
!10 = distinct !{!10, !11}
!11 = !{!"llvm.loop.vectorize.width", i32 4}