; Checks when the loop vectorizer picks tail-folding (a predicated vector body
; built from masked loads/stores) for Armv8.1-M MVE. The configurations below
; toggle MVE itself, masked load/store support and MVE tail-predication.

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=-mve \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=false -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
; RUN:   -disable-mve-tail-predication=true -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; Disabling the low-overhead branch extension will make
; 'isHardwareLoopProfitable' return false, so that we test avoiding folding for
; these cases.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -prefer-predicate-over-epilog=false \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -prefer-predicate-over-epilog=true \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,FOLDING-OPT

; Baseline: A[i] = B[i] + C[i] on i32 for i in [0, 431). When folding is
; preferred the vector body uses two masked loads and a masked store; in the
; no-folding configurations no masked load/store calls may appear before the
; scalar loop's back-edge.
define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: prefer_folding(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
;
; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
; NO-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %for.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Two independent element-wise adds in one loop body, one on i16 arrays and one
; on i32 arrays; masked accesses are expected for both element types.
define void @mixed_types(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32* noalias nocapture %D, i32* noalias nocapture readonly %E, i32* noalias nocapture readonly %F) #0 {
; CHECK-LABEL: mixed_types(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
; PREFER-FOLDING: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
; PREFER-FOLDING: call void @llvm.masked.store.v4i16.p0v4i16
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.018 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %B, i32 %i.018
  %0 = load i16, i16* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds i16, i16* %C, i32 %i.018
  %1 = load i16, i16* %arrayidx1, align 2
  %add = add i16 %1, %0
  %arrayidx4 = getelementptr inbounds i16, i16* %A, i32 %i.018
  store i16 %add, i16* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds i32, i32* %E, i32 %i.018
  %2 = load i32, i32* %arrayidx5, align 4
  %arrayidx6 = getelementptr inbounds i32, i32* %F, i32 %i.018
  %3 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 %3, %2
  %arrayidx8 = getelementptr inbounds i32, i32* %D, i32 %i.018
  store i32 %add7, i32* %arrayidx8, align 4
  %add9 = add nuw nsw i32 %i.018, 1
  %exitcond = icmp eq i32 %add9, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; i64 elements: neither a vector body nor masked accesses are expected; only the
; scalar for.body must remain.
define void @unsupported_i64_type(i64* noalias nocapture %A, i64* noalias nocapture readonly %B, i64* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: unsupported_i64_type(
; PREFER-FOLDING-NOT: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: for.body:
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i64, i64* %B, i32 %i.09
  %0 = load i64, i64* %arrayidx, align 8
  %arrayidx1 = getelementptr inbounds i64, i64* %C, i32 %i.09
  %1 = load i64, i64* %arrayidx1, align 8
  %add = add nsw i64 %1, %0
  %arrayidx2 = getelementptr inbounds i64, i64* %A, i32 %i.09
  store i64 %add, i64* %arrayidx2, align 8
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; The i8 load is zero-extended to i32 before the add; masked accesses are still
; expected (a <4 x i8> masked load feeding i32 arithmetic).
define void @zero_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: zero_extending_load_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %conv
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Same as the zero-extending case, but the i8 load is sign-extended to i32.
define void @sign_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: sign_extending_load_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
  %0 = load i8, i8* %arrayidx, align 1
  %conv = sext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %conv
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; The i16 load is truncated to i8 before the add. By default a vector body
; without masked accesses is expected; in the run that passes
; -prefer-predicate-over-epilog=true, masked 8-lane accesses are expected.
define void @narrowing_load_not_allowed(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, i16* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: narrowing_load_not_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body

; FOLDING-OPT: vector.body:
; FOLDING-OPT: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16
; FOLDING-OPT: call <8 x i8> @llvm.masked.load.v8i8.p0v8i8
; FOLDING-OPT: call void @llvm.masked.store.v8i8.p0v8i8
; FOLDING-OPT: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %i.012 = phi i32 [ 0, %entry ], [ %add6, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %C, i32 %i.012
  %0 = load i16, i16* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.012
  %1 = load i8, i8* %arrayidx1, align 1
  %conv3 = trunc i16 %0 to i8
  %add = add i8 %1, %conv3
  %arrayidx5 = getelementptr inbounds i8, i8* %A, i32 %i.012
  store i8 %add, i8* %arrayidx5, align 1
  %add6 = add nuw nsw i32 %i.012, 1
  %exitcond = icmp eq i32 %add6, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; The i32 sum is truncated to i8 only to feed the store; a masked <4 x i8>
; store is expected.
define void @narrowing_store_allowed(i8* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: narrowing_store_allowed(
; PREFER-FOLDING: call void @llvm.masked.store.v4i8.p0v4i8
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %conv = trunc i32 %add to i8
  %arrayidx2 = getelementptr inbounds i8, i8* %A, i32 %i.09
  store i8 %conv, i8* %arrayidx2, align 1
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; This is a trunc not connected to a store, so we don't allow this.
; TODO: this is conservative, because the trunc is only used in the
; loop control statements, and thus not affecting element sizes, so
; we could allow this case.
define void @trunc_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: trunc_not_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1

  %add.iv = trunc i32 %add3 to i16

  %exitcond = icmp eq i16 %add.iv, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; The i32 sum is stored as-is to A and also truncated to i16, shifted and stored
; to D, so the trunc result feeds arithmetic rather than a store directly; a
; vector body without masked accesses is expected.
define void @trunc_not_allowed_different_vec_elemns(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i16* noalias nocapture %D) #0 {
; CHECK-LABEL: trunc_not_allowed_different_vec_elemns(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.021 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.021
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.021
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.021
  store i32 %add, i32* %arrayidx2, align 4
  %add.tr = trunc i32 %add to i16
  %conv7 = shl i16 %add.tr, 1
  %arrayidx8 = getelementptr inbounds i16, i16* %D, i32 %i.021
  store i16 %conv7, i16* %arrayidx8, align 2
  %add9 = add nuw nsw i32 %i.021, 1
  %exitcond = icmp eq i32 %add9, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}


; NOTE(review): the loops below iterate far past the 32 elements declared for
; @tab and @ftab; presumably harmless for an opt-only test, but worth confirming.
@tab = common global [32 x i8] zeroinitializer, align 1

; The loop body contains an icmp on the loaded i8 value (feeding a select); a
; vector body without masked accesses is expected.
define i32 @icmp_not_allowed() #0 {
; CHECK-LABEL: icmp_not_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.body:
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
  %0 = load i8, i8* %arrayidx, align 1
  %cmp1 = icmp eq i8 %0, 0
  %. = select i1 %cmp1, i8 2, i8 1
  store i8 %., i8* %arrayidx, align 1
  %inc = add nsw i32 %i.08, 1
  %exitcond = icmp slt i32 %inc, 1000
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret i32 0
}

; Aligned to 4 so that the align-4 float loads/stores in fcmp_not_allowed do not
; claim more alignment than the global provides.
@ftab = common global [32 x float] zeroinitializer, align 4

; Floating-point variant of the compare case: an fcmp on the loaded float
; feeding a select; a vector body without masked accesses is expected.
define float @fcmp_not_allowed() #0 {
; CHECK-LABEL: fcmp_not_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.body:
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %arrayidx = getelementptr inbounds [32 x float], [32 x float]* @ftab, i32 0, i32 %i.08
  %0 = load float, float* %arrayidx, align 4
  %cmp1 = fcmp oeq float %0, 0.000000e+00
  %. = select i1 %cmp1, float 2.000000e+00, float 1.000000e+00
  store float %., float* %arrayidx, align 4
  %inc = add nsw i32 %i.08, 1
  %exitcond = icmp slt i32 %inc, 999
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret float 0.000000e+00
}

; Same i32 add loop as the baseline, but the back-edge carries loop metadata !7
; (llvm.loop.vectorize.predicate.enable = false); masked accesses must not
; appear in the vector body.
define void @pragma_vect_predicate_disable(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: pragma_vect_predicate_disable(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !7
}

; Test directions for array indices i and N-i. I.e. check strides 1 and -1, and
; force vectorisation with a loop hint.
define void @strides_different_direction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) #0 {
; CHECK-LABEL: strides_different_direction(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %sub = sub nsw i32 %N, %i.09
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %sub
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}

; The induction variable advances by 4 per iteration and vectorisation is
; requested through loop metadata !5; a vector body without masked accesses is
; expected.
define void @stride_4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: stride_4(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 4
  %cmp = icmp ult i32 %add3, 731
  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !5
}

; The loop is split over two blocks (for.body and loopincr); a vector body
; without masked accesses is expected.
define void @too_many_loop_blocks(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: too_many_loop_blocks(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %loopincr ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  br label %loopincr

loopincr:
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Element-wise fadd on half; masked <8 x half> accesses are expected.
define dso_local void @half(half* noalias nocapture %A, half* noalias nocapture readonly %B, half* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: half(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
; PREFER-FOLDING: call void @llvm.masked.store.v8f16.p0v8f16
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
  %0 = load half, half* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds half, half* %C, i32 %i.09
  %1 = load half, half* %arrayidx1, align 2
  %add = fadd fast half %1, %0
  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
  store half %add, half* %arrayidx2, align 2
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Element-wise fadd on float with a vector width of 4 requested through loop
; metadata !10; masked <4 x float> accesses are expected.
define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: float(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
  store float %add, float* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}

; Element-wise fadd on double; no vector body is expected, only the scalar loop.
; The absence check is placed before the for.body match (the same order as in
; unsupported_i64_type) so that it covers the text between the function label
; and the scalar loop; placed after that match it would only cover what follows
; for.body.
define void @double(double* noalias nocapture %A, double* noalias nocapture readonly %B, double* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: double(
; PREFER-FOLDING-NOT: vector.body:
; PREFER-FOLDING: for.body:
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds double, double* %B, i32 %i.09
  %0 = load double, double* %arrayidx, align 8
  %arrayidx1 = getelementptr inbounds double, double* %C, i32 %i.09
  %1 = load double, double* %arrayidx1, align 8
  %add = fadd fast double %1, %0
  %arrayidx2 = getelementptr inbounds double, double* %A, i32 %i.09
  store double %add, double* %arrayidx2, align 8
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; TODO: this fpext could be allowed, but we don't lower it very efficiently yet,
; so reject this for now.
define void @fpext_allowed(float* noalias nocapture %A, half* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: fpext_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
  %0 = load half, half* %arrayidx, align 2
  %conv = fpext half %0 to float
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %conv
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
  store float %add, float* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; TODO: this fptrunc could be allowed, but we don't lower it very efficiently yet,
; so reject this for now.
define void @fptrunc_allowed(half* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: fptrunc_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %conv = fptrunc float %add to half
  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
  store half %conv, half* %arrayidx2, align 2
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; The float sum is stored to A and also fptrunc'ed to half, scaled by an fmul
; and stored to D, so the fptrunc result feeds arithmetic rather than a store
; directly; a vector body without masked accesses is expected.
define void @fptrunc_not_allowed(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C, half* noalias nocapture %D) #0 {
; CHECK-LABEL: fptrunc_not_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.017 = phi i32 [ 0, %entry ], [ %add6, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.017
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.017
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.017
  store float %add, float* %arrayidx2, align 4
  %conv = fptrunc float %add to half
  %factor = fmul fast half %conv, 0xH4000
  %arrayidx5 = getelementptr inbounds half, half* %D, i32 %i.017
  store half %factor, half* %arrayidx5, align 2
  %add6 = add nuw nsw i32 %i.017, 1
  %exitcond = icmp eq i32 %add6, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Every function in this file carries these attributes, so MVE with
; floating-point is enabled unless a RUN line overrides it with -mattr.
attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }

; Loop hint: request vectorisation.
!5 = distinct !{!5, !6}
!6 = !{!"llvm.loop.vectorize.enable", i1 true}

; Loop hint: disable predication (tail-folding) for the loop.
!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}

; Loop hint: request a vector width of 4.
!10 = distinct !{!10, !11}
!11 = !{!"llvm.loop.vectorize.width", i32 4}