; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"

; Vectorized subsets of the load/store chains in the presence of
; interleaved loads/stores

; Access order: load ptr[1], load ptr[0], store ptr[1], store ptr[0],
; load ptr[1], load ptr[2].
; CHECK-LABEL: @interleave_2L_2S(
; CHECK: load <2 x i32>
; CHECK: load i32
; CHECK: store <2 x i32>
; CHECK: load i32
define void @interleave_2L_2S(i32* noalias %ptr) {
  %next.gep = getelementptr i32, i32* %ptr, i64 0
  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
  %next.gep2 = getelementptr i32, i32* %ptr, i64 2

  %l1 = load i32, i32* %next.gep1, align 4
  %l2 = load i32, i32* %next.gep, align 4
  store i32 0, i32* %next.gep1, align 4
  store i32 0, i32* %next.gep, align 4
  %l3 = load i32, i32* %next.gep1, align 4
  %l4 = load i32, i32* %next.gep2, align 4

  ret void
}

; Same accesses as @interleave_2L_2S, but the two leading loads are issued
; in ascending address order (ptr[0] first, then ptr[1]).
; CHECK-LABEL: @interleave_3L_2S_1L(
; CHECK: load <3 x i32>
; CHECK: store <2 x i32>
; CHECK: load i32

define void @interleave_3L_2S_1L(i32* noalias %ptr) {
  %next.gep = getelementptr i32, i32* %ptr, i64 0
  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
  %next.gep2 = getelementptr i32, i32* %ptr, i64 2

  %l2 = load i32, i32* %next.gep, align 4
  %l1 = load i32, i32* %next.gep1, align 4
  store i32 0, i32* %next.gep1, align 4
  store i32 0, i32* %next.gep, align 4
  %l3 = load i32, i32* %next.gep1, align 4
  %l4 = load i32, i32* %next.gep2, align 4

  ret void
}

; Access order: load ptr[0], store ptr[1], store ptr[0], load ptr[1],
; load ptr[2]. Only the two trailing loads are expected to be combined.
; CHECK-LABEL: @chain_suffix(
; CHECK: load i32
; CHECK: store <2 x i32>
; CHECK: load <2 x i32>
define void @chain_suffix(i32* noalias %ptr) {
  %next.gep = getelementptr i32, i32* %ptr, i64 0
  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
  %next.gep2 = getelementptr i32, i32* %ptr, i64 2

  %l2 = load i32, i32* %next.gep, align 4
  store i32 0, i32* %next.gep1, align 4
  store i32 0, i32* %next.gep, align 4
  %l3 = load i32, i32* %next.gep1, align 4
  %l4 = load i32, i32* %next.gep2, align 4

  ret void
}


; Access order: load ptr[0], load ptr[1], store ptr[1], store ptr[2],
; load ptr[1], load ptr[2], load ptr[3]. Loads on both sides of the stores
; are expected to be combined into separate vector loads.
; CHECK-LABEL: @chain_prefix_suffix(
; CHECK: load <2 x i32>
; CHECK: store <2 x i32>
; CHECK: load <3 x i32>
define void @chain_prefix_suffix(i32* noalias %ptr) {
  %next.gep = getelementptr i32, i32* %ptr, i64 0
  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
  %next.gep3 = getelementptr i32, i32* %ptr, i64 3

  %l1 = load i32, i32* %next.gep, align 4
  %l2 = load i32, i32* %next.gep1, align 4
  store i32 0, i32* %next.gep1, align 4
  store i32 0, i32* %next.gep2, align 4
  %l3 = load i32, i32* %next.gep1, align 4
  %l4 = load i32, i32* %next.gep2, align 4
  %l5 = load i32, i32* %next.gep3, align 4

  ret void
}

; FIXME: If the chain is too long and TLI says misaligned is not fast,
; then LSV fails to vectorize anything in that chain.
; To reproduce below, add a tmp5 (ptr+4) and load tmp5 into l6 and l7.

; Access order: load ptr[1], load ptr[0], store ptr[1], store ptr[0],
; load ptr[1], load ptr[2], then three loads of ptr[3].
; The label pattern ends in '(' like the other tests in this file so that
; it can only match this exact function name.
; CHECK-LABEL: @interleave_get_longest(
; CHECK: load <3 x i32>
; CHECK: load i32
; CHECK: store <2 x i32> zeroinitializer
; CHECK: load i32
; CHECK: load i32
; CHECK: load i32

define void @interleave_get_longest(i32* noalias %ptr) {
  %tmp1 = getelementptr i32, i32* %ptr, i64 0
  %tmp2 = getelementptr i32, i32* %ptr, i64 1
  %tmp3 = getelementptr i32, i32* %ptr, i64 2
  %tmp4 = getelementptr i32, i32* %ptr, i64 3

  %l1 = load i32, i32* %tmp2, align 4
  %l2 = load i32, i32* %tmp1, align 4
  store i32 0, i32* %tmp2, align 4
  store i32 0, i32* %tmp1, align 4
  %l3 = load i32, i32* %tmp2, align 4
  %l4 = load i32, i32* %tmp3, align 4
  %l5 = load i32, i32* %tmp4, align 4
  %l6 = load i32, i32* %tmp4, align 4
  %l7 = load i32, i32* %tmp4, align 4

  ret void
}