1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
3; RUN:   -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
4; RUN:   FileCheck %s --check-prefix=CHECK-LE
5; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
6; RUN:   -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
7; RUN:   FileCheck %s --check-prefix=CHECK-BE
8; RUN: opt --passes=sroa,loop-vectorize,loop-unroll,instcombine -S \
9; RUN: -vectorizer-maximize-bandwidth --mtriple=powerpc64le-- -mcpu=pwr10 < %s | \
10; RUN: FileCheck %s --check-prefix=CHECK-OPT
11
; Module data layout: little-endian ("e"), 128-bit natural stack alignment
; (S128), and 256-/512-bit vectors whose ABI and preferred alignments equal
; their size (v256:256:256, v512:512:512).
12target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512"
13
; test_32byte_vector: places an <8 x i32> (32 bytes) in a 32-byte-aligned stack
; slot, fills it with the constants 1..8, hands its address to @test, then
; returns element 0 of the reloaded vector as a sign-extended i32.
;
; In the expected assembly for both llc runs the prologue derives the new stack
; pointer from the low five bits of r1 (clrldi r0, r1, 59 / subfic / stdux),
; i.e. the frame is realigned to 32 bytes, and the incoming r1 is kept in r30
; so the epilogue can restore it with "mr r1, r30".
14define dso_local signext i32 @test_32byte_vector() nounwind {
15; CHECK-LE-LABEL: test_32byte_vector:
16; CHECK-LE:       # %bb.0: # %entry
17; CHECK-LE-NEXT:    mflr r0
18; CHECK-LE-NEXT:    std r30, -16(r1)
19; CHECK-LE-NEXT:    mr r30, r1
20; CHECK-LE-NEXT:    std r0, 16(r1)
21; CHECK-LE-NEXT:    clrldi r0, r1, 59
22; CHECK-LE-NEXT:    subfic r0, r0, -96
23; CHECK-LE-NEXT:    stdux r1, r1, r0
24; CHECK-LE-NEXT:    addis r3, r2, .LCPI0_0@toc@ha
25; CHECK-LE-NEXT:    addis r4, r2, .LCPI0_1@toc@ha
26; CHECK-LE-NEXT:    addi r3, r3, .LCPI0_0@toc@l
27; CHECK-LE-NEXT:    addi r4, r4, .LCPI0_1@toc@l
28; CHECK-LE-NEXT:    lxvd2x vs0, 0, r3
29; CHECK-LE-NEXT:    lxvd2x vs1, 0, r4
30; CHECK-LE-NEXT:    addi r4, r1, 48
31; CHECK-LE-NEXT:    addi r3, r1, 32
32; CHECK-LE-NEXT:    stxvd2x vs0, 0, r4
33; CHECK-LE-NEXT:    stxvd2x vs1, 0, r3
34; CHECK-LE-NEXT:    bl test
35; CHECK-LE-NEXT:    nop
36; CHECK-LE-NEXT:    lwa r3, 32(r1)
37; CHECK-LE-NEXT:    mr r1, r30
38; CHECK-LE-NEXT:    ld r0, 16(r1)
39; CHECK-LE-NEXT:    ld r30, -16(r1)
40; CHECK-LE-NEXT:    mtlr r0
41; CHECK-LE-NEXT:    blr
42;
43; CHECK-BE-LABEL: test_32byte_vector:
44; CHECK-BE:       # %bb.0: # %entry
45; CHECK-BE-NEXT:    mflr r0
46; CHECK-BE-NEXT:    std r30, -16(r1)
47; CHECK-BE-NEXT:    std r0, 16(r1)
48; CHECK-BE-NEXT:    clrldi r0, r1, 59
49; CHECK-BE-NEXT:    mr r30, r1
50; CHECK-BE-NEXT:    subfic r0, r0, -192
51; CHECK-BE-NEXT:    stdux r1, r1, r0
52; CHECK-BE-NEXT:    lis r3, -8192
53; CHECK-BE-NEXT:    li r4, 5
54; CHECK-BE-NEXT:    lis r5, -16384
55; CHECK-BE-NEXT:    lis r6, -32768
56; CHECK-BE-NEXT:    ori r3, r3, 1
57; CHECK-BE-NEXT:    rldic r4, r4, 32, 29
58; CHECK-BE-NEXT:    ori r5, r5, 1
59; CHECK-BE-NEXT:    ori r6, r6, 1
60; CHECK-BE-NEXT:    rldic r3, r3, 3, 29
61; CHECK-BE-NEXT:    ori r4, r4, 6
62; CHECK-BE-NEXT:    rldic r5, r5, 2, 30
63; CHECK-BE-NEXT:    rldic r6, r6, 1, 31
64; CHECK-BE-NEXT:    std r3, 152(r1)
65; CHECK-BE-NEXT:    addi r3, r1, 128
66; CHECK-BE-NEXT:    std r4, 144(r1)
67; CHECK-BE-NEXT:    std r5, 136(r1)
68; CHECK-BE-NEXT:    std r6, 128(r1)
69; CHECK-BE-NEXT:    bl test
70; CHECK-BE-NEXT:    nop
71; CHECK-BE-NEXT:    lwa r3, 128(r1)
72; CHECK-BE-NEXT:    mr r1, r30
73; CHECK-BE-NEXT:    ld r0, 16(r1)
74; CHECK-BE-NEXT:    ld r30, -16(r1)
75; CHECK-BE-NEXT:    mtlr r0
76; CHECK-BE-NEXT:    blr
77entry:
  ; 32-byte vector local with an explicit 32-byte alignment requirement.
78  %a = alloca <8 x i32>, align 32
79  %0 = bitcast <8 x i32>* %a to i8*
80  call void @llvm.lifetime.start.p0i8(i64 32, i8* %0)
  ; Initialise the slot with the constant vector <1, 2, ..., 8>.
81  store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* %a, align 32
  ; The address is passed to an external function, so the slot must be
  ; materialised in memory and reloaded after the call.
82  call void @test(<8 x i32>* %a)
83  %1 = load <8 x i32>, <8 x i32>* %a, align 32
  ; Return value is lane 0 of the reloaded vector.
84  %vecext = extractelement <8 x i32> %1, i32 0
85  %2 = bitcast <8 x i32>* %a to i8*
86  call void @llvm.lifetime.end.p0i8(i64 32, i8* %2)
87  ret i32 %vecext
88}
89
; test_32byte_aligned_vector: same shape as the 32-byte vector test, but the
; local is only a 16-byte <4 x i32> that is over-aligned to 32 bytes. It is
; filled with the constants 1..4, passed by address to @test1, and element 0 of
; the reloaded vector is returned as a sign-extended i32.
;
; The expected assembly for both llc runs again contains the
; clrldi r0, r1, 59 / subfic / stdux prologue (32-byte frame realignment) and
; restores the stack pointer from r30 in the epilogue.
90define dso_local signext i32 @test_32byte_aligned_vector() nounwind {
91; CHECK-LE-LABEL: test_32byte_aligned_vector:
92; CHECK-LE:       # %bb.0: # %entry
93; CHECK-LE-NEXT:    mflr r0
94; CHECK-LE-NEXT:    std r30, -16(r1)
95; CHECK-LE-NEXT:    mr r30, r1
96; CHECK-LE-NEXT:    std r0, 16(r1)
97; CHECK-LE-NEXT:    clrldi r0, r1, 59
98; CHECK-LE-NEXT:    subfic r0, r0, -64
99; CHECK-LE-NEXT:    stdux r1, r1, r0
100; CHECK-LE-NEXT:    addis r3, r2, .LCPI1_0@toc@ha
101; CHECK-LE-NEXT:    addi r3, r3, .LCPI1_0@toc@l
102; CHECK-LE-NEXT:    lxvd2x vs0, 0, r3
103; CHECK-LE-NEXT:    addi r3, r1, 32
104; CHECK-LE-NEXT:    stxvd2x vs0, 0, r3
105; CHECK-LE-NEXT:    bl test1
106; CHECK-LE-NEXT:    nop
107; CHECK-LE-NEXT:    lwa r3, 32(r1)
108; CHECK-LE-NEXT:    mr r1, r30
109; CHECK-LE-NEXT:    ld r0, 16(r1)
110; CHECK-LE-NEXT:    ld r30, -16(r1)
111; CHECK-LE-NEXT:    mtlr r0
112; CHECK-LE-NEXT:    blr
113;
114; CHECK-BE-LABEL: test_32byte_aligned_vector:
115; CHECK-BE:       # %bb.0: # %entry
116; CHECK-BE-NEXT:    mflr r0
117; CHECK-BE-NEXT:    std r30, -16(r1)
118; CHECK-BE-NEXT:    std r0, 16(r1)
119; CHECK-BE-NEXT:    clrldi r0, r1, 59
120; CHECK-BE-NEXT:    mr r30, r1
121; CHECK-BE-NEXT:    subfic r0, r0, -160
122; CHECK-BE-NEXT:    stdux r1, r1, r0
123; CHECK-BE-NEXT:    lis r3, -16384
124; CHECK-BE-NEXT:    lis r4, -32768
125; CHECK-BE-NEXT:    ori r3, r3, 1
126; CHECK-BE-NEXT:    ori r4, r4, 1
127; CHECK-BE-NEXT:    rldic r3, r3, 2, 30
128; CHECK-BE-NEXT:    rldic r4, r4, 1, 31
129; CHECK-BE-NEXT:    std r3, 136(r1)
130; CHECK-BE-NEXT:    addi r3, r1, 128
131; CHECK-BE-NEXT:    std r4, 128(r1)
132; CHECK-BE-NEXT:    bl test1
133; CHECK-BE-NEXT:    nop
134; CHECK-BE-NEXT:    lwa r3, 128(r1)
135; CHECK-BE-NEXT:    mr r1, r30
136; CHECK-BE-NEXT:    ld r0, 16(r1)
137; CHECK-BE-NEXT:    ld r30, -16(r1)
138; CHECK-BE-NEXT:    mtlr r0
139; CHECK-BE-NEXT:    blr
140entry:
  ; 16-byte vector local whose alignment (32) exceeds its size.
141  %a = alloca <4 x i32>, align 32
142  %0 = bitcast <4 x i32>* %a to i8*
143  call void @llvm.lifetime.start.p0i8(i64 16, i8* %0)
  ; Initialise the slot with the constant vector <1, 2, 3, 4>.
144  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* %a, align 32
  ; Escaping the address forces the slot to live in memory across the call.
145  call void @test1(<4 x i32>* %a)
146  %1 = load <4 x i32>, <4 x i32>* %a, align 32
  ; Return value is lane 0 of the reloaded vector.
147  %vecext = extractelement <4 x i32> %1, i32 0
148  %2 = bitcast <4 x i32>* %a to i8*
149  call void @llvm.lifetime.end.p0i8(i64 16, i8* %2)
150  ret i32 %vecext
151}
152
153
; Zero-initialised 64-byte source array (byte-aligned) read by @test_Array.
154@Arr1 = dso_local global [64 x i8] zeroinitializer, align 1
155
; test_Array: unoptimised-style loop that widens each of the 64 bytes of @Arr1
; to i16 and stores it into the local array %Arr2 (alloca with align 2), then
; passes the address of %Arr2's first element to @test_arr.
;
; The opt run (sroa, loop-vectorize, loop-unroll, instcombine) asserts that the
; alloca keeps "align 2" and that a <16 x i16> store with "align 2" is emitted.
; The two llc runs assert a plain "stdu" frame with no realignment sequence.
;
; The stored value and the destination pointer of the vector store are captured
; under separate FileCheck names; neither is referenced again.
156define dso_local void @test_Array() nounwind {
157; CHECK-OPT-LABEL: @test_Array(
158; CHECK-OPT-NEXT: entry:
159; CHECK-OPT-NEXT: %Arr2 = alloca [64 x i16], align 2
160; CHECK-OPT: store <16 x i16> [[VAL:%.*]], <16 x i16>* [[PTR:%.*]], align 2
161; CHECK-LE-LABEL: test_Array:
162; CHECK-LE:       # %bb.0: # %entry
163; CHECK-LE-NEXT:    mflr r0
164; CHECK-LE-NEXT:    std r0, 16(r1)
165; CHECK-LE-NEXT:    stdu r1, -176(r1)
166; CHECK-LE-NEXT:    addis r4, r2, Arr1@toc@ha
167; CHECK-LE-NEXT:    li r3, 0
168; CHECK-LE-NEXT:    li r6, 65
169; CHECK-LE-NEXT:    addi r5, r1, 46
170; CHECK-LE-NEXT:    addi r4, r4, Arr1@toc@l
171; CHECK-LE-NEXT:    stw r3, 44(r1)
172; CHECK-LE-NEXT:    addi r4, r4, -1
173; CHECK-LE-NEXT:    mtctr r6
174; CHECK-LE-NEXT:    bdz .LBB2_2
175; CHECK-LE-NEXT:    .p2align 5
176; CHECK-LE-NEXT:  .LBB2_1: # %for.body
177; CHECK-LE-NEXT:    #
178; CHECK-LE-NEXT:    lbz r6, 1(r4)
179; CHECK-LE-NEXT:    addi r7, r5, 2
180; CHECK-LE-NEXT:    addi r4, r4, 1
181; CHECK-LE-NEXT:    addi r3, r3, 1
182; CHECK-LE-NEXT:    sth r6, 2(r5)
183; CHECK-LE-NEXT:    mr r5, r7
184; CHECK-LE-NEXT:    bdnz .LBB2_1
185; CHECK-LE-NEXT:  .LBB2_2: # %for.cond.cleanup
186; CHECK-LE-NEXT:    addi r3, r1, 48
187; CHECK-LE-NEXT:    bl test_arr
188; CHECK-LE-NEXT:    nop
189; CHECK-LE-NEXT:    addi r1, r1, 176
190; CHECK-LE-NEXT:    ld r0, 16(r1)
191; CHECK-LE-NEXT:    mtlr r0
192; CHECK-LE-NEXT:    blr
193;
194; CHECK-BE-LABEL: test_Array:
195; CHECK-BE:       # %bb.0: # %entry
196; CHECK-BE-NEXT:    mflr r0
197; CHECK-BE-NEXT:    std r0, 16(r1)
198; CHECK-BE-NEXT:    stdu r1, -256(r1)
199; CHECK-BE-NEXT:    addis r5, r2, Arr1@toc@ha
200; CHECK-BE-NEXT:    li r3, 0
201; CHECK-BE-NEXT:    addi r5, r5, Arr1@toc@l
202; CHECK-BE-NEXT:    addi r4, r1, 126
203; CHECK-BE-NEXT:    li r6, 65
204; CHECK-BE-NEXT:    stw r3, 124(r1)
205; CHECK-BE-NEXT:    addi r5, r5, -1
206; CHECK-BE-NEXT:    mtctr r6
207; CHECK-BE-NEXT:    bdz .LBB2_2
208; CHECK-BE-NEXT:  .LBB2_1: # %for.body
209; CHECK-BE-NEXT:    #
210; CHECK-BE-NEXT:    lbz r6, 1(r5)
211; CHECK-BE-NEXT:    addi r5, r5, 1
212; CHECK-BE-NEXT:    addi r3, r3, 1
213; CHECK-BE-NEXT:    sth r6, 2(r4)
214; CHECK-BE-NEXT:    addi r4, r4, 2
215; CHECK-BE-NEXT:    bdnz .LBB2_1
216; CHECK-BE-NEXT:  .LBB2_2: # %for.cond.cleanup
217; CHECK-BE-NEXT:    addi r3, r1, 128
218; CHECK-BE-NEXT:    bl test_arr
219; CHECK-BE-NEXT:    nop
220; CHECK-BE-NEXT:    addi r1, r1, 256
221; CHECK-BE-NEXT:    ld r0, 16(r1)
222; CHECK-BE-NEXT:    mtlr r0
223; CHECK-BE-NEXT:    blr
224entry:
  ; Destination array (64 x i16, 2-byte aligned) and the loop counter %i.
225  %Arr2 = alloca [64 x i16], align 2
226  %i = alloca i32, align 4
227  %0 = bitcast [64 x i16]* %Arr2 to i8*
228  call void @llvm.lifetime.start.p0i8(i64 128, i8* %0)
229  %1 = bitcast i32* %i to i8*
230  call void @llvm.lifetime.start.p0i8(i64 4, i8* %1)
  ; i = 0
231  store i32 0, i32* %i, align 4
232  br label %for.cond
233
234for.cond:                                         ; preds = %for.inc, %entry
  ; Loop while i < 64 (signed compare).
235  %2 = load i32, i32* %i, align 4
236  %cmp = icmp slt i32 %2, 64
237  br i1 %cmp, label %for.body, label %for.cond.cleanup
238
239for.cond.cleanup:                                 ; preds = %for.cond
  ; Loop exit: the counter's lifetime ends here.
240  %3 = bitcast i32* %i to i8*
241  call void @llvm.lifetime.end.p0i8(i64 4, i8* %3)
242  br label %for.end
243
244for.body:                                         ; preds = %for.cond
  ; Arr2[i] = zext(Arr1[i]); the counter is reloaded from memory for each use.
245  %4 = load i32, i32* %i, align 4
246  %idxprom = sext i32 %4 to i64
247  %arrayidx = getelementptr inbounds [64 x i8], [64 x i8]* @Arr1, i64 0, i64 %idxprom
248  %5 = load i8, i8* %arrayidx, align 1
249  %conv = zext i8 %5 to i16
250  %6 = load i32, i32* %i, align 4
251  %idxprom1 = sext i32 %6 to i64
252  %arrayidx2 = getelementptr inbounds [64 x i16], [64 x i16]* %Arr2, i64 0, i64 %idxprom1
253  store i16 %conv, i16* %arrayidx2, align 2
254  br label %for.inc
255
256for.inc:                                          ; preds = %for.body
  ; i = i + 1
257  %7 = load i32, i32* %i, align 4
258  %inc = add nsw i32 %7, 1
259  store i32 %inc, i32* %i, align 4
260  br label %for.cond
261
262for.end:                                          ; preds = %for.cond.cleanup
  ; Pass &Arr2[0] to the external consumer, then end the array's lifetime.
263  %arraydecay = getelementptr inbounds [64 x i16], [64 x i16]* %Arr2, i64 0, i64 0
264  call void @test_arr(i16* %arraydecay)
265  %8 = bitcast [64 x i16]* %Arr2 to i8*
266  call void @llvm.lifetime.end.p0i8(i64 128, i8* %8)
267  ret void
268}
269
; Lifetime-start marker intrinsic (size in bytes, pointer to the object).
270declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) nounwind
271
; External callees that receive the address of each test's stack object; they
; are only declared here, so the pointed-to memory may be modified by the call.
272declare void @test(<8 x i32>*) nounwind
273declare void @test1(<4 x i32>*) nounwind
274declare void @test_arr(i16*)
275
; Lifetime-end marker intrinsic (size in bytes, pointer to the object).
276declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) nounwind
277