1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx -S < %s | FileCheck %s --check-prefixes=AVX1
3; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx2 -S < %s | FileCheck %s --check-prefixes=AVX2
4; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx512bw -S < %s | FileCheck %s --check-prefixes=AVX512BW
5; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx,+xop -S < %s | FileCheck %s --check-prefixes=XOP
6; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx2,+xop -S < %s | FileCheck %s --check-prefixes=XOP
7; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx -S -enable-debugify < %s 2>&1 | FileCheck %s -check-prefix=DEBUG
8
9define <4 x i32> @vector_variable_shift_right_v4i32(<4 x i1> %cond, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
10; AVX1-LABEL: @vector_variable_shift_right_v4i32(
11; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
12; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
13; AVX1-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
14; AVX1-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SPLAT1]]
15; AVX1-NEXT:    [[TMP2:%.*]] = lshr <4 x i32> [[Z]], [[SPLAT2]]
16; AVX1-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[COND]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]
17; AVX1-NEXT:    ret <4 x i32> [[TMP3]]
18;
19; AVX2-LABEL: @vector_variable_shift_right_v4i32(
20; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
21; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
22; AVX2-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
23; AVX2-NEXT:    [[SH:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SEL]]
24; AVX2-NEXT:    ret <4 x i32> [[SH]]
25;
26; AVX512BW-LABEL: @vector_variable_shift_right_v4i32(
27; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
28; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
29; AVX512BW-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
30; AVX512BW-NEXT:    [[SH:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SEL]]
31; AVX512BW-NEXT:    ret <4 x i32> [[SH]]
32;
33; XOP-LABEL: @vector_variable_shift_right_v4i32(
34; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
35; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
36; XOP-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
37; XOP-NEXT:    [[SH:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SEL]]
38; XOP-NEXT:    ret <4 x i32> [[SH]]
39;
40  %splat1 = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
41  %splat2 = shufflevector <4 x i32> %y, <4 x i32> undef, <4 x i32> zeroinitializer
42  %sel = select <4 x i1> %cond, <4 x i32> %splat1, <4 x i32> %splat2
43  %sh = lshr <4 x i32> %z, %sel
44  ret <4 x i32> %sh
45}
46
47define <16 x i16> @vector_variable_shift_right_v16i16(<16 x i1> %cond, <16 x i16> %x, <16 x i16> %y, <16 x i16> %z) {
48; AVX1-LABEL: @vector_variable_shift_right_v16i16(
49; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
50; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
51; AVX1-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
52; AVX1-NEXT:    [[TMP1:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SPLAT1]]
53; AVX1-NEXT:    [[TMP2:%.*]] = lshr <16 x i16> [[Z]], [[SPLAT2]]
54; AVX1-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[COND]], <16 x i16> [[TMP1]], <16 x i16> [[TMP2]]
55; AVX1-NEXT:    ret <16 x i16> [[TMP3]]
56;
57; AVX2-LABEL: @vector_variable_shift_right_v16i16(
58; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
59; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
60; AVX2-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
61; AVX2-NEXT:    [[TMP1:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SPLAT1]]
62; AVX2-NEXT:    [[TMP2:%.*]] = lshr <16 x i16> [[Z]], [[SPLAT2]]
63; AVX2-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[COND]], <16 x i16> [[TMP1]], <16 x i16> [[TMP2]]
64; AVX2-NEXT:    ret <16 x i16> [[TMP3]]
65;
66; AVX512BW-LABEL: @vector_variable_shift_right_v16i16(
67; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
68; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
69; AVX512BW-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
70; AVX512BW-NEXT:    [[SH:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SEL]]
71; AVX512BW-NEXT:    ret <16 x i16> [[SH]]
72;
73; XOP-LABEL: @vector_variable_shift_right_v16i16(
74; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
75; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
76; XOP-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
77; XOP-NEXT:    [[SH:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SEL]]
78; XOP-NEXT:    ret <16 x i16> [[SH]]
79;
80  %splat1 = shufflevector <16 x i16> %x, <16 x i16> undef, <16 x i32> zeroinitializer
81  %splat2 = shufflevector <16 x i16> %y, <16 x i16> undef, <16 x i32> zeroinitializer
82  %sel = select <16 x i1> %cond, <16 x i16> %splat1, <16 x i16> %splat2
83  %sh = lshr <16 x i16> %z, %sel
84  ret <16 x i16> %sh
85}
86
87define <32 x i8> @vector_variable_shift_right_v32i8(<32 x i1> %cond, <32 x i8> %x, <32 x i8> %y, <32 x i8> %z) {
88; AVX1-LABEL: @vector_variable_shift_right_v32i8(
89; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <32 x i8> [[X:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
90; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <32 x i8> [[Y:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
91; AVX1-NEXT:    [[SEL:%.*]] = select <32 x i1> [[COND:%.*]], <32 x i8> [[SPLAT1]], <32 x i8> [[SPLAT2]]
92; AVX1-NEXT:    [[TMP1:%.*]] = lshr <32 x i8> [[Z:%.*]], [[SPLAT1]]
93; AVX1-NEXT:    [[TMP2:%.*]] = lshr <32 x i8> [[Z]], [[SPLAT2]]
94; AVX1-NEXT:    [[TMP3:%.*]] = select <32 x i1> [[COND]], <32 x i8> [[TMP1]], <32 x i8> [[TMP2]]
95; AVX1-NEXT:    ret <32 x i8> [[TMP3]]
96;
97; AVX2-LABEL: @vector_variable_shift_right_v32i8(
98; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <32 x i8> [[X:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
99; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <32 x i8> [[Y:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
100; AVX2-NEXT:    [[SEL:%.*]] = select <32 x i1> [[COND:%.*]], <32 x i8> [[SPLAT1]], <32 x i8> [[SPLAT2]]
101; AVX2-NEXT:    [[TMP1:%.*]] = lshr <32 x i8> [[Z:%.*]], [[SPLAT1]]
102; AVX2-NEXT:    [[TMP2:%.*]] = lshr <32 x i8> [[Z]], [[SPLAT2]]
103; AVX2-NEXT:    [[TMP3:%.*]] = select <32 x i1> [[COND]], <32 x i8> [[TMP1]], <32 x i8> [[TMP2]]
104; AVX2-NEXT:    ret <32 x i8> [[TMP3]]
105;
106; AVX512BW-LABEL: @vector_variable_shift_right_v32i8(
107; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <32 x i8> [[X:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
108; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <32 x i8> [[Y:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
109; AVX512BW-NEXT:    [[SEL:%.*]] = select <32 x i1> [[COND:%.*]], <32 x i8> [[SPLAT1]], <32 x i8> [[SPLAT2]]
110; AVX512BW-NEXT:    [[TMP1:%.*]] = lshr <32 x i8> [[Z:%.*]], [[SPLAT1]]
111; AVX512BW-NEXT:    [[TMP2:%.*]] = lshr <32 x i8> [[Z]], [[SPLAT2]]
112; AVX512BW-NEXT:    [[TMP3:%.*]] = select <32 x i1> [[COND]], <32 x i8> [[TMP1]], <32 x i8> [[TMP2]]
113; AVX512BW-NEXT:    ret <32 x i8> [[TMP3]]
114;
115; XOP-LABEL: @vector_variable_shift_right_v32i8(
116; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <32 x i8> [[X:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
117; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <32 x i8> [[Y:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
118; XOP-NEXT:    [[SEL:%.*]] = select <32 x i1> [[COND:%.*]], <32 x i8> [[SPLAT1]], <32 x i8> [[SPLAT2]]
119; XOP-NEXT:    [[SH:%.*]] = lshr <32 x i8> [[Z:%.*]], [[SEL]]
120; XOP-NEXT:    ret <32 x i8> [[SH]]
121;
122  %splat1 = shufflevector <32 x i8> %x, <32 x i8> undef, <32 x i32> zeroinitializer
123  %splat2 = shufflevector <32 x i8> %y, <32 x i8> undef, <32 x i32> zeroinitializer
124  %sel = select <32 x i1> %cond, <32 x i8> %splat1, <32 x i8> %splat2
125  %sh = lshr <32 x i8> %z, %sel
126  ret <32 x i8> %sh
127}
128
129; PR37428 - https://bugs.llvm.org/show_bug.cgi?id=37428
130
131define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1, i32 %x) {
132; AVX1-LABEL: @vector_variable_shift_left_loop(
133; AVX1-NEXT:  entry:
134; AVX1-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
135; AVX1-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[COUNT]] to i64
136; AVX1-NEXT:    br i1 [[CMP16]], label [[VECTOR_PH:%.*]], label [[EXIT:%.*]]
137; AVX1:       vector.ph:
138; AVX1-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
139; AVX1-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> undef, i32 [[AMT0:%.*]], i32 0
140; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
141; AVX1-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> undef, i32 [[AMT1:%.*]], i32 0
142; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
143; AVX1-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
144; AVX1-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
145; AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
146; AVX1:       vector.body:
147; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
148; AVX1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
149; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
150; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
151; AVX1-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
152; AVX1-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
153; AVX1-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
154; AVX1-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP4]]
155; AVX1-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
156; AVX1-NEXT:    [[TMP7:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP6]]
157; AVX1-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP5]], <4 x i32> [[TMP7]]
158; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
159; AVX1-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
160; AVX1-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP10]], align 4
161; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
162; AVX1-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
163; AVX1-NEXT:    br i1 [[TMP11]], label [[EXIT]], label [[VECTOR_BODY]]
164; AVX1:       exit:
165; AVX1-NEXT:    ret void
166;
167; AVX2-LABEL: @vector_variable_shift_left_loop(
168; AVX2-NEXT:  entry:
169; AVX2-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
170; AVX2-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[COUNT]] to i64
171; AVX2-NEXT:    br i1 [[CMP16]], label [[VECTOR_PH:%.*]], label [[EXIT:%.*]]
172; AVX2:       vector.ph:
173; AVX2-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
174; AVX2-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> undef, i32 [[AMT0:%.*]], i32 0
175; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
176; AVX2-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> undef, i32 [[AMT1:%.*]], i32 0
177; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
178; AVX2-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
179; AVX2-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
180; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
181; AVX2:       vector.body:
182; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
183; AVX2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
184; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
185; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
186; AVX2-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
187; AVX2-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
188; AVX2-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP3]]
189; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
190; AVX2-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
191; AVX2-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP6]], align 4
192; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
193; AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
194; AVX2-NEXT:    br i1 [[TMP7]], label [[EXIT]], label [[VECTOR_BODY]]
195; AVX2:       exit:
196; AVX2-NEXT:    ret void
197;
198; AVX512BW-LABEL: @vector_variable_shift_left_loop(
199; AVX512BW-NEXT:  entry:
200; AVX512BW-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
201; AVX512BW-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[COUNT]] to i64
202; AVX512BW-NEXT:    br i1 [[CMP16]], label [[VECTOR_PH:%.*]], label [[EXIT:%.*]]
203; AVX512BW:       vector.ph:
204; AVX512BW-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
205; AVX512BW-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> undef, i32 [[AMT0:%.*]], i32 0
206; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
207; AVX512BW-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> undef, i32 [[AMT1:%.*]], i32 0
208; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
209; AVX512BW-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
210; AVX512BW-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
211; AVX512BW-NEXT:    br label [[VECTOR_BODY:%.*]]
212; AVX512BW:       vector.body:
213; AVX512BW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
214; AVX512BW-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
215; AVX512BW-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
216; AVX512BW-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
217; AVX512BW-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
218; AVX512BW-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
219; AVX512BW-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP3]]
220; AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
221; AVX512BW-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
222; AVX512BW-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP6]], align 4
223; AVX512BW-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
224; AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
225; AVX512BW-NEXT:    br i1 [[TMP7]], label [[EXIT]], label [[VECTOR_BODY]]
226; AVX512BW:       exit:
227; AVX512BW-NEXT:    ret void
228;
229; XOP-LABEL: @vector_variable_shift_left_loop(
230; XOP-NEXT:  entry:
231; XOP-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
232; XOP-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[COUNT]] to i64
233; XOP-NEXT:    br i1 [[CMP16]], label [[VECTOR_PH:%.*]], label [[EXIT:%.*]]
234; XOP:       vector.ph:
235; XOP-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
236; XOP-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> undef, i32 [[AMT0:%.*]], i32 0
237; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
238; XOP-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> undef, i32 [[AMT1:%.*]], i32 0
239; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
240; XOP-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
241; XOP-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
242; XOP-NEXT:    br label [[VECTOR_BODY:%.*]]
243; XOP:       vector.body:
244; XOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
245; XOP-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
246; XOP-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
247; XOP-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
248; XOP-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
249; XOP-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
250; XOP-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP3]]
251; XOP-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
252; XOP-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
253; XOP-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP6]], align 4
254; XOP-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
255; XOP-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
256; XOP-NEXT:    br i1 [[TMP7]], label [[EXIT]], label [[VECTOR_BODY]]
257; XOP:       exit:
258; XOP-NEXT:    ret void
259;
260entry:
261  %cmp16 = icmp sgt i32 %count, 0
262  %wide.trip.count = zext i32 %count to i64
263  br i1 %cmp16, label %vector.ph, label %exit
264
265vector.ph:
266  %n.vec = and i64 %wide.trip.count, 4294967292
267  %splatinsert18 = insertelement <4 x i32> undef, i32 %amt0, i32 0
268  %splat1 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
269  %splatinsert20 = insertelement <4 x i32> undef, i32 %amt1, i32 0
270  %splat2 = shufflevector <4 x i32> %splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
271  %splatinsert22 = insertelement <4 x i32> undef, i32 %x, i32 0
272  %splat3 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
273  br label %vector.body
274
275vector.body:
276  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
277  %0 = getelementptr inbounds i8, i8* %control, i64 %index
278  %1 = bitcast i8* %0 to <4 x i8>*
279  %wide.load = load <4 x i8>, <4 x i8>* %1, align 1
280  %2 = icmp eq <4 x i8> %wide.load, zeroinitializer
281  %3 = select <4 x i1> %2, <4 x i32> %splat1, <4 x i32> %splat2
282  %4 = shl <4 x i32> %splat3, %3
283  %5 = getelementptr inbounds i32, i32* %arr, i64 %index
284  %6 = bitcast i32* %5 to <4 x i32>*
285  store <4 x i32> %4, <4 x i32>* %6, align 4
286  %index.next = add i64 %index, 4
287  %7 = icmp eq i64 %index.next, %n.vec
288  br i1 %7, label %exit, label %vector.body
289
290exit:
291  ret void
292}
293
294; PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426
295; If we don't have real vector shift instructions (AVX1), convert the funnel
296; shift into 2 funnel shifts and sink the splat shuffles into the loop.
297
298define void @fancierRotate2(i32* %arr, i8* %control, i32 %rot0, i32 %rot1) {
299; AVX1-LABEL: @fancierRotate2(
300; AVX1-NEXT:  entry:
301; AVX1-NEXT:    [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
302; AVX1-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
303; AVX1-NEXT:    [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
304; AVX1-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
305; AVX1-NEXT:    br label [[LOOP:%.*]]
306; AVX1:       loop:
307; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
308; AVX1-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
309; AVX1-NEXT:    [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
310; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
311; AVX1-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
312; AVX1-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
313; AVX1-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
314; AVX1-NEXT:    [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
315; AVX1-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
316; AVX1-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
317; AVX1-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[TMP0]])
318; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
319; AVX1-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[TMP2]])
320; AVX1-NEXT:    [[TMP4:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[TMP1]], <8 x i32> [[TMP3]]
321; AVX1-NEXT:    store <8 x i32> [[TMP4]], <8 x i32>* [[T5]], align 4
322; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
323; AVX1-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
324; AVX1-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
325; AVX1:       exit:
326; AVX1-NEXT:    ret void
327;
328; AVX2-LABEL: @fancierRotate2(
329; AVX2-NEXT:  entry:
330; AVX2-NEXT:    [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
331; AVX2-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
332; AVX2-NEXT:    [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
333; AVX2-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
334; AVX2-NEXT:    br label [[LOOP:%.*]]
335; AVX2:       loop:
336; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
337; AVX2-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
338; AVX2-NEXT:    [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
339; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
340; AVX2-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
341; AVX2-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
342; AVX2-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
343; AVX2-NEXT:    [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
344; AVX2-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
345; AVX2-NEXT:    [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
346; AVX2-NEXT:    store <8 x i32> [[ROT]], <8 x i32>* [[T5]], align 4
347; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
348; AVX2-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
349; AVX2-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
350; AVX2:       exit:
351; AVX2-NEXT:    ret void
352;
353; AVX512BW-LABEL: @fancierRotate2(
354; AVX512BW-NEXT:  entry:
355; AVX512BW-NEXT:    [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
356; AVX512BW-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
357; AVX512BW-NEXT:    [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
358; AVX512BW-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
359; AVX512BW-NEXT:    br label [[LOOP:%.*]]
360; AVX512BW:       loop:
361; AVX512BW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
362; AVX512BW-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
363; AVX512BW-NEXT:    [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
364; AVX512BW-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
365; AVX512BW-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
366; AVX512BW-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
367; AVX512BW-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
368; AVX512BW-NEXT:    [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
369; AVX512BW-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
370; AVX512BW-NEXT:    [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
371; AVX512BW-NEXT:    store <8 x i32> [[ROT]], <8 x i32>* [[T5]], align 4
372; AVX512BW-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
373; AVX512BW-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
374; AVX512BW-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
375; AVX512BW:       exit:
376; AVX512BW-NEXT:    ret void
377;
378; XOP-LABEL: @fancierRotate2(
379; XOP-NEXT:  entry:
380; XOP-NEXT:    [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
381; XOP-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
382; XOP-NEXT:    [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
383; XOP-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
384; XOP-NEXT:    br label [[LOOP:%.*]]
385; XOP:       loop:
386; XOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
387; XOP-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
388; XOP-NEXT:    [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
389; XOP-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
390; XOP-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
391; XOP-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
392; XOP-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
393; XOP-NEXT:    [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
394; XOP-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
395; XOP-NEXT:    [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
396; XOP-NEXT:    store <8 x i32> [[ROT]], <8 x i32>* [[T5]], align 4
397; XOP-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
398; XOP-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
399; XOP-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
400; XOP:       exit:
401; XOP-NEXT:    ret void
402;
403entry:
404  %i0 = insertelement <8 x i32> undef, i32 %rot0, i32 0
405  %s0 = shufflevector <8 x i32> %i0, <8 x i32> undef, <8 x i32> zeroinitializer
406  %i1 = insertelement <8 x i32> undef, i32 %rot1, i32 0
407  %s1 = shufflevector <8 x i32> %i1, <8 x i32> undef, <8 x i32> zeroinitializer
408  br label %loop
409
410loop:
411  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
412  %t0 = getelementptr inbounds i8, i8* %control, i64 %index
413  %t1 = bitcast i8* %t0 to <8 x i8>*
414  %wide.load = load <8 x i8>, <8 x i8>* %t1, align 1
415  %t2 = icmp eq <8 x i8> %wide.load, zeroinitializer
416  %shamt = select <8 x i1> %t2, <8 x i32> %s0, <8 x i32> %s1
417  %t4 = getelementptr inbounds i32, i32* %arr, i64 %index
418  %t5 = bitcast i32* %t4 to <8 x i32>*
419  %wide.load21 = load <8 x i32>, <8 x i32>* %t5, align 4
420  %rot = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %wide.load21, <8 x i32> %wide.load21, <8 x i32> %shamt)
421  store <8 x i32> %rot, <8 x i32>* %t5, align 4
422  %index.next = add i64 %index, 8
423  %t7 = icmp eq i64 %index.next, 1024
424  br i1 %t7, label %exit, label %loop
425
426exit:
427  ret void
428}
429
430declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>) #1
431
432; Check that every instruction inserted by -codegenprepare has a debug location.
433; DEBUG: CheckModuleDebugify: PASS
434