; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define float @matching_fp_scalar(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_scalar(
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, float* %p, align 16
  ret float %r
}

define float @matching_fp_scalar_volatile(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_scalar_volatile(
; CHECK-NEXT:    [[R:%.*]] = load volatile float, float* [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load volatile float, float* %p, align 16
  ret float %r
}

define double @larger_fp_scalar(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @larger_fp_scalar(
; CHECK-NEXT:    [[BC:%.*]] = bitcast float* [[P:%.*]] to double*
; CHECK-NEXT:    [[R:%.*]] = load double, double* [[BC]], align 16
; CHECK-NEXT:    ret double [[R]]
;
  %bc = bitcast float* %p to double*
  %r = load double, double* %bc, align 16
  ret double %r
}

define float @smaller_fp_scalar(double* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @smaller_fp_scalar(
; CHECK-NEXT:    [[BC:%.*]] = bitcast double* [[P:%.*]] to float*
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %bc = bitcast double* %p to float*
  %r = load float, float* %bc, align 16
  ret float %r
}

define float @matching_fp_vector(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_vector(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to float*
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %bc = bitcast <4 x float>* %p to float*
  %r = load float, float* %bc, align 16
  ret float %r
}

define float @matching_fp_vector_gep00(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_vector_gep00(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
  %r = load float, float* %gep, align 16
  ret float %r
}

define float @matching_fp_vector_gep01(<4 x float>* align 16 dereferenceable(20) %p) {
; CHECK-LABEL: @matching_fp_vector_gep01(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 4
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 1
  %r = load float, float* %gep, align 4
  ret float %r
}

define float @matching_fp_vector_gep01_deref(<4 x float>* align 16 dereferenceable(19) %p) {
; CHECK-LABEL: @matching_fp_vector_gep01_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 4
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 1
  %r = load float, float* %gep, align 4
  ret float %r
}

define float @matching_fp_vector_gep10(<4 x float>* align 16 dereferenceable(32) %p) {
; CHECK-LABEL: @matching_fp_vector_gep10(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 1, i64 0
  %r = load float, float* %gep, align 16
  ret float %r
}

define float @matching_fp_vector_gep10_deref(<4 x float>* align 16 dereferenceable(31) %p) {
; CHECK-LABEL: @matching_fp_vector_gep10_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 1, i64 0
  %r = load float, float* %gep, align 16
  ret float %r
}

define float @nonmatching_int_vector(<2 x i64>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @nonmatching_int_vector(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64>* [[P:%.*]] to float*
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %bc = bitcast <2 x i64>* %p to float*
  %r = load float, float* %bc, align 16
  ret float %r
}

define double @less_aligned(double* align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @less_aligned(
; CHECK-NEXT:    [[R:%.*]] = load double, double* [[P:%.*]], align 4
; CHECK-NEXT:    ret double [[R]]
;
  %r = load double, double* %p, align 4
  ret double %r
}

define float @matching_fp_scalar_small_deref(float* align 16 dereferenceable(15) %p) {
; CHECK-LABEL: @matching_fp_scalar_small_deref(
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, float* %p, align 16
  ret float %r
}

define i64 @larger_int_scalar(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @larger_int_scalar(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to i64*
; CHECK-NEXT:    [[R:%.*]] = load i64, i64* [[BC]], align 16
; CHECK-NEXT:    ret i64 [[R]]
;
  %bc = bitcast <4 x float>* %p to i64*
  %r = load i64, i64* %bc, align 16
  ret i64 %r
}

define i8 @smaller_int_scalar(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @smaller_int_scalar(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to i8*
; CHECK-NEXT:    [[R:%.*]] = load i8, i8* [[BC]], align 16
; CHECK-NEXT:    ret i8 [[R]]
;
  %bc = bitcast <4 x float>* %p to i8*
  %r = load i8, i8* %bc, align 16
  ret i8 %r
}

define double @larger_fp_scalar_256bit_vec(<8 x float>* align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @larger_fp_scalar_256bit_vec(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <8 x float>* [[P:%.*]] to double*
; CHECK-NEXT:    [[R:%.*]] = load double, double* [[BC]], align 32
; CHECK-NEXT:    ret double [[R]]
;
  %bc = bitcast <8 x float>* %p to double*
  %r = load double, double* %bc, align 32
  ret double %r
}

define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_f32_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %b = bitcast <4 x float>* %p to float*
  %s = load float, float* %b, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Element type does not change cost.

define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_i32_insert_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %s = load i32, i32* %p, align 4
  %r = insertelement <4 x i32> undef, i32 %s, i32 0
  ret <4 x i32> %r
}

; Pointer type does not change cost.

define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_i32_insert_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %b = bitcast <16 x i8>* %p to i32*
  %s = load i32, i32* %b, align 4
  %r = insertelement <4 x i32> undef, i32 %s, i32 0
  ret <4 x i32> %r
}

; This is canonical form for vector element access.

define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
  %s = load float, float* %gep, align 16
  %r = insertelement <4 x float> undef, float %s, i64 0
  ret <4 x float> %r
}

; Should work with addrspace as well.

define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(44)* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* %p, i64 0, i64 0
  %s = load float, float addrspace(44)* %gep, align 16
  %r = insertelement <4 x float> undef, float %s, i64 0
  ret <4 x float> %r
}

; If there are enough dereferenceable bytes, we can offset the vector load.
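; For example, loading <8 x i16> (16 bytes) at element offset 1 (byte offset 2) touches
; bytes 2..17, so the dereferenceable(18) below is exactly enough.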

define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
  %s = load i16, i16* %gep, align 2
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Can't safely load the offset vector, but can load+shuffle if it is profitable.
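; With only dereferenceable(17), a 16-byte load at byte offset 2 would need 18 bytes.
; The whole <8 x i16> can still be loaded from the base pointer with element 1 shuffled
; into lane 0; the checks show this is considered profitable on AVX2 but not on SSE2.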

define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) nofree nosync {
; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref(
; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; SSE2-NEXT:    ret <8 x i16> [[R]]
;
; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref(
; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 16
; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
  %s = load i16, i16* %gep, align 2
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Verify that alignment of the new load is not over-specified.
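; The scalar load claims align 8, but the pointer is only align 2, so the widened
; <8 x i16> load from the base pointer must be given align 2 rather than inheriting
; the larger value.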

define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2 dereferenceable(16) %p) nofree nosync {
; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 8
; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; SSE2-NEXT:    ret <8 x i16> [[R]]
;
; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 2
; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
  %s = load i16, i16* %gep, align 8
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - if we are shuffling a load from the base pointer, the address offset
; must be a multiple of element size.
; TODO: Could bitcast around this limitation.
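; Below, the i32 is loaded from byte offset 1, which is not a multiple of 4, so no lane
; of a <4 x i32> load from the base pointer holds exactly those bytes.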

define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(16) %p) {
; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[GEP]] to i32*
; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 1
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 1
  %b = bitcast i8* %gep to i32*
  %s = load i32, i32* %b, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 12
  %b = bitcast i8* %gep to i32*
  %s = load i32, i32* %b, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

; Negative test - if we are shuffling a load from the base pointer, the address offset
; must be a multiple of element size and the offset must be low enough to fit in the vector
; (bitcasting would not help this case).
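; Below, byte offset 13 is not a multiple of 4, and a 4-byte load there would span
; bytes 13..16, past the end of the 16-byte source vector (bytes 0..15).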

define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep013_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 13
; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[GEP]] to i32*
; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 1
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 13
  %b = bitcast i8* %gep to i32*
  %s = load i32, i32* %b, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

; If there are enough dereferenceable bytes, we can offset the vector load.
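; Here the gep is byte offset 16, so the widened 16-byte load covers bytes 16..31 and
; dereferenceable(32) is exactly enough.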

define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(32) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - disable under asan because widened load can cause spurious
; use-after-poison issues when __asan_poison_memory_region is used.
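; The scalar load reads only 2 bytes, but the widened <8 x i16> load would read 16 and
; could touch bytes the program has poisoned, so the transform is skipped here.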

define <8 x i16> @gep10_load_i16_insert_v8i16_asan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_address nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_asan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; hwasan and memtag should be similarly suppressed.

define <8 x i16> @gep10_load_i16_insert_v8i16_hwasan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_hwaddress nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_hwasan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

define <8 x i16> @gep10_load_i16_insert_v8i16_memtag(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_memtag nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_memtag(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - disable under tsan because widened load may overlap bytes
; being concurrently modified. tsan does not know that some bytes are undef.
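; The extra bytes read by a widened load may be written concurrently by another thread,
; which tsan could flag even though those bytes are never used.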

define <8 x i16> @gep10_load_i16_insert_v8i16_tsan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_thread nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_tsan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - can't safely load the offset vector, but could load+shuffle.
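; Widening in place would be a 16-byte load at byte offset 16, requiring 32
; dereferenceable bytes, but only 31 are available.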

define <8 x i16> @gep10_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(31) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - do not alter volatile.

define <4 x float> @load_f32_insert_v4f32_volatile(float* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_volatile(
; CHECK-NEXT:    [[S:%.*]] = load volatile float, float* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load volatile float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Pointer is not as aligned as load, but that's ok.
; The new load uses the larger alignment value.
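; Here the parameter is only align 1 while the scalar load claims align 4, so the
; widened <4 x float> load keeps align 4.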

define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_align(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Negative test - not enough bytes.
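; A widened <4 x float> load would need 16 dereferenceable bytes; only 15 are guaranteed.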

define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(15) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_deref(
; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_i32_insert_v8i32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %s = load i32, i32* %p, align 4
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_i32_insert_v8i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %b = bitcast <4 x i32>* %p to i32*
  %s = load i32, i32* %b, align 4
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v16f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <16 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <16 x float> undef, float %s, i32 0
  ret <16 x float> %r
}

define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v2f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> <i32 0, i32 undef>
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <2 x float> undef, float %s, i32 0
  ret <2 x float> %r
}

; Negative test - suppress load widening for asan/hwasan/memtag/tsan.

define <2 x float> @load_f32_insert_v2f32_asan(float* align 16 dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_f32_insert_v2f32_asan(
; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <2 x float> undef, float %s, i32 0
  ret <2 x float> %r
}

declare float* @getscaleptr()
define void @PR47558_multiple_use_load(<2 x float>* nocapture nonnull %resultptr, <2 x float>* nocapture nonnull readonly %opptr) {
; CHECK-LABEL: @PR47558_multiple_use_load(
; CHECK-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
; CHECK-NEXT:    [[OP:%.*]] = load <2 x float>, <2 x float>* [[OPPTR:%.*]], align 4
; CHECK-NEXT:    [[SCALE:%.*]] = load float, float* [[SCALEPTR]], align 16
; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> undef, float [[SCALE]], i32 0
; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
; CHECK-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> undef, float [[T4]], i32 0
; CHECK-NEXT:    [[T5:%.*]] = extractelement <2 x float> [[T3]], i32 1
; CHECK-NEXT:    [[RESULT1:%.*]] = insertelement <2 x float> [[RESULT0]], float [[T5]], i32 1
; CHECK-NEXT:    store <2 x float> [[RESULT1]], <2 x float>* [[RESULTPTR:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
  %op = load <2 x float>, <2 x float>* %opptr, align 4
  %scale = load float, float* %scaleptr, align 16
  %t1 = insertelement <2 x float> undef, float %scale, i32 0
  %t2 = insertelement <2 x float> %t1, float %scale, i32 1
  %t3 = fmul <2 x float> %op, %t2
  %t4 = extractelement <2 x float> %t3, i32 0
  %result0 = insertelement <2 x float> undef, float %t4, i32 0
  %t5 = extractelement <2 x float> %t3, i32 1
  %result1 = insertelement <2 x float> %result0, float %t5, i32 1
  store <2 x float> %result1, <2 x float>* %resultptr, align 8
  ret void
}

define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %l = load <2 x float>, <2 x float>* %p, align 4
  %s = extractelement <2 x float> %l, i32 0
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_v8f32_extract_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %l = load <8 x float>, <8 x float>* %p, align 4
  %s = extractelement <8 x float> %l, i32 0
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(<1 x i32>* align 16 dereferenceable(16) %p, <1 x i32>* %store_ptr) nofree nosync {
; CHECK-LABEL: @load_v1i32_extract_insert_v8i32_extra_use(
; CHECK-NEXT:    [[L:%.*]] = load <1 x i32>, <1 x i32>* [[P:%.*]], align 4
; CHECK-NEXT:    store <1 x i32> [[L]], <1 x i32>* [[STORE_PTR:%.*]], align 4
; CHECK-NEXT:    [[S:%.*]] = extractelement <1 x i32> [[L]], i32 0
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %l = load <1 x i32>, <1 x i32>* %p, align 4
  store <1 x i32> %l, <1 x i32>* %store_ptr
  %s = extractelement <1 x i32> %l, i32 0
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

; Can't safely load the offset vector, but can load+shuffle if it is profitable.
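; The gep below is byte offset 4, so widening in place would need 4 + 16 = 20
; dereferenceable bytes and only 16 are available. Loading <8 x i16> from the base
; pointer needs just 16 bytes, and the wanted value is element 2 of that load
; (profitable on AVX2 only, per the checks).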

define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) nofree nosync {
; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
; SSE2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[GEP]], i32 0, i32 0
; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[TMP1]], align 8
; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; SSE2-NEXT:    ret <8 x i16> [[R]]
;
; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>*
; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4
; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1
  %l = load <2 x i16>, <2 x i16>* %gep, align 8
  %s = extractelement <2 x i16> %l, i32 0
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; PR30986 - split vector loads for scalarized operations
define <2 x i64> @PR30986(<2 x i64>* %0) {
; CHECK-LABEL: @PR30986(
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[TMP0:%.*]], i32 0, i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 16
; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP3]])
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[TMP0]], i32 0, i32 1
; CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
; CHECK-NEXT:    [[TMP8:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP8]], i32 1
; CHECK-NEXT:    ret <2 x i64> [[TMP9]]
;
  %2 = load <2 x i64>, <2 x i64>* %0, align 16
  %3 = extractelement <2 x i64> %2, i32 0
  %4 = tail call i64 @llvm.ctpop.i64(i64 %3)
  %5 = insertelement <2 x i64> undef, i64 %4, i32 0
  %6 = extractelement <2 x i64> %2, i32 1
  %7 = tail call i64 @llvm.ctpop.i64(i64 %6)
  %8 = insertelement <2 x i64> %5, i64 %7, i32 1
  ret <2 x i64> %8
}
declare i64 @llvm.ctpop.i64(i64)