; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define float @matching_fp_scalar(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_scalar(
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, float* %p, align 16
  ret float %r
}

define float @matching_fp_scalar_volatile(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_scalar_volatile(
; CHECK-NEXT:    [[R:%.*]] = load volatile float, float* [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load volatile float, float* %p, align 16
  ret float %r
}

define double @larger_fp_scalar(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @larger_fp_scalar(
; CHECK-NEXT:    [[BC:%.*]] = bitcast float* [[P:%.*]] to double*
; CHECK-NEXT:    [[R:%.*]] = load double, double* [[BC]], align 16
; CHECK-NEXT:    ret double [[R]]
;
  %bc = bitcast float* %p to double*
  %r = load double, double* %bc, align 16
  ret double %r
}

define float @smaller_fp_scalar(double* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @smaller_fp_scalar(
; CHECK-NEXT:    [[BC:%.*]] = bitcast double* [[P:%.*]] to float*
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %bc = bitcast double* %p to float*
  %r = load float, float* %bc, align 16
  ret float %r
}

define float @matching_fp_vector(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_vector(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to float*
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %bc = bitcast <4 x float>* %p to float*
  %r = load float, float* %bc, align 16
  ret float %r
}

define float @matching_fp_vector_gep00(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_vector_gep00(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
  %r = load float, float* %gep, align 16
  ret float %r
}

define float @matching_fp_vector_gep01(<4 x float>* align 16 dereferenceable(20) %p) {
; CHECK-LABEL: @matching_fp_vector_gep01(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 4
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 1
  %r = load float, float* %gep, align 4
  ret float %r
}

define float @matching_fp_vector_gep01_deref(<4 x float>* align 16 dereferenceable(19) %p) {
; CHECK-LABEL: @matching_fp_vector_gep01_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 4
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 1
  %r = load float, float* %gep, align 4
  ret float %r
}

define float @matching_fp_vector_gep10(<4 x float>* align 16 dereferenceable(32) %p) {
; CHECK-LABEL: @matching_fp_vector_gep10(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 1, i64 0
  %r = load float, float* %gep, align 16
  ret float %r
}

define float @matching_fp_vector_gep10_deref(<4 x float>* align 16 dereferenceable(31) %p) {
; CHECK-LABEL: @matching_fp_vector_gep10_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 1, i64 0
  %r = load float, float* %gep, align 16
  ret float %r
}

define float @nonmatching_int_vector(<2 x i64>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @nonmatching_int_vector(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64>* [[P:%.*]] to float*
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %bc = bitcast <2 x i64>* %p to float*
  %r = load float, float* %bc, align 16
  ret float %r
}

define double @less_aligned(double* align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @less_aligned(
; CHECK-NEXT:    [[R:%.*]] = load double, double* [[P:%.*]], align 4
; CHECK-NEXT:    ret double [[R]]
;
  %r = load double, double* %p, align 4
  ret double %r
}

define float @matching_fp_scalar_small_deref(float* align 16 dereferenceable(15) %p) {
; CHECK-LABEL: @matching_fp_scalar_small_deref(
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, float* %p, align 16
  ret float %r
}

define i64 @larger_int_scalar(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @larger_int_scalar(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to i64*
; CHECK-NEXT:    [[R:%.*]] = load i64, i64* [[BC]], align 16
; CHECK-NEXT:    ret i64 [[R]]
;
  %bc = bitcast <4 x float>* %p to i64*
  %r = load i64, i64* %bc, align 16
  ret i64 %r
}

define i8 @smaller_int_scalar(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @smaller_int_scalar(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to i8*
; CHECK-NEXT:    [[R:%.*]] = load i8, i8* [[BC]], align 16
; CHECK-NEXT:    ret i8 [[R]]
;
  %bc = bitcast <4 x float>* %p to i8*
  %r = load i8, i8* %bc, align 16
  ret i8 %r
}

define double @larger_fp_scalar_256bit_vec(<8 x float>* align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @larger_fp_scalar_256bit_vec(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <8 x float>* [[P:%.*]] to double*
; CHECK-NEXT:    [[R:%.*]] = load double, double* [[BC]], align 32
; CHECK-NEXT:    ret double [[R]]
;
  %bc = bitcast <8 x float>* %p to double*
  %r = load double, double* %bc, align 32
  ret double %r
}

define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_f32_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %b = bitcast <4 x float>* %p to float*
  %s = load float, float* %b, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Element type does not change cost.

define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_i32_insert_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %s = load i32, i32* %p, align 4
  %r = insertelement <4 x i32> undef, i32 %s, i32 0
  ret <4 x i32> %r
}

; Pointer type does not change cost.

define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_i32_insert_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %b = bitcast <16 x i8>* %p to i32*
  %s = load i32, i32* %b, align 4
  %r = insertelement <4 x i32> undef, i32 %s, i32 0
  ret <4 x i32> %r
}

; This is canonical form for vector element access.

define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
  %s = load float, float* %gep, align 16
  %r = insertelement <4 x float> undef, float %s, i64 0
  ret <4 x float> %r
}

; Should work with addrspace as well.

define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(44)* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* %p, i64 0, i64 0
  %s = load float, float addrspace(44)* %gep, align 16
  %r = insertelement <4 x float> undef, float %s, i64 0
  ret <4 x float> %r
}

; If there are enough dereferenceable bytes, we can offset the vector load.

define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
  %s = load i16, i16* %gep, align 2
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Can't safely load the offset vector, but can load+shuffle if it is profitable.

define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) nofree nosync {
; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref(
; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; SSE2-NEXT:    ret <8 x i16> [[R]]
;
; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref(
; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 16
; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
  %s = load i16, i16* %gep, align 2
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Verify that alignment of the new load is not over-specified.

define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2 dereferenceable(16) %p) nofree nosync {
; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 8
; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; SSE2-NEXT:    ret <8 x i16> [[R]]
;
; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 2
; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
  %s = load i16, i16* %gep, align 8
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - if we are shuffling a load from the base pointer, the address offset
; must be a multiple of element size.
; TODO: Could bitcast around this limitation.

define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(16) %p) {
; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[GEP]] to i32*
; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 1
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 1
  %b = bitcast i8* %gep to i32*
  %s = load i32, i32* %b, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 12
  %b = bitcast i8* %gep to i32*
  %s = load i32, i32* %b, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

; Negative test - if we are shuffling a load from the base pointer, the address offset
; must be a multiple of element size and the offset must be low enough to fit in the vector
; (bitcasting would not help this case).

define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep013_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 13
; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[GEP]] to i32*
; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 1
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 13
  %b = bitcast i8* %gep to i32*
  %s = load i32, i32* %b, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

; If there are enough dereferenceable bytes, we can offset the vector load.

define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(32) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - disable under asan because widened load can cause spurious
; use-after-poison issues when __asan_poison_memory_region is used.

define <8 x i16> @gep10_load_i16_insert_v8i16_asan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_address nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_asan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; hwasan and memtag should be similarly suppressed.

define <8 x i16> @gep10_load_i16_insert_v8i16_hwasan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_hwaddress nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_hwasan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

define <8 x i16> @gep10_load_i16_insert_v8i16_memtag(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_memtag nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_memtag(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - disable under tsan because widened load may overlap bytes
; being concurrently modified. tsan does not know that some bytes are undef.

define <8 x i16> @gep10_load_i16_insert_v8i16_tsan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_thread nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_tsan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - can't safely load the offset vector, but could load+shuffle.

define <8 x i16> @gep10_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(31) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - do not alter volatile.

define <4 x float> @load_f32_insert_v4f32_volatile(float* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_volatile(
; CHECK-NEXT:    [[S:%.*]] = load volatile float, float* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load volatile float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Pointer is not as aligned as load, but that's ok.
; The new load uses the larger alignment value.

define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_align(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Negative test - not enough bytes.

define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(15) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_deref(
; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_i32_insert_v8i32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %s = load i32, i32* %p, align 4
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_i32_insert_v8i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %b = bitcast <4 x i32>* %p to i32*
  %s = load i32, i32* %b, align 4
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v16f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <16 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <16 x float> undef, float %s, i32 0
  ret <16 x float> %r
}

define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v2f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> <i32 0, i32 undef>
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <2 x float> undef, float %s, i32 0
  ret <2 x float> %r
}

; Negative test - suppress load widening for asan/hwasan/memtag/tsan.

define <2 x float> @load_f32_insert_v2f32_asan(float* align 16 dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_f32_insert_v2f32_asan(
; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <2 x float> undef, float %s, i32 0
  ret <2 x float> %r
}

declare float* @getscaleptr()
define void @PR47558_multiple_use_load(<2 x float>* nocapture nonnull %resultptr, <2 x float>* nocapture nonnull readonly %opptr) {
; CHECK-LABEL: @PR47558_multiple_use_load(
; CHECK-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
; CHECK-NEXT:    [[OP:%.*]] = load <2 x float>, <2 x float>* [[OPPTR:%.*]], align 4
; CHECK-NEXT:    [[SCALE:%.*]] = load float, float* [[SCALEPTR]], align 16
; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> undef, float [[SCALE]], i32 0
; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
; CHECK-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> undef, float [[T4]], i32 0
; CHECK-NEXT:    [[T5:%.*]] = extractelement <2 x float> [[T3]], i32 1
; CHECK-NEXT:    [[RESULT1:%.*]] = insertelement <2 x float> [[RESULT0]], float [[T5]], i32 1
; CHECK-NEXT:    store <2 x float> [[RESULT1]], <2 x float>* [[RESULTPTR:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
  %op = load <2 x float>, <2 x float>* %opptr, align 4
  %scale = load float, float* %scaleptr, align 16
  %t1 = insertelement <2 x float> undef, float %scale, i32 0
  %t2 = insertelement <2 x float> %t1, float %scale, i32 1
  %t3 = fmul <2 x float> %op, %t2
  %t4 = extractelement <2 x float> %t3, i32 0
  %result0 = insertelement <2 x float> undef, float %t4, i32 0
  %t5 = extractelement <2 x float> %t3, i32 1
  %result1 = insertelement <2 x float> %result0, float %t5, i32 1
  store <2 x float> %result1, <2 x float>* %resultptr, align 8
  ret void
}

define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %l = load <2 x float>, <2 x float>* %p, align 4
  %s = extractelement <2 x float> %l, i32 0
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_v8f32_extract_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %l = load <8 x float>, <8 x float>* %p, align 4
  %s = extractelement <8 x float> %l, i32 0
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(<1 x i32>* align 16 dereferenceable(16) %p, <1 x i32>* %store_ptr) nofree nosync {
; CHECK-LABEL: @load_v1i32_extract_insert_v8i32_extra_use(
; CHECK-NEXT:    [[L:%.*]] = load <1 x i32>, <1 x i32>* [[P:%.*]], align 4
; CHECK-NEXT:    store <1 x i32> [[L]], <1 x i32>* [[STORE_PTR:%.*]], align 4
; CHECK-NEXT:    [[S:%.*]] = extractelement <1 x i32> [[L]], i32 0
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %l = load <1 x i32>, <1 x i32>* %p, align 4
  store <1 x i32> %l, <1 x i32>* %store_ptr
  %s = extractelement <1 x i32> %l, i32 0
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

; Can't safely load the offset vector, but can load+shuffle if it is profitable.

define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) nofree nosync {
; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
; SSE2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[GEP]], i32 0, i32 0
; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[TMP1]], align 8
; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; SSE2-NEXT:    ret <8 x i16> [[R]]
;
; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>*
; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4
; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1
  %l = load <2 x i16>, <2 x i16>* %gep, align 8
  %s = extractelement <2 x i16> %l, i32 0
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; PR30986 - split vector loads for scalarized operations
define <2 x i64> @PR30986(<2 x i64>* %0) {
; CHECK-LABEL: @PR30986(
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[TMP0:%.*]], i32 0, i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 16
; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP3]])
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[TMP0]], i32 0, i32 1
; CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
; CHECK-NEXT:    [[TMP8:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP8]], i32 1
; CHECK-NEXT:    ret <2 x i64> [[TMP9]]
;
  %2 = load <2 x i64>, <2 x i64>* %0, align 16
  %3 = extractelement <2 x i64> %2, i32 0
  %4 = tail call i64 @llvm.ctpop.i64(i64 %3)
  %5 = insertelement <2 x i64> undef, i64 %4, i32 0
  %6 = extractelement <2 x i64> %2, i32 1
  %7 = tail call i64 @llvm.ctpop.i64(i64 %6)
  %8 = insertelement <2 x i64> %5, i64 %7, i32 1
  ret <2 x i64> %8
}
declare i64 @llvm.ctpop.i64(i64)