; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41

; foo converts a <3 x float> to <3 x i8> with fptoui, widens the result to
; <4 x i8> (lane 3 left undef by the shuffle), overwrites lane 3 with i8 -1
; and stores the four bytes to %out. The expected code materializes the -1
; byte as the immediate 255.
define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
; SSE2-LABEL: foo:
; SSE2: # BB#0:
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
; SSE2-NEXT: movl $255, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE41-LABEL: foo:
; SSE41: # BB#0:
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
; SSE41-NEXT: movl $255, %eax
; SSE41-NEXT: pinsrd $3, %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: movd %xmm0, (%rdi)
; SSE41-NEXT: retq
  %t0 = fptoui <3 x float> %in to <3 x i8>
  ; Widen 3 -> 4 lanes; the mask leaves lane 3 undef.
  %t1 = shufflevector <3 x i8> %t0, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  ; Fill the undef lane with all-ones (i8 -1).
  %t2 = insertelement <4 x i8> %t1, i8 -1, i32 3
  store <4 x i8> %t2, <4 x i8>* %out, align 4
  ret void
}

; Verify that the DAGCombiner doesn't wrongly fold a build_vector into a
; blend with a zero vector if the build_vector contains negative zero.
; test_negative_zero_1 builds <A[0], -0.0, A[2], +0.0>. In the SSE4.1
; expectations the -0.0 lane is read from memory while only the +0.0 lane is
; produced by the instruction's zeroing.

define <4 x float> @test_negative_zero_1(<4 x float> %A) {
; SSE2-LABEL: test_negative_zero_1:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_negative_zero_1:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2],zero
; SSE41-NEXT: retq
entry:
  %0 = extractelement <4 x float> %A, i32 0
  %1 = insertelement <4 x float> undef, float %0, i32 0
  %2 = insertelement <4 x float> %1, float -0.0, i32 1
  %3 = extractelement <4 x float> %A, i32 2
  %4 = insertelement <4 x float> %2, float %3, i32 2
  %5 = insertelement <4 x float> %4, float 0.0, i32 3
  ret <4 x float> %5
}

; Two-lane double variant: builds <A[0], -0.0>. Both run configurations are
; expected to load the -0.0 lane from memory.
define <2 x double> @test_negative_zero_2(<2 x double> %A) {
; CHECK-LABEL: test_negative_zero_2:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: retq
entry:
  %0 = extractelement <2 x double> %A, i32 0
  %1 = insertelement <2 x double> undef, double %0, i32 0
  %2 = insertelement <2 x double> %1, double -0.0, i32 1
  ret <2 x double> %2
}

; Builds a <4 x float> from four scalar float arguments, one per lane.
define <4 x float> @test_buildvector_v4f32_register(float %f0, float %f1, float %f2, float %f3) {
; SSE2-LABEL: test_buildvector_v4f32_register:
; SSE2: # BB#0:
; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4f32_register:
; SSE41: # BB#0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; SSE41-NEXT: retq
  %ins0 = insertelement <4 x float> undef, float %f0, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %f1, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %f2, i32 2
  %ins3 = insertelement <4 x float> %ins2, float %f3, i32 3
  ret <4 x float> %ins3
}

; Builds a <4 x float> whose four lanes are each loaded from a separate
; pointer argument.
define <4 x float> @test_buildvector_v4f32_load(float* %p0, float* %p1, float* %p2, float* %p3) {
; SSE2-LABEL: test_buildvector_v4f32_load:
; SSE2: # BB#0:
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4f32_load:
; SSE41: # BB#0:
; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; SSE41-NEXT: retq
  %f0 = load float, float* %p0, align 4
  %f1 = load float, float* %p1, align 4
  %f2 = load float, float* %p2, align 4
  %f3 = load float, float* %p3, align 4
  %ins0 = insertelement <4 x float> undef, float %f0, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %f1, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %f2, i32 2
  %ins3 = insertelement <4 x float> %ins2, float %f3, i32 3
  ret <4 x float> %ins3
}

; Mixed sources: lanes 0-2 come from scalar float arguments, lane 3 is loaded
; from %p3.
define <4 x float> @test_buildvector_v4f32_partial_load(float %f0, float %f1, float %f2, float* %p3) {
; SSE2-LABEL: test_buildvector_v4f32_partial_load:
; SSE2: # BB#0:
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4f32_partial_load:
; SSE41: # BB#0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; SSE41-NEXT: retq
  %f3 = load float, float* %p3, align 4
  %ins0 = insertelement <4 x float> undef, float %f0, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %f1, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %f2, i32 2
  %ins3 = insertelement <4 x float> %ins2, float %f3, i32 3
  ret <4 x float> %ins3
}

; Builds a <4 x i32> from four i32 arguments, one per lane.
define <4 x i32> @test_buildvector_v4i32_register(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
; SSE2-LABEL: test_buildvector_v4i32_register:
; SSE2: # BB#0:
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movd %edx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movd %esi, %xmm2
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4i32_register:
; SSE41: # BB#0:
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: pinsrd $1, %esi, %xmm0
; SSE41-NEXT: pinsrd $2, %edx, %xmm0
; SSE41-NEXT: pinsrd $3, %ecx, %xmm0
; SSE41-NEXT: retq
  %ins0 = insertelement <4 x i32> undef, i32 %a0, i32 0
  %ins1 = insertelement <4 x i32> %ins0, i32 %a1, i32 1
  %ins2 = insertelement <4 x i32> %ins1, i32 %a2, i32 2
  %ins3 = insertelement <4 x i32> %ins2, i32 %a3, i32 3
  ret <4 x i32> %ins3
}

; Only lanes 0 and 3 are defined (from %a0 and %a3); lanes 1 and 2 are undef.
define <4 x i32> @test_buildvector_v4i32_partial(i32 %a0, i32 %a3) {
; SSE2-LABEL: test_buildvector_v4i32_partial:
; SSE2: # BB#0:
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: movd %esi, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4i32_partial:
; SSE41: # BB#0:
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: pinsrd $3, %esi, %xmm0
; SSE41-NEXT: retq
  %ins0 = insertelement <4 x i32> undef, i32 %a0, i32 0
  %ins1 = insertelement <4 x i32> %ins0, i32 undef, i32 1
  %ins2 = insertelement <4 x i32> %ins1, i32 undef, i32 2
  %ins3 = insertelement <4 x i32> %ins2, i32 %a3, i32 3
  ret <4 x i32> %ins3
}

; Lanes 0, 2 and 3 come from arguments; lane 1 is the constant 0. Both run
; configurations are expected to produce the same code.
define <4 x i32> @test_buildvector_v4i32_register_zero(i32 %a0, i32 %a2, i32 %a3) {
; CHECK-LABEL: test_buildvector_v4i32_register_zero:
; CHECK: # BB#0:
; CHECK-NEXT: movd %edx, %xmm0
; CHECK-NEXT: movd %esi, %xmm1
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: movd %edi, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
  %ins0 = insertelement <4 x i32> undef, i32 %a0, i32 0
  %ins1 = insertelement <4 x i32> %ins0, i32 0, i32 1
  %ins2 = insertelement <4 x i32> %ins1, i32 %a2, i32 2
  %ins3 = insertelement <4 x i32> %ins2, i32 %a3, i32 3
  ret <4 x i32> %ins3
}

; Same idea with the zero in lane 0: lanes 1-3 come from arguments.
define <4 x i32> @test_buildvector_v4i32_register_zero_2(i32 %a1, i32 %a2, i32 %a3) {
; CHECK-LABEL: test_buildvector_v4i32_register_zero_2:
; CHECK: # BB#0:
; CHECK-NEXT: movd %edx, %xmm0
; CHECK-NEXT: movd %esi, %xmm1
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: movd %edi, %xmm0
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1]
; CHECK-NEXT: retq
  %ins0 = insertelement <4 x i32> undef, i32 0, i32 0
  %ins1 = insertelement <4 x i32> %ins0, i32 %a1, i32 1
  %ins2 = insertelement <4 x i32> %ins1, i32 %a2, i32 2
  %ins3 = insertelement <4 x i32> %ins2, i32 %a3, i32 3
  ret <4 x i32> %ins3
}

; Builds an <8 x i16> from eight i16 arguments. The expected code reads the
; last two arguments from the stack.
define <8 x i16> @test_buildvector_v8i16_register(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) {
; SSE2-LABEL: test_buildvector_v8i16_register:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movd %r9d, %xmm0
; SSE2-NEXT: movd %r8d, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movd %edx, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movd %esi, %xmm3
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v8i16_register:
; SSE41: # BB#0:
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: pinsrw $1, %esi, %xmm0
; SSE41-NEXT: pinsrw $2, %edx, %xmm0
; SSE41-NEXT: pinsrw $3, %ecx, %xmm0
; SSE41-NEXT: pinsrw $4, %r8d, %xmm0
; SSE41-NEXT: pinsrw $5, %r9d, %xmm0
; SSE41-NEXT: pinsrw $6, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: pinsrw $7, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: retq
  %ins0 = insertelement <8 x i16> undef, i16 %a0, i32 0
  %ins1 = insertelement <8 x i16> %ins0, i16 %a1, i32 1
  %ins2 = insertelement <8 x i16> %ins1, i16 %a2, i32 2
  %ins3 = insertelement <8 x i16> %ins2, i16 %a3, i32 3
  %ins4 = insertelement <8 x i16> %ins3, i16 %a4, i32 4
  %ins5 = insertelement <8 x i16> %ins4, i16 %a5, i32 5
  %ins6 = insertelement <8 x i16> %ins5, i16 %a6, i32 6
  %ins7 = insertelement <8 x i16> %ins6, i16 %a7, i32 7
  ret <8 x i16> %ins7
}

; Lanes 1, 3, 4 and 5 come from arguments; lanes 0, 2, 6 and 7 are undef.
define <8 x i16> @test_buildvector_v8i16_partial(i16 %a1, i16 %a3, i16 %a4, i16 %a5) {
; CHECK-LABEL: test_buildvector_v8i16_partial:
; CHECK: # BB#0:
; CHECK-NEXT: pxor %xmm0, %xmm0
; CHECK-NEXT: pinsrw $1, %edi, %xmm0
; CHECK-NEXT: pinsrw $3, %esi, %xmm0
; CHECK-NEXT: pinsrw $4, %edx, %xmm0
; CHECK-NEXT: pinsrw $5, %ecx, %xmm0
; CHECK-NEXT: retq
  %ins0 = insertelement <8 x i16> undef, i16 undef, i32 0
  %ins1 = insertelement <8 x i16> %ins0, i16 %a1, i32 1
  %ins2 = insertelement <8 x i16> %ins1, i16 undef, i32 2
  %ins3 = insertelement <8 x i16> %ins2, i16 %a3, i32 3
  %ins4 = insertelement <8 x i16> %ins3, i16 %a4, i32 4
  %ins5 = insertelement <8 x i16> %ins4, i16 %a5, i32 5
  %ins6 = insertelement <8 x i16> %ins5, i16 undef, i32 6
  %ins7 = insertelement <8 x i16> %ins6, i16 undef, i32 7
  ret <8 x i16> %ins7
}

; Lanes 0, 3, 4 and 5 come from arguments; lanes 1, 2, 6 and 7 are the
; constant 0.
define <8 x i16> @test_buildvector_v8i16_register_zero(i16 %a0, i16 %a3, i16 %a4, i16 %a5) {
; CHECK-LABEL: test_buildvector_v8i16_register_zero:
; CHECK: # BB#0:
; CHECK-NEXT: pxor %xmm0, %xmm0
; CHECK-NEXT: pinsrw $0, %edi, %xmm0
; CHECK-NEXT: pinsrw $3, %esi, %xmm0
; CHECK-NEXT: pinsrw $4, %edx, %xmm0
; CHECK-NEXT: pinsrw $5, %ecx, %xmm0
; CHECK-NEXT: retq
  %ins0 = insertelement <8 x i16> undef, i16 %a0, i32 0
  %ins1 = insertelement <8 x i16> %ins0, i16 0, i32 1
  %ins2 = insertelement <8 x i16> %ins1, i16 0, i32 2
  %ins3 = insertelement <8 x i16> %ins2, i16 %a3, i32 3
  %ins4 = insertelement <8 x i16> %ins3, i16 %a4, i32 4
  %ins5 = insertelement <8 x i16> %ins4, i16 %a5, i32 5
  %ins6 = insertelement <8 x i16> %ins5, i16 0, i32 6
  %ins7 = insertelement <8 x i16> %ins6, i16 0, i32 7
  ret <8 x i16> %ins7
}

; Lanes 1, 3, 4 and 5 come from arguments; lanes 0, 2, 6 and 7 are the
; constant 0 (lane 0 is zero here rather than an argument).
define <8 x i16> @test_buildvector_v8i16_register_zero_2(i16 %a1, i16 %a3, i16 %a4, i16 %a5) {
; CHECK-LABEL: test_buildvector_v8i16_register_zero_2:
; CHECK: # BB#0:
; CHECK-NEXT: pxor %xmm0, %xmm0
; CHECK-NEXT: pinsrw $1, %edi, %xmm0
; CHECK-NEXT: pinsrw $3, %esi, %xmm0
; CHECK-NEXT: pinsrw $4, %edx, %xmm0
; CHECK-NEXT: pinsrw $5, %ecx, %xmm0
; CHECK-NEXT: retq
  %ins0 = insertelement <8 x i16> undef, i16 0, i32 0
  %ins1 = insertelement <8 x i16> %ins0, i16 %a1, i32 1
  %ins2 = insertelement <8 x i16> %ins1, i16 0, i32 2
  %ins3 = insertelement <8 x i16> %ins2, i16 %a3, i32 3
  %ins4 = insertelement <8 x i16> %ins3, i16 %a4, i32 4
  %ins5 = insertelement <8 x i16> %ins4, i16 %a5, i32 5
  %ins6 = insertelement <8 x i16> %ins5, i16 0, i32 6
  %ins7 = insertelement <8 x i16> %ins6, i16 0, i32 7
  ret <8 x i16> %ins7
}

; Builds a <16 x i8> from sixteen i8 arguments. The expected code reads the
; arguments beyond the sixth from the stack.
define <16 x i8> @test_buildvector_v16i8_register(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) {
; SSE2-LABEL: test_buildvector_v16i8_register:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: movd %r9d, %xmm0
; SSE2-NEXT: movd %r8d, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movd %edx, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: movd %esi, %xmm4
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v16i8_register:
; SSE41: # BB#0:
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: pinsrb $1, %esi, %xmm0
; SSE41-NEXT: pinsrb $2, %edx, %xmm0
; SSE41-NEXT: pinsrb $3, %ecx, %xmm0
; SSE41-NEXT: pinsrb $4, %r8d, %xmm0
; SSE41-NEXT: pinsrb $5, %r9d, %xmm0
; SSE41-NEXT: pinsrb $6, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: pinsrb $7, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: pinsrb $8, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: pinsrb $9, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: pinsrb $10, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: pinsrb $11, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: pinsrb $12, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: pinsrb $13, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: pinsrb $14, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: pinsrb $15, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: retq
  %ins0 = insertelement <16 x i8> undef, i8 %a0, i32 0
  %ins1 = insertelement <16 x i8> %ins0, i8 %a1, i32 1
  %ins2 = insertelement <16 x i8> %ins1, i8 %a2, i32 2
  %ins3 = insertelement <16 x i8> %ins2, i8 %a3, i32 3
  %ins4 = insertelement <16 x i8> %ins3, i8 %a4, i32 4
  %ins5 = insertelement <16 x i8> %ins4, i8 %a5, i32 5
  %ins6 = insertelement <16 x i8> %ins5, i8 %a6, i32 6
  %ins7 = insertelement <16 x i8> %ins6, i8 %a7, i32 7
  %ins8 = insertelement <16 x i8> %ins7, i8 %a8, i32 8
  %ins9 = insertelement <16 x i8> %ins8, i8 %a9, i32 9
  %ins10 = insertelement <16 x i8> %ins9, i8 %a10, i32 10
  %ins11 = insertelement <16 x i8> %ins10, i8 %a11, i32 11
  %ins12 = insertelement <16 x i8> %ins11, i8 %a12, i32 12
  %ins13 = insertelement <16 x i8> %ins12, i8 %a13, i32 13
  %ins14 = insertelement <16 x i8> %ins13, i8 %a14, i32 14
  %ins15 = insertelement <16 x i8> %ins14, i8 %a15, i32 15
  ret <16 x i8> %ins15
}

; Lanes 2, 6, 8, 11, 12 and 15 come from arguments; every other lane is undef.
; NOTE(review): the SSE2 expectations insert into %xmm0 without zeroing it
; first, presumably acceptable because all unwritten lanes are undef - confirm.
define <16 x i8> @test_buildvector_v16i8_partial(i8 %a2, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
; SSE2-LABEL: test_buildvector_v16i8_partial:
; SSE2: # BB#0:
; SSE2-NEXT: movzbl %dil, %eax
; SSE2-NEXT: pinsrw $1, %eax, %xmm0
; SSE2-NEXT: movzbl %sil, %eax
; SSE2-NEXT: pinsrw $3, %eax, %xmm0
; SSE2-NEXT: movzbl %dl, %eax
; SSE2-NEXT: pinsrw $4, %eax, %xmm0
; SSE2-NEXT: shll $8, %ecx
; SSE2-NEXT: pinsrw $5, %ecx, %xmm0
; SSE2-NEXT: movzbl %r8b, %eax
; SSE2-NEXT: pinsrw $6, %eax, %xmm0
; SSE2-NEXT: shll $8, %r9d
; SSE2-NEXT: pinsrw $7, %r9d, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v16i8_partial:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $2, %edi, %xmm0
; SSE41-NEXT: pinsrb $6, %esi, %xmm0
; SSE41-NEXT: pinsrb $8, %edx, %xmm0
; SSE41-NEXT: pinsrb $11, %ecx, %xmm0
; SSE41-NEXT: pinsrb $12, %r8d, %xmm0
; SSE41-NEXT: pinsrb $15, %r9d, %xmm0
; SSE41-NEXT: retq
  %ins0 = insertelement <16 x i8> undef, i8 undef, i32 0
  %ins1 = insertelement <16 x i8> %ins0, i8 undef, i32 1
  %ins2 = insertelement <16 x i8> %ins1, i8 %a2, i32 2
  %ins3 = insertelement <16 x i8> %ins2, i8 undef, i32 3
  %ins4 = insertelement <16 x i8> %ins3, i8 undef, i32 4
  %ins5 = insertelement <16 x i8> %ins4, i8 undef, i32 5
  %ins6 = insertelement <16 x i8> %ins5, i8 %a6, i32 6
  %ins7 = insertelement <16 x i8> %ins6, i8 undef, i32 7
  %ins8 = insertelement <16 x i8> %ins7, i8 %a8, i32 8
  %ins9 = insertelement <16 x i8> %ins8, i8 undef, i32 9
  %ins10 = insertelement <16 x i8> %ins9, i8 undef, i32 10
  %ins11 = insertelement <16 x i8> %ins10, i8 %a11, i32 11
  %ins12 = insertelement <16 x i8> %ins11, i8 %a12, i32 12
  %ins13 = insertelement <16 x i8> %ins12, i8 undef, i32 13
  %ins14 = insertelement <16 x i8> %ins13, i8 undef, i32 14
  %ins15 = insertelement <16 x i8> %ins14, i8 %a15, i32 15
  ret <16 x i8> %ins15
}

; Lanes 0, 4, 6, 8, 11, 12 and 15 come from arguments; every other lane is the
; constant 0.
define <16 x i8> @test_buildvector_v16i8_register_zero(i8 %a0, i8 %a4, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
; SSE2-LABEL: test_buildvector_v16i8_register_zero:
; SSE2: # BB#0:
; SSE2-NEXT: movzbl %sil, %eax
; SSE2-NEXT: movzbl %dil, %esi
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: pinsrw $2, %eax, %xmm0
; SSE2-NEXT: movzbl %dl, %eax
; SSE2-NEXT: pinsrw $3, %eax, %xmm0
; SSE2-NEXT: movzbl %cl, %eax
; SSE2-NEXT: pinsrw $4, %eax, %xmm0
; SSE2-NEXT: shll $8, %r8d
; SSE2-NEXT: pinsrw $5, %r8d, %xmm0
; SSE2-NEXT: movzbl %r9b, %eax
; SSE2-NEXT: pinsrw $6, %eax, %xmm0
; SSE2-NEXT: movl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: shll $8, %eax
; SSE2-NEXT: pinsrw $7, %eax, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v16i8_register_zero:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $0, %edi, %xmm0
; SSE41-NEXT: pinsrb $4, %esi, %xmm0
; SSE41-NEXT: pinsrb $6, %edx, %xmm0
; SSE41-NEXT: pinsrb $8, %ecx, %xmm0
; SSE41-NEXT: pinsrb $11, %r8d, %xmm0
; SSE41-NEXT: pinsrb $12, %r9d, %xmm0
; SSE41-NEXT: pinsrb $15, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: retq
  %ins0 = insertelement <16 x i8> undef, i8 %a0, i32 0
  %ins1 = insertelement <16 x i8> %ins0, i8 0, i32 1
  %ins2 = insertelement <16 x i8> %ins1, i8 0, i32 2
  %ins3 = insertelement <16 x i8> %ins2, i8 0, i32 3
  %ins4 = insertelement <16 x i8> %ins3, i8 %a4, i32 4
  %ins5 = insertelement <16 x i8> %ins4, i8 0, i32 5
  %ins6 = insertelement <16 x i8> %ins5, i8 %a6, i32 6
  %ins7 = insertelement <16 x i8> %ins6, i8 0, i32 7
  %ins8 = insertelement <16 x i8> %ins7, i8 %a8, i32 8
  %ins9 = insertelement <16 x i8> %ins8, i8 0, i32 9
  %ins10 = insertelement <16 x i8> %ins9, i8 0, i32 10
  %ins11 = insertelement <16 x i8> %ins10, i8 %a11, i32 11
  %ins12 = insertelement <16 x i8> %ins11, i8 %a12, i32 12
  %ins13 = insertelement <16 x i8> %ins12, i8 0, i32 13
  %ins14 = insertelement <16 x i8> %ins13, i8 0, i32 14
  %ins15 = insertelement <16 x i8> %ins14, i8 %a15, i32 15
  ret <16 x i8> %ins15
}

; Lanes 2, 3, 6, 8, 11, 12 and 15 come from arguments; every other lane is the
; constant 0. Lanes 2 and 3 are adjacent bytes, and the SSE2 expectations
; combine them (shll/orl) before a single 16-bit insert.
define <16 x i8> @test_buildvector_v16i8_register_zero_2(i8 %a2, i8 %a3, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
; SSE2-LABEL: test_buildvector_v16i8_register_zero_2:
; SSE2: # BB#0:
; SSE2-NEXT: shll $8, %esi
; SSE2-NEXT: movzbl %dil, %eax
; SSE2-NEXT: orl %esi, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $1, %eax, %xmm0
; SSE2-NEXT: movzbl %dl, %eax
; SSE2-NEXT: pinsrw $3, %eax, %xmm0
; SSE2-NEXT: movzbl %cl, %eax
; SSE2-NEXT: pinsrw $4, %eax, %xmm0
; SSE2-NEXT: shll $8, %r8d
; SSE2-NEXT: pinsrw $5, %r8d, %xmm0
; SSE2-NEXT: movzbl %r9b, %eax
; SSE2-NEXT: pinsrw $6, %eax, %xmm0
; SSE2-NEXT: movl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: shll $8, %eax
; SSE2-NEXT: pinsrw $7, %eax, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v16i8_register_zero_2:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $2, %edi, %xmm0
; SSE41-NEXT: pinsrb $3, %esi, %xmm0
; SSE41-NEXT: pinsrb $6, %edx, %xmm0
; SSE41-NEXT: pinsrb $8, %ecx, %xmm0
; SSE41-NEXT: pinsrb $11, %r8d, %xmm0
; SSE41-NEXT: pinsrb $12, %r9d, %xmm0
; SSE41-NEXT: pinsrb $15, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: retq
  %ins0 = insertelement <16 x i8> undef, i8 0, i32 0
  %ins1 = insertelement <16 x i8> %ins0, i8 0, i32 1
  %ins2 = insertelement <16 x i8> %ins1, i8 %a2, i32 2
  %ins3 = insertelement <16 x i8> %ins2, i8 %a3, i32 3
  %ins4 = insertelement <16 x i8> %ins3, i8 0, i32 4
  %ins5 = insertelement <16 x i8> %ins4, i8 0, i32 5
  %ins6 = insertelement <16 x i8> %ins5, i8 %a6, i32 6
  %ins7 = insertelement <16 x i8> %ins6, i8 0, i32 7
  %ins8 = insertelement <16 x i8> %ins7, i8 %a8, i32 8
  %ins9 = insertelement <16 x i8> %ins8, i8 0, i32 9
  %ins10 = insertelement <16 x i8> %ins9, i8 0, i32 10
  %ins11 = insertelement <16 x i8> %ins10, i8 %a11, i32 11
  %ins12 = insertelement <16 x i8> %ins11, i8 %a12, i32 12
  %ins13 = insertelement <16 x i8> %ins12, i8 0, i32 13
  %ins14 = insertelement <16 x i8> %ins13, i8 0, i32 14
  %ins15 = insertelement <16 x i8> %ins14, i8 %a15, i32 15
  ret <16 x i8> %ins15
}