1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 4 5; 6; Unary shuffle indices from registers 7; 8 9define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { 10; ALL-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64: 11; ALL: # BB#0: 12; ALL-NEXT: pushq %rbp 13; ALL-NEXT: movq %rsp, %rbp 14; ALL-NEXT: andq $-32, %rsp 15; ALL-NEXT: subq $64, %rsp 16; ALL-NEXT: andl $3, %ecx 17; ALL-NEXT: andl $3, %edx 18; ALL-NEXT: andl $3, %esi 19; ALL-NEXT: andl $3, %edi 20; ALL-NEXT: vmovaps %ymm0, (%rsp) 21; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 22; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] 23; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 24; ALL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 25; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 26; ALL-NEXT: movq %rbp, %rsp 27; ALL-NEXT: popq %rbp 28; ALL-NEXT: retq 29 %x0 = extractelement <4 x double> %x, i64 %i0 30 %x1 = extractelement <4 x double> %x, i64 %i1 31 %x2 = extractelement <4 x double> %x, i64 %i2 32 %x3 = extractelement <4 x double> %x, i64 %i3 33 %r0 = insertelement <4 x double> undef, double %x0, i32 0 34 %r1 = insertelement <4 x double> %r0, double %x1, i32 1 35 %r2 = insertelement <4 x double> %r1, double %x2, i32 2 36 %r3 = insertelement <4 x double> %r2, double %x3, i32 3 37 ret <4 x double> %r3 38} 39 40define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { 41; ALL-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64: 42; ALL: # BB#0: 43; ALL-NEXT: pushq %rbp 44; ALL-NEXT: movq %rsp, %rbp 45; ALL-NEXT: andq $-32, %rsp 46; ALL-NEXT: subq $64, %rsp 47; ALL-NEXT: andl $3, %edx 48; 
ALL-NEXT: andl $3, %esi 49; ALL-NEXT: vmovaps %ymm0, (%rsp) 50; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 51; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 52; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 53; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 54; ALL-NEXT: movq %rbp, %rsp 55; ALL-NEXT: popq %rbp 56; ALL-NEXT: retq 57 %x0 = extractelement <4 x double> %x, i64 %i0 58 %x1 = extractelement <4 x double> %x, i64 %i1 59 %x2 = extractelement <4 x double> %x, i64 %i2 60 %x3 = extractelement <4 x double> %x, i64 %i3 61 %r0 = insertelement <4 x double> undef, double undef, i32 0 62 %r1 = insertelement <4 x double> %r0, double %x1, i32 1 63 %r2 = insertelement <4 x double> %r1, double %x2, i32 2 64 %r3 = insertelement <4 x double> %r2, double 0.0, i32 3 65 ret <4 x double> %r3 66} 67 68define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { 69; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64: 70; ALL: # BB#0: 71; ALL-NEXT: andl $1, %ecx 72; ALL-NEXT: andl $1, %edx 73; ALL-NEXT: andl $1, %esi 74; ALL-NEXT: andl $1, %edi 75; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) 76; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 77; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] 78; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 79; ALL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 80; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 81; ALL-NEXT: retq 82 %x0 = extractelement <2 x double> %x, i64 %i0 83 %x1 = extractelement <2 x double> %x, i64 %i1 84 %x2 = extractelement <2 x double> %x, i64 %i2 85 %x3 = extractelement <2 x double> %x, i64 %i3 86 %r0 = insertelement <4 x double> undef, double %x0, i32 0 87 %r1 = insertelement <4 x double> %r0, double %x1, i32 1 88 %r2 = insertelement <4 x double> %r1, double %x2, i32 2 89 %r3 = insertelement <4 x double> %r2, double %x3, i32 3 90 ret <4 x double> %r3 91} 92 93define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { 94; 
AVX1-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64: 95; AVX1: # BB#0: 96; AVX1-NEXT: pushq %rbp 97; AVX1-NEXT: movq %rsp, %rbp 98; AVX1-NEXT: andq $-32, %rsp 99; AVX1-NEXT: subq $64, %rsp 100; AVX1-NEXT: andl $3, %ecx 101; AVX1-NEXT: andl $3, %edx 102; AVX1-NEXT: andl $3, %esi 103; AVX1-NEXT: andl $3, %edi 104; AVX1-NEXT: vmovaps %ymm0, (%rsp) 105; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 106; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 107; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 108; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 109; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 110; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 111; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 112; AVX1-NEXT: movq %rbp, %rsp 113; AVX1-NEXT: popq %rbp 114; AVX1-NEXT: retq 115; 116; AVX2-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64: 117; AVX2: # BB#0: 118; AVX2-NEXT: pushq %rbp 119; AVX2-NEXT: movq %rsp, %rbp 120; AVX2-NEXT: andq $-32, %rsp 121; AVX2-NEXT: subq $64, %rsp 122; AVX2-NEXT: andl $3, %ecx 123; AVX2-NEXT: andl $3, %edx 124; AVX2-NEXT: andl $3, %esi 125; AVX2-NEXT: andl $3, %edi 126; AVX2-NEXT: vmovaps %ymm0, (%rsp) 127; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 128; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 129; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 130; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 131; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 132; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 133; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 134; AVX2-NEXT: movq %rbp, %rsp 135; AVX2-NEXT: popq %rbp 136; AVX2-NEXT: retq 137 %x0 = extractelement <4 x i64> %x, i64 %i0 138 %x1 = extractelement <4 x i64> %x, i64 %i1 139 %x2 = extractelement <4 x i64> %x, i64 %i2 140 %x3 = extractelement <4 x i64> %x, i64 %i3 141 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0 142 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1 143 %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2 144 %r3 = insertelement <4 x i64> %r2, i64 %x3, 
i32 3 145 ret <4 x i64> %r3 146} 147 148define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { 149; AVX1-LABEL: var_shuffle_v4i64_v4i64_xx00_i64: 150; AVX1: # BB#0: 151; AVX1-NEXT: pushq %rbp 152; AVX1-NEXT: movq %rsp, %rbp 153; AVX1-NEXT: andq $-32, %rsp 154; AVX1-NEXT: subq $64, %rsp 155; AVX1-NEXT: andl $3, %esi 156; AVX1-NEXT: andl $3, %edi 157; AVX1-NEXT: vmovaps %ymm0, (%rsp) 158; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 159; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 160; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 161; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 162; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 163; AVX1-NEXT: movq %rbp, %rsp 164; AVX1-NEXT: popq %rbp 165; AVX1-NEXT: retq 166; 167; AVX2-LABEL: var_shuffle_v4i64_v4i64_xx00_i64: 168; AVX2: # BB#0: 169; AVX2-NEXT: pushq %rbp 170; AVX2-NEXT: movq %rsp, %rbp 171; AVX2-NEXT: andq $-32, %rsp 172; AVX2-NEXT: subq $64, %rsp 173; AVX2-NEXT: andl $3, %esi 174; AVX2-NEXT: andl $3, %edi 175; AVX2-NEXT: vmovaps %ymm0, (%rsp) 176; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 177; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 178; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 179; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 180; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 181; AVX2-NEXT: movq %rbp, %rsp 182; AVX2-NEXT: popq %rbp 183; AVX2-NEXT: retq 184 %x0 = extractelement <4 x i64> %x, i64 %i0 185 %x1 = extractelement <4 x i64> %x, i64 %i1 186 %x2 = extractelement <4 x i64> %x, i64 %i2 187 %x3 = extractelement <4 x i64> %x, i64 %i3 188 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0 189 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1 190 %r2 = insertelement <4 x i64> %r1, i64 0, i32 2 191 %r3 = insertelement <4 x i64> %r2, i64 0, i32 3 192 ret <4 x i64> %r3 193} 194 195define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { 196; AVX1-LABEL: 
var_shuffle_v4i64_v2i64_xxxx_i64: 197; AVX1: # BB#0: 198; AVX1-NEXT: andl $1, %ecx 199; AVX1-NEXT: andl $1, %edx 200; AVX1-NEXT: andl $1, %esi 201; AVX1-NEXT: andl $1, %edi 202; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) 203; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 204; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 205; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 206; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 207; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 208; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 209; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 210; AVX1-NEXT: retq 211; 212; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64: 213; AVX2: # BB#0: 214; AVX2-NEXT: andl $1, %ecx 215; AVX2-NEXT: andl $1, %edx 216; AVX2-NEXT: andl $1, %esi 217; AVX2-NEXT: andl $1, %edi 218; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) 219; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 220; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 221; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 222; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 223; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 224; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 225; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 226; AVX2-NEXT: retq 227 %x0 = extractelement <2 x i64> %x, i64 %i0 228 %x1 = extractelement <2 x i64> %x, i64 %i1 229 %x2 = extractelement <2 x i64> %x, i64 %i2 230 %x3 = extractelement <2 x i64> %x, i64 %i3 231 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0 232 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1 233 %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2 234 %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3 235 ret <4 x i64> %r3 236} 237 238define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind { 239; AVX1-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32: 240; AVX1: # BB#0: 241; AVX1-NEXT: pushq %rbp 242; AVX1-NEXT: movq %rsp, %rbp 243; AVX1-NEXT: 
andq $-32, %rsp 244; AVX1-NEXT: subq $64, %rsp 245; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> 246; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def> 247; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> 248; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> 249; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> 250; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> 251; AVX1-NEXT: andl $7, %edi 252; AVX1-NEXT: andl $7, %esi 253; AVX1-NEXT: andl $7, %edx 254; AVX1-NEXT: andl $7, %ecx 255; AVX1-NEXT: andl $7, %r8d 256; AVX1-NEXT: vmovaps %ymm0, (%rsp) 257; AVX1-NEXT: andl $7, %r9d 258; AVX1-NEXT: movl 16(%rbp), %r10d 259; AVX1-NEXT: andl $7, %r10d 260; AVX1-NEXT: movl 24(%rbp), %eax 261; AVX1-NEXT: andl $7, %eax 262; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 263; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 264; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 265; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] 266; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] 267; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] 268; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero 269; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] 270; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3] 271; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 272; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 273; AVX1-NEXT: movq %rbp, %rsp 274; AVX1-NEXT: popq %rbp 275; AVX1-NEXT: retq 276; 277; AVX2-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32: 278; AVX2: # BB#0: 279; AVX2-NEXT: vmovd %edi, %xmm1 280; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1 281; AVX2-NEXT: vmovd %esi, %xmm2 282; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm2 283; AVX2-NEXT: vmovd %edx, %xmm3 284; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm3 285; AVX2-NEXT: vmovd %ecx, %xmm4 286; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm4 287; AVX2-NEXT: vmovd %r8d, %xmm5 288; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm5 289; AVX2-NEXT: vmovd %r9d, %xmm6 
290; AVX2-NEXT: vpermps %ymm0, %ymm6, %ymm6 291; AVX2-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero 292; AVX2-NEXT: vpermps %ymm0, %ymm7, %ymm7 293; AVX2-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero 294; AVX2-NEXT: vpermps %ymm0, %ymm8, %ymm0 295; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] 296; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3] 297; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0] 298; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] 299; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] 300; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] 301; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 302; AVX2-NEXT: retq 303 %x0 = extractelement <8 x float> %x, i32 %i0 304 %x1 = extractelement <8 x float> %x, i32 %i1 305 %x2 = extractelement <8 x float> %x, i32 %i2 306 %x3 = extractelement <8 x float> %x, i32 %i3 307 %x4 = extractelement <8 x float> %x, i32 %i4 308 %x5 = extractelement <8 x float> %x, i32 %i5 309 %x6 = extractelement <8 x float> %x, i32 %i6 310 %x7 = extractelement <8 x float> %x, i32 %i7 311 %r0 = insertelement <8 x float> undef, float %x0, i32 0 312 %r1 = insertelement <8 x float> %r0, float %x1, i32 1 313 %r2 = insertelement <8 x float> %r1, float %x2, i32 2 314 %r3 = insertelement <8 x float> %r2, float %x3, i32 3 315 %r4 = insertelement <8 x float> %r3, float %x4, i32 4 316 %r5 = insertelement <8 x float> %r4, float %x5, i32 5 317 %r6 = insertelement <8 x float> %r5, float %x6, i32 6 318 %r7 = insertelement <8 x float> %r6, float %x7, i32 7 319 ret <8 x float> %r7 320} 321 322define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind { 323; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32: 324; ALL: # BB#0: 325; ALL-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> 326; ALL-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def> 327; ALL-NEXT: # kill: %ECX<def> %ECX<kill> 
%RCX<def> 328; ALL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> 329; ALL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> 330; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> 331; ALL-NEXT: andl $3, %edi 332; ALL-NEXT: andl $3, %esi 333; ALL-NEXT: andl $3, %edx 334; ALL-NEXT: andl $3, %ecx 335; ALL-NEXT: andl $3, %r8d 336; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) 337; ALL-NEXT: andl $3, %r9d 338; ALL-NEXT: movl {{[0-9]+}}(%rsp), %r10d 339; ALL-NEXT: andl $3, %r10d 340; ALL-NEXT: movl {{[0-9]+}}(%rsp), %eax 341; ALL-NEXT: andl $3, %eax 342; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 343; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 344; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 345; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] 346; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] 347; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] 348; ALL-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero 349; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] 350; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3] 351; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 352; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 353; ALL-NEXT: retq 354 %x0 = extractelement <4 x float> %x, i32 %i0 355 %x1 = extractelement <4 x float> %x, i32 %i1 356 %x2 = extractelement <4 x float> %x, i32 %i2 357 %x3 = extractelement <4 x float> %x, i32 %i3 358 %x4 = extractelement <4 x float> %x, i32 %i4 359 %x5 = extractelement <4 x float> %x, i32 %i5 360 %x6 = extractelement <4 x float> %x, i32 %i6 361 %x7 = extractelement <4 x float> %x, i32 %i7 362 %r0 = insertelement <8 x float> undef, float %x0, i32 0 363 %r1 = insertelement <8 x float> %r0, float %x1, i32 1 364 %r2 = insertelement <8 x float> %r1, float %x2, i32 2 365 %r3 = insertelement <8 x float> %r2, float %x3, i32 3 366 %r4 = insertelement <8 x float> %r3, float %x4, i32 4 367 %r5 = insertelement <8 x float> %r4, float %x5, i32 5 368 %r6 = 
insertelement <8 x float> %r5, float %x6, i32 6 369 %r7 = insertelement <8 x float> %r6, float %x7, i32 7 370 ret <8 x float> %r7 371} 372 373define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind { 374; AVX1-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16: 375; AVX1: # BB#0: 376; AVX1-NEXT: pushq %rbp 377; AVX1-NEXT: movq %rsp, %rbp 378; AVX1-NEXT: andq $-32, %rsp 379; AVX1-NEXT: subq $64, %rsp 380; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> 381; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def> 382; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> 383; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> 384; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> 385; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> 386; AVX1-NEXT: vmovaps %ymm0, (%rsp) 387; AVX1-NEXT: movl 32(%rbp), %eax 388; AVX1-NEXT: andl $15, %eax 389; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax 390; AVX1-NEXT: vmovd %eax, %xmm0 391; AVX1-NEXT: movl 40(%rbp), %eax 392; AVX1-NEXT: andl $15, %eax 393; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax 394; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 395; AVX1-NEXT: movl 48(%rbp), %eax 396; AVX1-NEXT: andl $15, %eax 397; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax 398; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 399; AVX1-NEXT: movl 56(%rbp), %eax 400; AVX1-NEXT: andl $15, %eax 401; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax 402; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 403; AVX1-NEXT: movl 64(%rbp), %eax 404; AVX1-NEXT: andl $15, %eax 405; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax 406; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 407; AVX1-NEXT: movl 72(%rbp), %eax 408; AVX1-NEXT: andl $15, %eax 409; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax 410; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 411; AVX1-NEXT: movl 80(%rbp), %eax 412; AVX1-NEXT: andl $15, %eax 413; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax 414; 
AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 415; AVX1-NEXT: movl 88(%rbp), %eax 416; AVX1-NEXT: andl $15, %eax 417; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax 418; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 419; AVX1-NEXT: andl $15, %edi 420; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax 421; AVX1-NEXT: vmovd %eax, %xmm1 422; AVX1-NEXT: andl $15, %esi 423; AVX1-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1 424; AVX1-NEXT: andl $15, %edx 425; AVX1-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1 426; AVX1-NEXT: andl $15, %ecx 427; AVX1-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1 428; AVX1-NEXT: andl $15, %r8d 429; AVX1-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1 430; AVX1-NEXT: andl $15, %r9d 431; AVX1-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1 432; AVX1-NEXT: movl 16(%rbp), %eax 433; AVX1-NEXT: andl $15, %eax 434; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax 435; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 436; AVX1-NEXT: movl 24(%rbp), %eax 437; AVX1-NEXT: andl $15, %eax 438; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax 439; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 440; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 441; AVX1-NEXT: movq %rbp, %rsp 442; AVX1-NEXT: popq %rbp 443; AVX1-NEXT: retq 444; 445; AVX2-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16: 446; AVX2: # BB#0: 447; AVX2-NEXT: pushq %rbp 448; AVX2-NEXT: movq %rsp, %rbp 449; AVX2-NEXT: andq $-32, %rsp 450; AVX2-NEXT: subq $64, %rsp 451; AVX2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> 452; AVX2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def> 453; AVX2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> 454; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> 455; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> 456; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> 457; AVX2-NEXT: vmovaps %ymm0, (%rsp) 458; AVX2-NEXT: movl 32(%rbp), %eax 459; AVX2-NEXT: andl $15, %eax 460; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax 461; AVX2-NEXT: vmovd %eax, %xmm0 462; AVX2-NEXT: movl 40(%rbp), %eax 463; AVX2-NEXT: andl $15, %eax 464; AVX2-NEXT: movzwl 
(%rsp,%rax,2), %eax 465; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 466; AVX2-NEXT: movl 48(%rbp), %eax 467; AVX2-NEXT: andl $15, %eax 468; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax 469; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 470; AVX2-NEXT: movl 56(%rbp), %eax 471; AVX2-NEXT: andl $15, %eax 472; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax 473; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 474; AVX2-NEXT: movl 64(%rbp), %eax 475; AVX2-NEXT: andl $15, %eax 476; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax 477; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 478; AVX2-NEXT: movl 72(%rbp), %eax 479; AVX2-NEXT: andl $15, %eax 480; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax 481; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 482; AVX2-NEXT: movl 80(%rbp), %eax 483; AVX2-NEXT: andl $15, %eax 484; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax 485; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 486; AVX2-NEXT: movl 88(%rbp), %eax 487; AVX2-NEXT: andl $15, %eax 488; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax 489; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 490; AVX2-NEXT: andl $15, %edi 491; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax 492; AVX2-NEXT: vmovd %eax, %xmm1 493; AVX2-NEXT: andl $15, %esi 494; AVX2-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1 495; AVX2-NEXT: andl $15, %edx 496; AVX2-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1 497; AVX2-NEXT: andl $15, %ecx 498; AVX2-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1 499; AVX2-NEXT: andl $15, %r8d 500; AVX2-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1 501; AVX2-NEXT: andl $15, %r9d 502; AVX2-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1 503; AVX2-NEXT: movl 16(%rbp), %eax 504; AVX2-NEXT: andl $15, %eax 505; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax 506; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 507; AVX2-NEXT: movl 24(%rbp), %eax 508; AVX2-NEXT: andl $15, %eax 509; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax 510; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 511; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 512; AVX2-NEXT: movq %rbp, %rsp 513; AVX2-NEXT: popq %rbp 514; AVX2-NEXT: retq 515 
%x0 = extractelement <16 x i16> %x, i32 %i0 516 %x1 = extractelement <16 x i16> %x, i32 %i1 517 %x2 = extractelement <16 x i16> %x, i32 %i2 518 %x3 = extractelement <16 x i16> %x, i32 %i3 519 %x4 = extractelement <16 x i16> %x, i32 %i4 520 %x5 = extractelement <16 x i16> %x, i32 %i5 521 %x6 = extractelement <16 x i16> %x, i32 %i6 522 %x7 = extractelement <16 x i16> %x, i32 %i7 523 %x8 = extractelement <16 x i16> %x, i32 %i8 524 %x9 = extractelement <16 x i16> %x, i32 %i9 525 %x10 = extractelement <16 x i16> %x, i32 %i10 526 %x11 = extractelement <16 x i16> %x, i32 %i11 527 %x12 = extractelement <16 x i16> %x, i32 %i12 528 %x13 = extractelement <16 x i16> %x, i32 %i13 529 %x14 = extractelement <16 x i16> %x, i32 %i14 530 %x15 = extractelement <16 x i16> %x, i32 %i15 531 %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0 532 %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1 533 %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2 534 %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3 535 %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4 536 %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5 537 %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6 538 %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7 539 %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8 540 %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9 541 %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10 542 %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11 543 %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12 544 %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13 545 %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14 546 %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15 547 ret <16 x i16> %r15 548} 549 550define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind { 551; AVX1-LABEL: 
var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16: 552; AVX1: # BB#0: 553; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> 554; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def> 555; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> 556; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> 557; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> 558; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> 559; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) 560; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax 561; AVX1-NEXT: andl $7, %eax 562; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax 563; AVX1-NEXT: vmovd %eax, %xmm0 564; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax 565; AVX1-NEXT: andl $7, %eax 566; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax 567; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 568; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax 569; AVX1-NEXT: andl $7, %eax 570; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax 571; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 572; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax 573; AVX1-NEXT: andl $7, %eax 574; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax 575; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 576; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax 577; AVX1-NEXT: andl $7, %eax 578; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax 579; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 580; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax 581; AVX1-NEXT: andl $7, %eax 582; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax 583; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 584; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax 585; AVX1-NEXT: andl $7, %eax 586; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax 587; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 588; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax 589; AVX1-NEXT: andl $7, %eax 590; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax 591; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 592; AVX1-NEXT: andl $7, %edi 593; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax 594; AVX1-NEXT: vmovd %eax, %xmm1 595; AVX1-NEXT: andl $7, %esi 596; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1 597; AVX1-NEXT: andl $7, %edx 598; 
AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1 599; AVX1-NEXT: andl $7, %ecx 600; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1 601; AVX1-NEXT: andl $7, %r8d 602; AVX1-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1 603; AVX1-NEXT: andl $7, %r9d 604; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1 605; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax 606; AVX1-NEXT: andl $7, %eax 607; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax 608; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 609; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax 610; AVX1-NEXT: andl $7, %eax 611; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax 612; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 613; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 614; AVX1-NEXT: retq 615; 616; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16: 617; AVX2: # BB#0: 618; AVX2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> 619; AVX2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def> 620; AVX2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> 621; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> 622; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> 623; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> 624; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) 625; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax 626; AVX2-NEXT: andl $7, %eax 627; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax 628; AVX2-NEXT: vmovd %eax, %xmm0 629; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax 630; AVX2-NEXT: andl $7, %eax 631; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax 632; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 633; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax 634; AVX2-NEXT: andl $7, %eax 635; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax 636; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 637; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax 638; AVX2-NEXT: andl $7, %eax 639; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax 640; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 641; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax 642; AVX2-NEXT: andl $7, %eax 643; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax 644; AVX2-NEXT: vpinsrw $4, %eax, 
%xmm0, %xmm0 645; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax 646; AVX2-NEXT: andl $7, %eax 647; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax 648; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 649; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax 650; AVX2-NEXT: andl $7, %eax 651; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax 652; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 653; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax 654; AVX2-NEXT: andl $7, %eax 655; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax 656; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 657; AVX2-NEXT: andl $7, %edi 658; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax 659; AVX2-NEXT: vmovd %eax, %xmm1 660; AVX2-NEXT: andl $7, %esi 661; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1 662; AVX2-NEXT: andl $7, %edx 663; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1 664; AVX2-NEXT: andl $7, %ecx 665; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1 666; AVX2-NEXT: andl $7, %r8d 667; AVX2-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1 668; AVX2-NEXT: andl $7, %r9d 669; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1 670; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax 671; AVX2-NEXT: andl $7, %eax 672; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax 673; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 674; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax 675; AVX2-NEXT: andl $7, %eax 676; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax 677; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 678; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 679; AVX2-NEXT: retq 680 %x0 = extractelement <8 x i16> %x, i32 %i0 681 %x1 = extractelement <8 x i16> %x, i32 %i1 682 %x2 = extractelement <8 x i16> %x, i32 %i2 683 %x3 = extractelement <8 x i16> %x, i32 %i3 684 %x4 = extractelement <8 x i16> %x, i32 %i4 685 %x5 = extractelement <8 x i16> %x, i32 %i5 686 %x6 = extractelement <8 x i16> %x, i32 %i6 687 %x7 = extractelement <8 x i16> %x, i32 %i7 688 %x8 = extractelement <8 x i16> %x, i32 %i8 689 %x9 = extractelement <8 x i16> %x, i32 %i9 690 %x10 = extractelement <8 x i16> %x, i32 %i10 691 
%x11 = extractelement <8 x i16> %x, i32 %i11 692 %x12 = extractelement <8 x i16> %x, i32 %i12 693 %x13 = extractelement <8 x i16> %x, i32 %i13 694 %x14 = extractelement <8 x i16> %x, i32 %i14 695 %x15 = extractelement <8 x i16> %x, i32 %i15 696 %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0 697 %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1 698 %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2 699 %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3 700 %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4 701 %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5 702 %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6 703 %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7 704 %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8 705 %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9 706 %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10 707 %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11 708 %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12 709 %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13 710 %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14 711 %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15 712 ret <16 x i16> %r15 713} 714 715; 716; Unary shuffle indices from memory 717; 718 719define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwind { 720; AVX1-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64: 721; AVX1: # BB#0: 722; AVX1-NEXT: pushq %rbp 723; AVX1-NEXT: movq %rsp, %rbp 724; AVX1-NEXT: andq $-32, %rsp 725; AVX1-NEXT: subq $64, %rsp 726; AVX1-NEXT: movq (%rdi), %rax 727; AVX1-NEXT: movq 8(%rdi), %rcx 728; AVX1-NEXT: andl $3, %eax 729; AVX1-NEXT: andl $3, %ecx 730; AVX1-NEXT: movq 16(%rdi), %rdx 731; AVX1-NEXT: andl $3, %edx 732; AVX1-NEXT: movq 24(%rdi), %rsi 733; AVX1-NEXT: andl $3, %esi 734; AVX1-NEXT: vmovaps %ymm0, (%rsp) 735; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 736; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 737; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 738; AVX1-NEXT: 
vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 739; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 740; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 741; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 742; AVX1-NEXT: movq %rbp, %rsp 743; AVX1-NEXT: popq %rbp 744; AVX1-NEXT: retq 745; 746; AVX2-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64: 747; AVX2: # BB#0: 748; AVX2-NEXT: pushq %rbp 749; AVX2-NEXT: movq %rsp, %rbp 750; AVX2-NEXT: andq $-32, %rsp 751; AVX2-NEXT: subq $64, %rsp 752; AVX2-NEXT: movq (%rdi), %rax 753; AVX2-NEXT: movq 8(%rdi), %rcx 754; AVX2-NEXT: andl $3, %eax 755; AVX2-NEXT: andl $3, %ecx 756; AVX2-NEXT: movq 16(%rdi), %rdx 757; AVX2-NEXT: andl $3, %edx 758; AVX2-NEXT: movq 24(%rdi), %rsi 759; AVX2-NEXT: andl $3, %esi 760; AVX2-NEXT: vmovaps %ymm0, (%rsp) 761; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 762; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 763; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 764; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 765; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 766; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 767; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 768; AVX2-NEXT: movq %rbp, %rsp 769; AVX2-NEXT: popq %rbp 770; AVX2-NEXT: retq 771 %p0 = getelementptr inbounds i64, i64* %i, i32 0 772 %p1 = getelementptr inbounds i64, i64* %i, i32 1 773 %p2 = getelementptr inbounds i64, i64* %i, i32 2 774 %p3 = getelementptr inbounds i64, i64* %i, i32 3 775 %i0 = load i64, i64* %p0, align 4 776 %i1 = load i64, i64* %p1, align 4 777 %i2 = load i64, i64* %p2, align 4 778 %i3 = load i64, i64* %p3, align 4 779 %x0 = extractelement <4 x i64> %x, i64 %i0 780 %x1 = extractelement <4 x i64> %x, i64 %i1 781 %x2 = extractelement <4 x i64> %x, i64 %i2 782 %x3 = extractelement <4 x i64> %x, i64 %i3 783 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0 784 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1 785 %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2 786 %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 
3 787 ret <4 x i64> %r3 788} 789 790define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwind { 791; AVX1-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64: 792; AVX1: # BB#0: 793; AVX1-NEXT: movq (%rdi), %rax 794; AVX1-NEXT: movq 8(%rdi), %rcx 795; AVX1-NEXT: andl $1, %eax 796; AVX1-NEXT: andl $1, %ecx 797; AVX1-NEXT: movq 16(%rdi), %rdx 798; AVX1-NEXT: andl $1, %edx 799; AVX1-NEXT: movq 24(%rdi), %rsi 800; AVX1-NEXT: andl $1, %esi 801; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) 802; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 803; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 804; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 805; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 806; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 807; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 808; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 809; AVX1-NEXT: retq 810; 811; AVX2-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64: 812; AVX2: # BB#0: 813; AVX2-NEXT: movq (%rdi), %rax 814; AVX2-NEXT: movq 8(%rdi), %rcx 815; AVX2-NEXT: andl $1, %eax 816; AVX2-NEXT: andl $1, %ecx 817; AVX2-NEXT: movq 16(%rdi), %rdx 818; AVX2-NEXT: andl $1, %edx 819; AVX2-NEXT: movq 24(%rdi), %rsi 820; AVX2-NEXT: andl $1, %esi 821; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) 822; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 823; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 824; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 825; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 826; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 827; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 828; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 829; AVX2-NEXT: retq 830 %p0 = getelementptr inbounds i64, i64* %i, i32 0 831 %p1 = getelementptr inbounds i64, i64* %i, i32 1 832 %p2 = getelementptr inbounds i64, i64* %i, i32 2 833 %p3 = getelementptr inbounds i64, i64* %i, i32 3 834 %i0 = load i64, i64* %p0, align 4 835 %i1 = load i64, i64* %p1, align 4 836 %i2 = load i64, i64* %p2, align 4 
; Summary of the enclosing test function mem_shuffle_v4i64_v2i64_xxxx_i64:
; the four shuffle indices %i0..%i3 are loaded from consecutive i64 slots
; 0..3 of the pointer argument %i (each load is marked align 4). Every index
; then selects a lane of the <2 x i64> argument %x through a variable-index
; extractelement, and the four selected values are inserted into lanes 0..3
; of the <4 x i64> return value, starting from undef.
; The expected AVX1 and AVX2 sequences are identical except for the final
; 128-bit insert (vinsertf128 versus vinserti128): both mask each loaded
; index with 1, store %xmm0 to a slot below %rsp, and load four quadwords
; with vmovq (presumably from that stack slot; the address is wildcarded in
; the checks) before combining them with vpunpcklqdq.
; Review note: the align 4 on the i64 loads is below the natural 8-byte
; alignment; presumably intentional for this test, confirm before changing.
837 %i3 = load i64, i64* %p3, align 4 838 %x0 = extractelement <2 x i64> %x, i64 %i0 839 %x1 = extractelement <2 x i64> %x, i64 %i1 840 %x2 = extractelement <2 x i64> %x, i64 %i2 841 %x3 = extractelement <2 x i64> %x, i64 %i3 842 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0 843 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1 844 %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2 845 %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3 846 ret <4 x i64> %r3 847} 848