; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2

; PR21281
define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d, <8 x i16> %e, <8 x i16> %f, <8 x i16> %h, <8 x i16> %g) {
; SSE-LABEL: interleave8x8:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movdqa %xmm0, %xmm8
; SSE-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    movdqa %xmm8, %xmm2
; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
; SSE-NEXT:    movdqa %xmm4, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SSE-NEXT:    movdqa %xmm7, %xmm5
; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE-NEXT:    movdqa %xmm4, %xmm6
; SSE-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
; SSE-NEXT:    movdqa %xmm1, %xmm7
; SSE-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE-NEXT:    movdqa %xmm8, %xmm5
; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
; SSE-NEXT:    movdqa %xmm0, %xmm7
; SSE-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
; SSE-NEXT:    movdqa %xmm3, 112(%rdi)
; SSE-NEXT:    movdqa %xmm4, 96(%rdi)
; SSE-NEXT:    movdqa %xmm0, 80(%rdi)
; SSE-NEXT:    movdqa %xmm7, 64(%rdi)
; SSE-NEXT:    movdqa %xmm2, 48(%rdi)
; SSE-NEXT:    movdqa %xmm1, 32(%rdi)
; SSE-NEXT:    movdqa %xmm8, 16(%rdi)
; SSE-NEXT:    movdqa %xmm5, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: interleave8x8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: interleave8x8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
; AVX2-NEXT:    retq
  %ab = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %cd = shufflevector <8 x i16> %c, <8 x i16> %d, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %ab32 = bitcast <16 x i16> %ab to <8 x i32>
  %cd32 = bitcast <16 x i16> %cd to <8 x i32>
  %abcd32 = shufflevector <8 x i32> %ab32, <8 x i32> %cd32, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %abcd = bitcast <16 x i32> %abcd32 to <32 x i16>

  %ef = shufflevector <8 x i16> %e, <8 x i16> %f, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %gh = shufflevector <8 x i16> %g, <8 x i16> %h, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %ef32 = bitcast <16 x i16> %ef to <8 x i32>
  %gh32 = bitcast <16 x i16> %gh to <8 x i32>
  %efgh32 = shufflevector <8 x i32> %ef32, <8 x i32> %gh32, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %efgh = bitcast <16 x i32> %efgh32 to <32 x i16>

  %result = shufflevector <32 x i16> %abcd, <32 x i16> %efgh, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  ret <64 x i16> %result
}

define void @splat2_i8(ptr %s, ptr %d) {
; SSE-LABEL: splat2_i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT:    movdqu %xmm1, 48(%rsi)
; SSE-NEXT:    movdqu %xmm3, 32(%rsi)
; SSE-NEXT:    movdqu %xmm0, 16(%rsi)
; SSE-NEXT:    movdqu %xmm2, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: splat2_i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vmovdqu %xmm1, 48(%rsi)
; AVX1-NEXT:    vmovdqu %xmm3, 32(%rsi)
; AVX1-NEXT:    vmovdqu %xmm0, 16(%rsi)
; AVX1-NEXT:    vmovdqu %xmm2, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splat2_i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermq $216, (%rdi), %ymm0 # ymm0 = mem[0,2,1,3]
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rsi)
; AVX2-NEXT:    vmovdqu %ymm1, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %ld32 = load <32 x i8>, ptr %s, align 1
  %cat = shufflevector <32 x i8> %ld32, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %cat2 = shufflevector <64 x i8> %cat, <64 x i8> undef, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  store <64 x i8> %cat2, ptr %d, align 1
  ret void
}

define void @splat2_i16(ptr %s, ptr %d) {
; SSE-LABEL: splat2_i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE-NEXT:    movdqu %xmm1, 48(%rsi)
; SSE-NEXT:    movdqu %xmm3, 32(%rsi)
; SSE-NEXT:    movdqu %xmm0, 16(%rsi)
; SSE-NEXT:    movdqu %xmm2, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: splat2_i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3]
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3]
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vmovdqu %xmm1, 48(%rsi)
; AVX1-NEXT:    vmovdqu %xmm3, 32(%rsi)
; AVX1-NEXT:    vmovdqu %xmm0, 16(%rsi)
; AVX1-NEXT:    vmovdqu %xmm2, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splat2_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermq $216, (%rdi), %ymm0 # ymm0 = mem[0,2,1,3]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rsi)
; AVX2-NEXT:    vmovdqu %ymm1, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %ld32 = load <16 x i16>, ptr %s, align 1
  %cat = shufflevector <16 x i16> %ld32, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %cat2 = shufflevector <32 x i16> %cat, <32 x i16> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  store <32 x i16> %cat2, ptr %d, align 1
  ret void
}

define void @splat2_i32(ptr %s, ptr %d) {
; SSE-LABEL: splat2_i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE-NEXT:    movdqu %xmm1, 48(%rsi)
; SSE-NEXT:    movdqu %xmm3, 32(%rsi)
; SSE-NEXT:    movdqu %xmm0, 16(%rsi)
; SSE-NEXT:    movdqu %xmm2, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: splat2_i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %xmm0
; AVX1-NEXT:    vmovups 16(%rdi), %xmm1
; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,0,1,1]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[0,0,1,1]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX1-NEXT:    vmovups %xmm1, 48(%rsi)
; AVX1-NEXT:    vmovups %xmm3, 32(%rsi)
; AVX1-NEXT:    vmovups %xmm0, 16(%rsi)
; AVX1-NEXT:    vmovups %xmm2, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splat2_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd $216, (%rdi), %ymm0 # ymm0 = mem[0,2,1,3]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
; AVX2-NEXT:    vmovups %ymm0, 32(%rsi)
; AVX2-NEXT:    vmovups %ymm1, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %ld32 = load <8 x i32>, ptr %s, align 1
  %cat = shufflevector <8 x i32> %ld32, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cat2 = shufflevector <16 x i32> %cat, <16 x i32> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i32> %cat2, ptr %d, align 1
  ret void
}

define void @splat2_i64(ptr %s, ptr %d) {
; SSE-LABEL: splat2_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    movdqu %xmm1, 48(%rsi)
; SSE-NEXT:    movdqu %xmm3, 32(%rsi)
; SSE-NEXT:    movdqu %xmm0, 16(%rsi)
; SSE-NEXT:    movdqu %xmm2, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: splat2_i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
; AVX1-NEXT:    vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
; AVX1-NEXT:    vmovupd %ymm0, 32(%rsi)
; AVX1-NEXT:    vmovupd %ymm1, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splat2_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
; AVX2-NEXT:    vmovups %ymm0, 32(%rsi)
; AVX2-NEXT:    vmovups %ymm1, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %ld32 = load <4 x i64>, ptr %s, align 1
  %cat = shufflevector <4 x i64> %ld32, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %cat2 = shufflevector <8 x i64> %cat, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i64> %cat2, ptr %d, align 1
  ret void
}