1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512F
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512BW
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
10
11define i8 @test_bitreverse_i8(i8 %a) nounwind {
12; SSE-LABEL: test_bitreverse_i8:
13; SSE:       # BB#0:
14; SSE-NEXT:    rolb $4, %dil
15; SSE-NEXT:    movl %edi, %eax
16; SSE-NEXT:    andb $51, %al
17; SSE-NEXT:    shlb $2, %al
18; SSE-NEXT:    andb $-52, %dil
19; SSE-NEXT:    shrb $2, %dil
20; SSE-NEXT:    orb %al, %dil
21; SSE-NEXT:    movl %edi, %eax
22; SSE-NEXT:    andb $85, %al
23; SSE-NEXT:    addb %al, %al
24; SSE-NEXT:    andb $-86, %dil
25; SSE-NEXT:    shrb %dil
26; SSE-NEXT:    orb %al, %dil
27; SSE-NEXT:    movl %edi, %eax
28; SSE-NEXT:    retq
29;
30; AVX-LABEL: test_bitreverse_i8:
31; AVX:       # BB#0:
32; AVX-NEXT:    rolb $4, %dil
33; AVX-NEXT:    movl %edi, %eax
34; AVX-NEXT:    andb $51, %al
35; AVX-NEXT:    shlb $2, %al
36; AVX-NEXT:    andb $-52, %dil
37; AVX-NEXT:    shrb $2, %dil
38; AVX-NEXT:    orb %al, %dil
39; AVX-NEXT:    movl %edi, %eax
40; AVX-NEXT:    andb $85, %al
41; AVX-NEXT:    addb %al, %al
42; AVX-NEXT:    andb $-86, %dil
43; AVX-NEXT:    shrb %dil
44; AVX-NEXT:    orb %al, %dil
45; AVX-NEXT:    movl %edi, %eax
46; AVX-NEXT:    retq
47;
48; XOP-LABEL: test_bitreverse_i8:
49; XOP:       # BB#0:
50; XOP-NEXT:    vmovd %edi, %xmm0
51; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
52; XOP-NEXT:    vpextrb $0, %xmm0, %eax
53; XOP-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
54; XOP-NEXT:    retq
55  %b = call i8 @llvm.bitreverse.i8(i8 %a)
56  ret i8 %b
57}
58
59define i16 @test_bitreverse_i16(i16 %a) nounwind {
60; SSE-LABEL: test_bitreverse_i16:
61; SSE:       # BB#0:
62; SSE-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
63; SSE-NEXT:    rolw $8, %di
64; SSE-NEXT:    movl %edi, %eax
65; SSE-NEXT:    andl $3855, %eax # imm = 0xF0F
66; SSE-NEXT:    shll $4, %eax
67; SSE-NEXT:    andl $61680, %edi # imm = 0xF0F0
68; SSE-NEXT:    shrl $4, %edi
69; SSE-NEXT:    orl %eax, %edi
70; SSE-NEXT:    movl %edi, %eax
71; SSE-NEXT:    andl $13107, %eax # imm = 0x3333
72; SSE-NEXT:    andl $52428, %edi # imm = 0xCCCC
73; SSE-NEXT:    shrl $2, %edi
74; SSE-NEXT:    leal (%rdi,%rax,4), %eax
75; SSE-NEXT:    movl %eax, %ecx
76; SSE-NEXT:    andl $21845, %ecx # imm = 0x5555
77; SSE-NEXT:    andl $43690, %eax # imm = 0xAAAA
78; SSE-NEXT:    shrl %eax
79; SSE-NEXT:    leal (%rax,%rcx,2), %eax
80; SSE-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
81; SSE-NEXT:    retq
82;
83; AVX-LABEL: test_bitreverse_i16:
84; AVX:       # BB#0:
85; AVX-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
86; AVX-NEXT:    rolw $8, %di
87; AVX-NEXT:    movl %edi, %eax
88; AVX-NEXT:    andl $3855, %eax # imm = 0xF0F
89; AVX-NEXT:    shll $4, %eax
90; AVX-NEXT:    andl $61680, %edi # imm = 0xF0F0
91; AVX-NEXT:    shrl $4, %edi
92; AVX-NEXT:    orl %eax, %edi
93; AVX-NEXT:    movl %edi, %eax
94; AVX-NEXT:    andl $13107, %eax # imm = 0x3333
95; AVX-NEXT:    andl $52428, %edi # imm = 0xCCCC
96; AVX-NEXT:    shrl $2, %edi
97; AVX-NEXT:    leal (%rdi,%rax,4), %eax
98; AVX-NEXT:    movl %eax, %ecx
99; AVX-NEXT:    andl $21845, %ecx # imm = 0x5555
100; AVX-NEXT:    andl $43690, %eax # imm = 0xAAAA
101; AVX-NEXT:    shrl %eax
102; AVX-NEXT:    leal (%rax,%rcx,2), %eax
103; AVX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
104; AVX-NEXT:    retq
105;
106; XOP-LABEL: test_bitreverse_i16:
107; XOP:       # BB#0:
108; XOP-NEXT:    vmovd %edi, %xmm0
109; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
110; XOP-NEXT:    vmovd %xmm0, %eax
111; XOP-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
112; XOP-NEXT:    retq
113  %b = call i16 @llvm.bitreverse.i16(i16 %a)
114  ret i16 %b
115}
116
117define i32 @test_bitreverse_i32(i32 %a) nounwind {
118; SSE-LABEL: test_bitreverse_i32:
119; SSE:       # BB#0:
120; SSE-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
121; SSE-NEXT:    bswapl %edi
122; SSE-NEXT:    movl %edi, %eax
123; SSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
124; SSE-NEXT:    shll $4, %eax
125; SSE-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
126; SSE-NEXT:    shrl $4, %edi
127; SSE-NEXT:    orl %eax, %edi
128; SSE-NEXT:    movl %edi, %eax
129; SSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
130; SSE-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
131; SSE-NEXT:    shrl $2, %edi
132; SSE-NEXT:    leal (%rdi,%rax,4), %eax
133; SSE-NEXT:    movl %eax, %ecx
134; SSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
135; SSE-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
136; SSE-NEXT:    shrl %eax
137; SSE-NEXT:    leal (%rax,%rcx,2), %eax
138; SSE-NEXT:    retq
139;
140; AVX-LABEL: test_bitreverse_i32:
141; AVX:       # BB#0:
142; AVX-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
143; AVX-NEXT:    bswapl %edi
144; AVX-NEXT:    movl %edi, %eax
145; AVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
146; AVX-NEXT:    shll $4, %eax
147; AVX-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
148; AVX-NEXT:    shrl $4, %edi
149; AVX-NEXT:    orl %eax, %edi
150; AVX-NEXT:    movl %edi, %eax
151; AVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
152; AVX-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
153; AVX-NEXT:    shrl $2, %edi
154; AVX-NEXT:    leal (%rdi,%rax,4), %eax
155; AVX-NEXT:    movl %eax, %ecx
156; AVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
157; AVX-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
158; AVX-NEXT:    shrl %eax
159; AVX-NEXT:    leal (%rax,%rcx,2), %eax
160; AVX-NEXT:    retq
161;
162; XOP-LABEL: test_bitreverse_i32:
163; XOP:       # BB#0:
164; XOP-NEXT:    vmovd %edi, %xmm0
165; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
166; XOP-NEXT:    vmovd %xmm0, %eax
167; XOP-NEXT:    retq
168  %b = call i32 @llvm.bitreverse.i32(i32 %a)
169  ret i32 %b
170}
171
172define i64 @test_bitreverse_i64(i64 %a) nounwind {
173; SSE-LABEL: test_bitreverse_i64:
174; SSE:       # BB#0:
175; SSE-NEXT:    bswapq %rdi
176; SSE-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
177; SSE-NEXT:    andq %rdi, %rax
178; SSE-NEXT:    shlq $4, %rax
179; SSE-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
180; SSE-NEXT:    andq %rdi, %rcx
181; SSE-NEXT:    shrq $4, %rcx
182; SSE-NEXT:    orq %rax, %rcx
183; SSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
184; SSE-NEXT:    andq %rcx, %rax
185; SSE-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
186; SSE-NEXT:    andq %rcx, %rdx
187; SSE-NEXT:    shrq $2, %rdx
188; SSE-NEXT:    leaq (%rdx,%rax,4), %rax
189; SSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
190; SSE-NEXT:    andq %rax, %rcx
191; SSE-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
192; SSE-NEXT:    andq %rax, %rdx
193; SSE-NEXT:    shrq %rdx
194; SSE-NEXT:    leaq (%rdx,%rcx,2), %rax
195; SSE-NEXT:    retq
196;
197; AVX-LABEL: test_bitreverse_i64:
198; AVX:       # BB#0:
199; AVX-NEXT:    bswapq %rdi
200; AVX-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
201; AVX-NEXT:    andq %rdi, %rax
202; AVX-NEXT:    shlq $4, %rax
203; AVX-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
204; AVX-NEXT:    andq %rdi, %rcx
205; AVX-NEXT:    shrq $4, %rcx
206; AVX-NEXT:    orq %rax, %rcx
207; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
208; AVX-NEXT:    andq %rcx, %rax
209; AVX-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
210; AVX-NEXT:    andq %rcx, %rdx
211; AVX-NEXT:    shrq $2, %rdx
212; AVX-NEXT:    leaq (%rdx,%rax,4), %rax
213; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
214; AVX-NEXT:    andq %rax, %rcx
215; AVX-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
216; AVX-NEXT:    andq %rax, %rdx
217; AVX-NEXT:    shrq %rdx
218; AVX-NEXT:    leaq (%rdx,%rcx,2), %rax
219; AVX-NEXT:    retq
220;
221; XOP-LABEL: test_bitreverse_i64:
222; XOP:       # BB#0:
223; XOP-NEXT:    vmovq %rdi, %xmm0
224; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
225; XOP-NEXT:    vmovq %xmm0, %rax
226; XOP-NEXT:    retq
227  %b = call i64 @llvm.bitreverse.i64(i64 %a)
228  ret i64 %b
229}
230
231define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
232; SSE2-LABEL: test_bitreverse_v16i8:
233; SSE2:       # BB#0:
234; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
235; SSE2-NEXT:    movdqa %xmm0, %xmm2
236; SSE2-NEXT:    pand %xmm1, %xmm2
237; SSE2-NEXT:    psllw $4, %xmm2
238; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
239; SSE2-NEXT:    pand %xmm3, %xmm2
240; SSE2-NEXT:    pand %xmm3, %xmm0
241; SSE2-NEXT:    psrlw $4, %xmm0
242; SSE2-NEXT:    pand %xmm1, %xmm0
243; SSE2-NEXT:    por %xmm2, %xmm0
244; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
245; SSE2-NEXT:    pand %xmm0, %xmm1
246; SSE2-NEXT:    psllw $2, %xmm1
247; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
248; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
249; SSE2-NEXT:    psrlw $2, %xmm0
250; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
251; SSE2-NEXT:    por %xmm1, %xmm0
252; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
253; SSE2-NEXT:    pand %xmm0, %xmm1
254; SSE2-NEXT:    paddb %xmm1, %xmm1
255; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
256; SSE2-NEXT:    psrlw $1, %xmm0
257; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
258; SSE2-NEXT:    por %xmm1, %xmm0
259; SSE2-NEXT:    retq
260;
261; SSSE3-LABEL: test_bitreverse_v16i8:
262; SSSE3:       # BB#0:
263; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
264; SSSE3-NEXT:    movdqa %xmm0, %xmm2
265; SSSE3-NEXT:    pand %xmm1, %xmm2
266; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
267; SSSE3-NEXT:    pshufb %xmm2, %xmm3
268; SSSE3-NEXT:    psrlw $4, %xmm0
269; SSSE3-NEXT:    pand %xmm1, %xmm0
270; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
271; SSSE3-NEXT:    pshufb %xmm0, %xmm1
272; SSSE3-NEXT:    por %xmm3, %xmm1
273; SSSE3-NEXT:    movdqa %xmm1, %xmm0
274; SSSE3-NEXT:    retq
275;
276; AVX-LABEL: test_bitreverse_v16i8:
277; AVX:       # BB#0:
278; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
279; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
280; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
281; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
282; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
283; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
284; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
285; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
286; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
287; AVX-NEXT:    retq
288;
289; XOP-LABEL: test_bitreverse_v16i8:
290; XOP:       # BB#0:
291; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
292; XOP-NEXT:    retq
293  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
294  ret <16 x i8> %b
295}
296
297define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
298; SSE2-LABEL: test_bitreverse_v8i16:
299; SSE2:       # BB#0:
300; SSE2-NEXT:    pxor %xmm1, %xmm1
301; SSE2-NEXT:    movdqa %xmm0, %xmm2
302; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
303; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
304; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
305; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
306; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
307; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
308; SSE2-NEXT:    packuswb %xmm2, %xmm0
309; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
310; SSE2-NEXT:    movdqa %xmm0, %xmm2
311; SSE2-NEXT:    pand %xmm1, %xmm2
312; SSE2-NEXT:    psllw $4, %xmm2
313; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
314; SSE2-NEXT:    pand %xmm3, %xmm2
315; SSE2-NEXT:    pand %xmm3, %xmm0
316; SSE2-NEXT:    psrlw $4, %xmm0
317; SSE2-NEXT:    pand %xmm1, %xmm0
318; SSE2-NEXT:    por %xmm2, %xmm0
319; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
320; SSE2-NEXT:    pand %xmm0, %xmm1
321; SSE2-NEXT:    psllw $2, %xmm1
322; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
323; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
324; SSE2-NEXT:    psrlw $2, %xmm0
325; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
326; SSE2-NEXT:    por %xmm1, %xmm0
327; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
328; SSE2-NEXT:    pand %xmm0, %xmm1
329; SSE2-NEXT:    paddb %xmm1, %xmm1
330; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
331; SSE2-NEXT:    psrlw $1, %xmm0
332; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
333; SSE2-NEXT:    por %xmm1, %xmm0
334; SSE2-NEXT:    retq
335;
336; SSSE3-LABEL: test_bitreverse_v8i16:
337; SSSE3:       # BB#0:
338; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
339; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
340; SSSE3-NEXT:    movdqa %xmm0, %xmm2
341; SSSE3-NEXT:    pand %xmm1, %xmm2
342; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
343; SSSE3-NEXT:    pshufb %xmm2, %xmm3
344; SSSE3-NEXT:    psrlw $4, %xmm0
345; SSSE3-NEXT:    pand %xmm1, %xmm0
346; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
347; SSSE3-NEXT:    pshufb %xmm0, %xmm1
348; SSSE3-NEXT:    por %xmm3, %xmm1
349; SSSE3-NEXT:    movdqa %xmm1, %xmm0
350; SSSE3-NEXT:    retq
351;
352; AVX-LABEL: test_bitreverse_v8i16:
353; AVX:       # BB#0:
354; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
355; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
356; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
357; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
358; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
359; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
360; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
361; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
362; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
363; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
364; AVX-NEXT:    retq
365;
366; XOP-LABEL: test_bitreverse_v8i16:
367; XOP:       # BB#0:
368; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
369; XOP-NEXT:    retq
370  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
371  ret <8 x i16> %b
372}
373
374define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
375; SSE2-LABEL: test_bitreverse_v4i32:
376; SSE2:       # BB#0:
377; SSE2-NEXT:    pxor %xmm1, %xmm1
378; SSE2-NEXT:    movdqa %xmm0, %xmm2
379; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
380; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
381; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
382; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
383; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
384; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
385; SSE2-NEXT:    packuswb %xmm2, %xmm0
386; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
387; SSE2-NEXT:    movdqa %xmm0, %xmm2
388; SSE2-NEXT:    pand %xmm1, %xmm2
389; SSE2-NEXT:    psllw $4, %xmm2
390; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
391; SSE2-NEXT:    pand %xmm3, %xmm2
392; SSE2-NEXT:    pand %xmm3, %xmm0
393; SSE2-NEXT:    psrlw $4, %xmm0
394; SSE2-NEXT:    pand %xmm1, %xmm0
395; SSE2-NEXT:    por %xmm2, %xmm0
396; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
397; SSE2-NEXT:    pand %xmm0, %xmm1
398; SSE2-NEXT:    psllw $2, %xmm1
399; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
400; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
401; SSE2-NEXT:    psrlw $2, %xmm0
402; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
403; SSE2-NEXT:    por %xmm1, %xmm0
404; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
405; SSE2-NEXT:    pand %xmm0, %xmm1
406; SSE2-NEXT:    paddb %xmm1, %xmm1
407; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
408; SSE2-NEXT:    psrlw $1, %xmm0
409; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
410; SSE2-NEXT:    por %xmm1, %xmm0
411; SSE2-NEXT:    retq
412;
413; SSSE3-LABEL: test_bitreverse_v4i32:
414; SSSE3:       # BB#0:
415; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
416; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
417; SSSE3-NEXT:    movdqa %xmm0, %xmm2
418; SSSE3-NEXT:    pand %xmm1, %xmm2
419; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
420; SSSE3-NEXT:    pshufb %xmm2, %xmm3
421; SSSE3-NEXT:    psrlw $4, %xmm0
422; SSSE3-NEXT:    pand %xmm1, %xmm0
423; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
424; SSSE3-NEXT:    pshufb %xmm0, %xmm1
425; SSSE3-NEXT:    por %xmm3, %xmm1
426; SSSE3-NEXT:    movdqa %xmm1, %xmm0
427; SSSE3-NEXT:    retq
428;
429; AVX-LABEL: test_bitreverse_v4i32:
430; AVX:       # BB#0:
431; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
432; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
433; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
434; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
435; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
436; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
437; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
438; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
439; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
440; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
441; AVX-NEXT:    retq
442;
443; XOP-LABEL: test_bitreverse_v4i32:
444; XOP:       # BB#0:
445; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
446; XOP-NEXT:    retq
447  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
448  ret <4 x i32> %b
449}
450
451define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
452; SSE2-LABEL: test_bitreverse_v2i64:
453; SSE2:       # BB#0:
454; SSE2-NEXT:    pxor %xmm1, %xmm1
455; SSE2-NEXT:    movdqa %xmm0, %xmm2
456; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
457; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
458; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
459; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
460; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
461; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
462; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
463; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
464; SSE2-NEXT:    packuswb %xmm2, %xmm0
465; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
466; SSE2-NEXT:    movdqa %xmm0, %xmm2
467; SSE2-NEXT:    pand %xmm1, %xmm2
468; SSE2-NEXT:    psllw $4, %xmm2
469; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
470; SSE2-NEXT:    pand %xmm3, %xmm2
471; SSE2-NEXT:    pand %xmm3, %xmm0
472; SSE2-NEXT:    psrlw $4, %xmm0
473; SSE2-NEXT:    pand %xmm1, %xmm0
474; SSE2-NEXT:    por %xmm2, %xmm0
475; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
476; SSE2-NEXT:    pand %xmm0, %xmm1
477; SSE2-NEXT:    psllw $2, %xmm1
478; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
479; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
480; SSE2-NEXT:    psrlw $2, %xmm0
481; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
482; SSE2-NEXT:    por %xmm1, %xmm0
483; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
484; SSE2-NEXT:    pand %xmm0, %xmm1
485; SSE2-NEXT:    paddb %xmm1, %xmm1
486; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
487; SSE2-NEXT:    psrlw $1, %xmm0
488; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
489; SSE2-NEXT:    por %xmm1, %xmm0
490; SSE2-NEXT:    retq
491;
492; SSSE3-LABEL: test_bitreverse_v2i64:
493; SSSE3:       # BB#0:
494; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
495; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
496; SSSE3-NEXT:    movdqa %xmm0, %xmm2
497; SSSE3-NEXT:    pand %xmm1, %xmm2
498; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
499; SSSE3-NEXT:    pshufb %xmm2, %xmm3
500; SSSE3-NEXT:    psrlw $4, %xmm0
501; SSSE3-NEXT:    pand %xmm1, %xmm0
502; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
503; SSSE3-NEXT:    pshufb %xmm0, %xmm1
504; SSSE3-NEXT:    por %xmm3, %xmm1
505; SSSE3-NEXT:    movdqa %xmm1, %xmm0
506; SSSE3-NEXT:    retq
507;
508; AVX-LABEL: test_bitreverse_v2i64:
509; AVX:       # BB#0:
510; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
511; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
512; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
513; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
514; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
515; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
516; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
517; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
518; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
519; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
520; AVX-NEXT:    retq
521;
522; XOP-LABEL: test_bitreverse_v2i64:
523; XOP:       # BB#0:
524; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
525; XOP-NEXT:    retq
526  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
527  ret <2 x i64> %b
528}
529
530define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
531; SSE2-LABEL: test_bitreverse_v32i8:
532; SSE2:       # BB#0:
533; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
534; SSE2-NEXT:    movdqa %xmm0, %xmm3
535; SSE2-NEXT:    pand %xmm2, %xmm3
536; SSE2-NEXT:    psllw $4, %xmm3
537; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
538; SSE2-NEXT:    pand %xmm5, %xmm3
539; SSE2-NEXT:    pand %xmm5, %xmm0
540; SSE2-NEXT:    psrlw $4, %xmm0
541; SSE2-NEXT:    pand %xmm2, %xmm0
542; SSE2-NEXT:    por %xmm3, %xmm0
543; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
544; SSE2-NEXT:    movdqa %xmm0, %xmm4
545; SSE2-NEXT:    pand %xmm3, %xmm4
546; SSE2-NEXT:    psllw $2, %xmm4
547; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
548; SSE2-NEXT:    pand %xmm8, %xmm4
549; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
550; SSE2-NEXT:    pand %xmm9, %xmm0
551; SSE2-NEXT:    psrlw $2, %xmm0
552; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
553; SSE2-NEXT:    pand %xmm10, %xmm0
554; SSE2-NEXT:    por %xmm4, %xmm0
555; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
556; SSE2-NEXT:    movdqa %xmm0, %xmm7
557; SSE2-NEXT:    pand %xmm4, %xmm7
558; SSE2-NEXT:    psrlw $1, %xmm7
559; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
560; SSE2-NEXT:    pand %xmm11, %xmm7
561; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
562; SSE2-NEXT:    pand %xmm6, %xmm0
563; SSE2-NEXT:    paddb %xmm0, %xmm0
564; SSE2-NEXT:    por %xmm7, %xmm0
565; SSE2-NEXT:    movdqa %xmm1, %xmm7
566; SSE2-NEXT:    pand %xmm2, %xmm7
567; SSE2-NEXT:    psllw $4, %xmm7
568; SSE2-NEXT:    pand %xmm5, %xmm7
569; SSE2-NEXT:    pand %xmm5, %xmm1
570; SSE2-NEXT:    psrlw $4, %xmm1
571; SSE2-NEXT:    pand %xmm2, %xmm1
572; SSE2-NEXT:    por %xmm7, %xmm1
573; SSE2-NEXT:    pand %xmm1, %xmm3
574; SSE2-NEXT:    psllw $2, %xmm3
575; SSE2-NEXT:    pand %xmm8, %xmm3
576; SSE2-NEXT:    pand %xmm9, %xmm1
577; SSE2-NEXT:    psrlw $2, %xmm1
578; SSE2-NEXT:    pand %xmm10, %xmm1
579; SSE2-NEXT:    por %xmm3, %xmm1
580; SSE2-NEXT:    pand %xmm1, %xmm4
581; SSE2-NEXT:    psrlw $1, %xmm4
582; SSE2-NEXT:    pand %xmm11, %xmm4
583; SSE2-NEXT:    pand %xmm6, %xmm1
584; SSE2-NEXT:    paddb %xmm1, %xmm1
585; SSE2-NEXT:    por %xmm4, %xmm1
586; SSE2-NEXT:    retq
587;
588; SSSE3-LABEL: test_bitreverse_v32i8:
589; SSSE3:       # BB#0:
590; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
591; SSSE3-NEXT:    movdqa %xmm0, %xmm2
592; SSSE3-NEXT:    pand %xmm4, %xmm2
593; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
594; SSSE3-NEXT:    movdqa %xmm5, %xmm6
595; SSSE3-NEXT:    pshufb %xmm2, %xmm6
596; SSSE3-NEXT:    psrlw $4, %xmm0
597; SSSE3-NEXT:    pand %xmm4, %xmm0
598; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
599; SSSE3-NEXT:    movdqa %xmm2, %xmm3
600; SSSE3-NEXT:    pshufb %xmm0, %xmm3
601; SSSE3-NEXT:    por %xmm6, %xmm3
602; SSSE3-NEXT:    movdqa %xmm1, %xmm0
603; SSSE3-NEXT:    pand %xmm4, %xmm0
604; SSSE3-NEXT:    pshufb %xmm0, %xmm5
605; SSSE3-NEXT:    psrlw $4, %xmm1
606; SSSE3-NEXT:    pand %xmm4, %xmm1
607; SSSE3-NEXT:    pshufb %xmm1, %xmm2
608; SSSE3-NEXT:    por %xmm5, %xmm2
609; SSSE3-NEXT:    movdqa %xmm3, %xmm0
610; SSSE3-NEXT:    movdqa %xmm2, %xmm1
611; SSSE3-NEXT:    retq
612;
613; AVX1-LABEL: test_bitreverse_v32i8:
614; AVX1:       # BB#0:
615; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
616; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
617; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
618; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
619; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
620; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
621; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
622; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
623; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
624; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
625; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
626; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
627; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
628; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
629; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
630; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
631; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
632; AVX1-NEXT:    retq
633;
634; AVX2-LABEL: test_bitreverse_v32i8:
635; AVX2:       # BB#0:
636; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
637; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
638; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
639; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
640; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
641; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
642; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
643; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
644; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
645; AVX2-NEXT:    retq
646;
647; AVX512-LABEL: test_bitreverse_v32i8:
648; AVX512:       # BB#0:
649; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
650; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
651; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
652; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
653; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
654; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
655; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
656; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
657; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
658; AVX512-NEXT:    retq
659;
660; XOPAVX1-LABEL: test_bitreverse_v32i8:
661; XOPAVX1:       # BB#0:
662; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
663; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
664; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
665; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
666; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
667; XOPAVX1-NEXT:    retq
668;
669; XOPAVX2-LABEL: test_bitreverse_v32i8:
670; XOPAVX2:       # BB#0:
671; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
672; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
673; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
674; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
675; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
676; XOPAVX2-NEXT:    retq
677  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
678  ret <32 x i8> %b
679}
680
681define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
682; SSE2-LABEL: test_bitreverse_v16i16:
683; SSE2:       # BB#0:
684; SSE2-NEXT:    pxor %xmm4, %xmm4
685; SSE2-NEXT:    movdqa %xmm0, %xmm2
686; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
687; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
688; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
689; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
690; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
691; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
692; SSE2-NEXT:    packuswb %xmm2, %xmm0
693; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
694; SSE2-NEXT:    movdqa %xmm0, %xmm3
695; SSE2-NEXT:    pand %xmm2, %xmm3
696; SSE2-NEXT:    psllw $4, %xmm3
697; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
698; SSE2-NEXT:    pand %xmm6, %xmm3
699; SSE2-NEXT:    pand %xmm6, %xmm0
700; SSE2-NEXT:    psrlw $4, %xmm0
701; SSE2-NEXT:    pand %xmm2, %xmm0
702; SSE2-NEXT:    por %xmm3, %xmm0
703; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
704; SSE2-NEXT:    movdqa %xmm0, %xmm5
705; SSE2-NEXT:    pand %xmm3, %xmm5
706; SSE2-NEXT:    psllw $2, %xmm5
707; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
708; SSE2-NEXT:    pand %xmm8, %xmm5
709; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
710; SSE2-NEXT:    pand %xmm9, %xmm0
711; SSE2-NEXT:    psrlw $2, %xmm0
712; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
713; SSE2-NEXT:    pand %xmm10, %xmm0
714; SSE2-NEXT:    por %xmm5, %xmm0
715; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
716; SSE2-NEXT:    movdqa %xmm0, %xmm7
717; SSE2-NEXT:    pand %xmm5, %xmm7
718; SSE2-NEXT:    psrlw $1, %xmm7
719; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
720; SSE2-NEXT:    pand %xmm11, %xmm7
721; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
722; SSE2-NEXT:    pand %xmm12, %xmm0
723; SSE2-NEXT:    paddb %xmm0, %xmm0
724; SSE2-NEXT:    por %xmm7, %xmm0
725; SSE2-NEXT:    movdqa %xmm1, %xmm7
726; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
727; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7]
728; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,7,6]
729; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
730; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
731; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
732; SSE2-NEXT:    packuswb %xmm7, %xmm1
733; SSE2-NEXT:    movdqa %xmm1, %xmm4
734; SSE2-NEXT:    pand %xmm2, %xmm4
735; SSE2-NEXT:    psllw $4, %xmm4
736; SSE2-NEXT:    pand %xmm6, %xmm4
737; SSE2-NEXT:    pand %xmm6, %xmm1
738; SSE2-NEXT:    psrlw $4, %xmm1
739; SSE2-NEXT:    pand %xmm2, %xmm1
740; SSE2-NEXT:    por %xmm4, %xmm1
741; SSE2-NEXT:    pand %xmm1, %xmm3
742; SSE2-NEXT:    psllw $2, %xmm3
743; SSE2-NEXT:    pand %xmm8, %xmm3
744; SSE2-NEXT:    pand %xmm9, %xmm1
745; SSE2-NEXT:    psrlw $2, %xmm1
746; SSE2-NEXT:    pand %xmm10, %xmm1
747; SSE2-NEXT:    por %xmm3, %xmm1
748; SSE2-NEXT:    pand %xmm1, %xmm5
749; SSE2-NEXT:    psrlw $1, %xmm5
750; SSE2-NEXT:    pand %xmm11, %xmm5
751; SSE2-NEXT:    pand %xmm12, %xmm1
752; SSE2-NEXT:    paddb %xmm1, %xmm1
753; SSE2-NEXT:    por %xmm5, %xmm1
754; SSE2-NEXT:    retq
755;
756; SSSE3-LABEL: test_bitreverse_v16i16:
757; SSSE3:       # BB#0:
758; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
759; SSSE3-NEXT:    pshufb %xmm4, %xmm0
760; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
761; SSSE3-NEXT:    movdqa %xmm0, %xmm2
762; SSSE3-NEXT:    pand %xmm5, %xmm2
763; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
764; SSSE3-NEXT:    movdqa %xmm6, %xmm7
765; SSSE3-NEXT:    pshufb %xmm2, %xmm7
766; SSSE3-NEXT:    psrlw $4, %xmm0
767; SSSE3-NEXT:    pand %xmm5, %xmm0
768; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
769; SSSE3-NEXT:    movdqa %xmm2, %xmm3
770; SSSE3-NEXT:    pshufb %xmm0, %xmm3
771; SSSE3-NEXT:    por %xmm7, %xmm3
772; SSSE3-NEXT:    pshufb %xmm4, %xmm1
773; SSSE3-NEXT:    movdqa %xmm1, %xmm0
774; SSSE3-NEXT:    pand %xmm5, %xmm0
775; SSSE3-NEXT:    pshufb %xmm0, %xmm6
776; SSSE3-NEXT:    psrlw $4, %xmm1
777; SSSE3-NEXT:    pand %xmm5, %xmm1
778; SSSE3-NEXT:    pshufb %xmm1, %xmm2
779; SSSE3-NEXT:    por %xmm6, %xmm2
780; SSSE3-NEXT:    movdqa %xmm3, %xmm0
781; SSSE3-NEXT:    movdqa %xmm2, %xmm1
782; SSSE3-NEXT:    retq
783;
784; AVX1-LABEL: test_bitreverse_v16i16:
785; AVX1:       # BB#0:
786; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
787; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
788; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
789; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
790; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
791; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
792; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
793; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
794; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
795; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
796; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
797; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
798; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
799; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
800; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
801; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
802; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
803; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
804; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
805; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
806; AVX1-NEXT:    retq
807;
808; AVX2-LABEL: test_bitreverse_v16i16:
809; AVX2:       # BB#0:
810; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
811; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
812; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
813; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
814; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
815; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
816; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
817; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
818; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
819; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
820; AVX2-NEXT:    retq
821;
822; AVX512-LABEL: test_bitreverse_v16i16:
823; AVX512:       # BB#0:
824; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
825; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
826; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
827; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
828; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
829; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
830; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
831; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
832; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
833; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
834; AVX512-NEXT:    retq
835;
836; XOPAVX1-LABEL: test_bitreverse_v16i16:
837; XOPAVX1:       # BB#0:
838; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
839; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
840; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
841; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
842; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
843; XOPAVX1-NEXT:    retq
844;
845; XOPAVX2-LABEL: test_bitreverse_v16i16:
846; XOPAVX2:       # BB#0:
847; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
848; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
849; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
850; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
851; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
852; XOPAVX2-NEXT:    retq
853  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
854  ret <16 x i16> %b
855}
856
857define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
858; SSE2-LABEL: test_bitreverse_v8i32:
859; SSE2:       # BB#0:
860; SSE2-NEXT:    pxor %xmm4, %xmm4
861; SSE2-NEXT:    movdqa %xmm0, %xmm2
862; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
863; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
864; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
865; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
866; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
867; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
868; SSE2-NEXT:    packuswb %xmm2, %xmm0
869; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
870; SSE2-NEXT:    movdqa %xmm0, %xmm3
871; SSE2-NEXT:    pand %xmm2, %xmm3
872; SSE2-NEXT:    psllw $4, %xmm3
873; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
874; SSE2-NEXT:    pand %xmm6, %xmm3
875; SSE2-NEXT:    pand %xmm6, %xmm0
876; SSE2-NEXT:    psrlw $4, %xmm0
877; SSE2-NEXT:    pand %xmm2, %xmm0
878; SSE2-NEXT:    por %xmm3, %xmm0
879; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
880; SSE2-NEXT:    movdqa %xmm0, %xmm5
881; SSE2-NEXT:    pand %xmm3, %xmm5
882; SSE2-NEXT:    psllw $2, %xmm5
883; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
884; SSE2-NEXT:    pand %xmm8, %xmm5
885; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
886; SSE2-NEXT:    pand %xmm9, %xmm0
887; SSE2-NEXT:    psrlw $2, %xmm0
888; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
889; SSE2-NEXT:    pand %xmm10, %xmm0
890; SSE2-NEXT:    por %xmm5, %xmm0
891; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
892; SSE2-NEXT:    movdqa %xmm0, %xmm7
893; SSE2-NEXT:    pand %xmm5, %xmm7
894; SSE2-NEXT:    psrlw $1, %xmm7
895; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
896; SSE2-NEXT:    pand %xmm11, %xmm7
897; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
898; SSE2-NEXT:    pand %xmm12, %xmm0
899; SSE2-NEXT:    paddb %xmm0, %xmm0
900; SSE2-NEXT:    por %xmm7, %xmm0
901; SSE2-NEXT:    movdqa %xmm1, %xmm7
902; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
903; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7]
904; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4]
905; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
906; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
907; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
908; SSE2-NEXT:    packuswb %xmm7, %xmm1
909; SSE2-NEXT:    movdqa %xmm1, %xmm4
910; SSE2-NEXT:    pand %xmm2, %xmm4
911; SSE2-NEXT:    psllw $4, %xmm4
912; SSE2-NEXT:    pand %xmm6, %xmm4
913; SSE2-NEXT:    pand %xmm6, %xmm1
914; SSE2-NEXT:    psrlw $4, %xmm1
915; SSE2-NEXT:    pand %xmm2, %xmm1
916; SSE2-NEXT:    por %xmm4, %xmm1
917; SSE2-NEXT:    pand %xmm1, %xmm3
918; SSE2-NEXT:    psllw $2, %xmm3
919; SSE2-NEXT:    pand %xmm8, %xmm3
920; SSE2-NEXT:    pand %xmm9, %xmm1
921; SSE2-NEXT:    psrlw $2, %xmm1
922; SSE2-NEXT:    pand %xmm10, %xmm1
923; SSE2-NEXT:    por %xmm3, %xmm1
924; SSE2-NEXT:    pand %xmm1, %xmm5
925; SSE2-NEXT:    psrlw $1, %xmm5
926; SSE2-NEXT:    pand %xmm11, %xmm5
927; SSE2-NEXT:    pand %xmm12, %xmm1
928; SSE2-NEXT:    paddb %xmm1, %xmm1
929; SSE2-NEXT:    por %xmm5, %xmm1
930; SSE2-NEXT:    retq
931;
932; SSSE3-LABEL: test_bitreverse_v8i32:
933; SSSE3:       # BB#0:
934; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
935; SSSE3-NEXT:    pshufb %xmm4, %xmm0
936; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
937; SSSE3-NEXT:    movdqa %xmm0, %xmm2
938; SSSE3-NEXT:    pand %xmm5, %xmm2
939; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
940; SSSE3-NEXT:    movdqa %xmm6, %xmm7
941; SSSE3-NEXT:    pshufb %xmm2, %xmm7
942; SSSE3-NEXT:    psrlw $4, %xmm0
943; SSSE3-NEXT:    pand %xmm5, %xmm0
944; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
945; SSSE3-NEXT:    movdqa %xmm2, %xmm3
946; SSSE3-NEXT:    pshufb %xmm0, %xmm3
947; SSSE3-NEXT:    por %xmm7, %xmm3
948; SSSE3-NEXT:    pshufb %xmm4, %xmm1
949; SSSE3-NEXT:    movdqa %xmm1, %xmm0
950; SSSE3-NEXT:    pand %xmm5, %xmm0
951; SSSE3-NEXT:    pshufb %xmm0, %xmm6
952; SSSE3-NEXT:    psrlw $4, %xmm1
953; SSSE3-NEXT:    pand %xmm5, %xmm1
954; SSSE3-NEXT:    pshufb %xmm1, %xmm2
955; SSSE3-NEXT:    por %xmm6, %xmm2
956; SSSE3-NEXT:    movdqa %xmm3, %xmm0
957; SSSE3-NEXT:    movdqa %xmm2, %xmm1
958; SSSE3-NEXT:    retq
959;
960; AVX1-LABEL: test_bitreverse_v8i32:
961; AVX1:       # BB#0:
962; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
963; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
964; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
965; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
966; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
967; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
968; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
969; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
970; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
971; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
972; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
973; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
974; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
975; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
976; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
977; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
978; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
979; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
980; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
981; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
982; AVX1-NEXT:    retq
983;
984; AVX2-LABEL: test_bitreverse_v8i32:
985; AVX2:       # BB#0:
986; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
987; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
988; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
989; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
990; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
991; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
992; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
993; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
994; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
995; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
996; AVX2-NEXT:    retq
997;
998; AVX512-LABEL: test_bitreverse_v8i32:
999; AVX512:       # BB#0:
1000; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1001; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1002; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1003; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1004; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1005; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1006; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1007; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1008; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1009; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1010; AVX512-NEXT:    retq
1011;
1012; XOPAVX1-LABEL: test_bitreverse_v8i32:
1013; XOPAVX1:       # BB#0:
1014; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1015; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1016; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1017; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1018; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1019; XOPAVX1-NEXT:    retq
1020;
1021; XOPAVX2-LABEL: test_bitreverse_v8i32:
1022; XOPAVX2:       # BB#0:
1023; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1024; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1025; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1026; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1027; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1028; XOPAVX2-NEXT:    retq
1029  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
1030  ret <8 x i32> %b
1031}
1032
1033define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
1034; SSE2-LABEL: test_bitreverse_v4i64:
1035; SSE2:       # BB#0:
1036; SSE2-NEXT:    pxor %xmm4, %xmm4
1037; SSE2-NEXT:    movdqa %xmm0, %xmm2
1038; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
1039; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
1040; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1041; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1042; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1043; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1044; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1045; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1046; SSE2-NEXT:    packuswb %xmm2, %xmm0
1047; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1048; SSE2-NEXT:    movdqa %xmm0, %xmm3
1049; SSE2-NEXT:    pand %xmm2, %xmm3
1050; SSE2-NEXT:    psllw $4, %xmm3
1051; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1052; SSE2-NEXT:    pand %xmm6, %xmm3
1053; SSE2-NEXT:    pand %xmm6, %xmm0
1054; SSE2-NEXT:    psrlw $4, %xmm0
1055; SSE2-NEXT:    pand %xmm2, %xmm0
1056; SSE2-NEXT:    por %xmm3, %xmm0
1057; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1058; SSE2-NEXT:    movdqa %xmm0, %xmm5
1059; SSE2-NEXT:    pand %xmm3, %xmm5
1060; SSE2-NEXT:    psllw $2, %xmm5
1061; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
1062; SSE2-NEXT:    pand %xmm8, %xmm5
1063; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1064; SSE2-NEXT:    pand %xmm9, %xmm0
1065; SSE2-NEXT:    psrlw $2, %xmm0
1066; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
1067; SSE2-NEXT:    pand %xmm10, %xmm0
1068; SSE2-NEXT:    por %xmm5, %xmm0
1069; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1070; SSE2-NEXT:    movdqa %xmm0, %xmm7
1071; SSE2-NEXT:    pand %xmm5, %xmm7
1072; SSE2-NEXT:    psrlw $1, %xmm7
1073; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
1074; SSE2-NEXT:    pand %xmm11, %xmm7
1075; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1076; SSE2-NEXT:    pand %xmm12, %xmm0
1077; SSE2-NEXT:    paddb %xmm0, %xmm0
1078; SSE2-NEXT:    por %xmm7, %xmm0
1079; SSE2-NEXT:    movdqa %xmm1, %xmm7
1080; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
1081; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,3,0,1]
1082; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7]
1083; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4]
1084; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1085; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1086; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1087; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1088; SSE2-NEXT:    packuswb %xmm7, %xmm1
1089; SSE2-NEXT:    movdqa %xmm1, %xmm4
1090; SSE2-NEXT:    pand %xmm2, %xmm4
1091; SSE2-NEXT:    psllw $4, %xmm4
1092; SSE2-NEXT:    pand %xmm6, %xmm4
1093; SSE2-NEXT:    pand %xmm6, %xmm1
1094; SSE2-NEXT:    psrlw $4, %xmm1
1095; SSE2-NEXT:    pand %xmm2, %xmm1
1096; SSE2-NEXT:    por %xmm4, %xmm1
1097; SSE2-NEXT:    pand %xmm1, %xmm3
1098; SSE2-NEXT:    psllw $2, %xmm3
1099; SSE2-NEXT:    pand %xmm8, %xmm3
1100; SSE2-NEXT:    pand %xmm9, %xmm1
1101; SSE2-NEXT:    psrlw $2, %xmm1
1102; SSE2-NEXT:    pand %xmm10, %xmm1
1103; SSE2-NEXT:    por %xmm3, %xmm1
1104; SSE2-NEXT:    pand %xmm1, %xmm5
1105; SSE2-NEXT:    psrlw $1, %xmm5
1106; SSE2-NEXT:    pand %xmm11, %xmm5
1107; SSE2-NEXT:    pand %xmm12, %xmm1
1108; SSE2-NEXT:    paddb %xmm1, %xmm1
1109; SSE2-NEXT:    por %xmm5, %xmm1
1110; SSE2-NEXT:    retq
1111;
1112; SSSE3-LABEL: test_bitreverse_v4i64:
1113; SSSE3:       # BB#0:
1114; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1115; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1116; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1117; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1118; SSSE3-NEXT:    pand %xmm5, %xmm2
1119; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1120; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1121; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1122; SSSE3-NEXT:    psrlw $4, %xmm0
1123; SSSE3-NEXT:    pand %xmm5, %xmm0
1124; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1125; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1126; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1127; SSSE3-NEXT:    por %xmm7, %xmm3
1128; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1129; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1130; SSSE3-NEXT:    pand %xmm5, %xmm0
1131; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1132; SSSE3-NEXT:    psrlw $4, %xmm1
1133; SSSE3-NEXT:    pand %xmm5, %xmm1
1134; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1135; SSSE3-NEXT:    por %xmm6, %xmm2
1136; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1137; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1138; SSSE3-NEXT:    retq
1139;
1140; AVX1-LABEL: test_bitreverse_v4i64:
1141; AVX1:       # BB#0:
1142; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1143; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1144; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1145; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1146; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1147; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1148; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1149; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1150; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1151; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1152; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1153; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1154; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1155; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1156; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1157; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1158; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1159; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1160; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1161; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1162; AVX1-NEXT:    retq
1163;
1164; AVX2-LABEL: test_bitreverse_v4i64:
1165; AVX2:       # BB#0:
1166; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1167; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1168; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1169; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1170; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1171; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1172; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1173; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1174; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1175; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1176; AVX2-NEXT:    retq
1177;
1178; AVX512-LABEL: test_bitreverse_v4i64:
1179; AVX512:       # BB#0:
1180; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1181; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1182; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1183; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1184; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1185; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1186; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1187; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1188; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1189; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1190; AVX512-NEXT:    retq
1191;
1192; XOPAVX1-LABEL: test_bitreverse_v4i64:
1193; XOPAVX1:       # BB#0:
1194; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1195; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1196; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1197; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1198; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1199; XOPAVX1-NEXT:    retq
1200;
1201; XOPAVX2-LABEL: test_bitreverse_v4i64:
1202; XOPAVX2:       # BB#0:
1203; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1204; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1205; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1206; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1207; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1208; XOPAVX2-NEXT:    retq
1209  %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
1210  ret <4 x i64> %b
1211}
1212
1213define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
1214; SSE2-LABEL: test_bitreverse_v64i8:
1215; SSE2:       # BB#0:
1216; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1217; SSE2-NEXT:    movdqa %xmm0, %xmm5
1218; SSE2-NEXT:    pand %xmm13, %xmm5
1219; SSE2-NEXT:    psllw $4, %xmm5
1220; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1221; SSE2-NEXT:    pand %xmm7, %xmm5
1222; SSE2-NEXT:    pand %xmm7, %xmm0
1223; SSE2-NEXT:    psrlw $4, %xmm0
1224; SSE2-NEXT:    pand %xmm13, %xmm0
1225; SSE2-NEXT:    por %xmm5, %xmm0
1226; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1227; SSE2-NEXT:    movdqa %xmm0, %xmm6
1228; SSE2-NEXT:    pand %xmm5, %xmm6
1229; SSE2-NEXT:    psllw $2, %xmm6
1230; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
1231; SSE2-NEXT:    pand %xmm8, %xmm6
1232; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1233; SSE2-NEXT:    pand %xmm9, %xmm0
1234; SSE2-NEXT:    psrlw $2, %xmm0
1235; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
1236; SSE2-NEXT:    pand %xmm10, %xmm0
1237; SSE2-NEXT:    por %xmm6, %xmm0
1238; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1239; SSE2-NEXT:    movdqa %xmm0, %xmm4
1240; SSE2-NEXT:    pand %xmm6, %xmm4
1241; SSE2-NEXT:    psrlw $1, %xmm4
1242; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
1243; SSE2-NEXT:    pand %xmm11, %xmm4
1244; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1245; SSE2-NEXT:    pand %xmm12, %xmm0
1246; SSE2-NEXT:    paddb %xmm0, %xmm0
1247; SSE2-NEXT:    por %xmm4, %xmm0
1248; SSE2-NEXT:    movdqa %xmm1, %xmm4
1249; SSE2-NEXT:    pand %xmm13, %xmm4
1250; SSE2-NEXT:    psllw $4, %xmm4
1251; SSE2-NEXT:    pand %xmm7, %xmm4
1252; SSE2-NEXT:    pand %xmm7, %xmm1
1253; SSE2-NEXT:    psrlw $4, %xmm1
1254; SSE2-NEXT:    pand %xmm13, %xmm1
1255; SSE2-NEXT:    por %xmm4, %xmm1
1256; SSE2-NEXT:    movdqa %xmm1, %xmm4
1257; SSE2-NEXT:    pand %xmm5, %xmm4
1258; SSE2-NEXT:    psllw $2, %xmm4
1259; SSE2-NEXT:    pand %xmm8, %xmm4
1260; SSE2-NEXT:    pand %xmm9, %xmm1
1261; SSE2-NEXT:    psrlw $2, %xmm1
1262; SSE2-NEXT:    pand %xmm10, %xmm1
1263; SSE2-NEXT:    por %xmm4, %xmm1
1264; SSE2-NEXT:    movdqa %xmm1, %xmm4
1265; SSE2-NEXT:    pand %xmm6, %xmm4
1266; SSE2-NEXT:    psrlw $1, %xmm4
1267; SSE2-NEXT:    pand %xmm11, %xmm4
1268; SSE2-NEXT:    pand %xmm12, %xmm1
1269; SSE2-NEXT:    paddb %xmm1, %xmm1
1270; SSE2-NEXT:    por %xmm4, %xmm1
1271; SSE2-NEXT:    movdqa %xmm2, %xmm4
1272; SSE2-NEXT:    pand %xmm13, %xmm4
1273; SSE2-NEXT:    psllw $4, %xmm4
1274; SSE2-NEXT:    pand %xmm7, %xmm4
1275; SSE2-NEXT:    pand %xmm7, %xmm2
1276; SSE2-NEXT:    psrlw $4, %xmm2
1277; SSE2-NEXT:    pand %xmm13, %xmm2
1278; SSE2-NEXT:    por %xmm4, %xmm2
1279; SSE2-NEXT:    movdqa %xmm2, %xmm4
1280; SSE2-NEXT:    pand %xmm5, %xmm4
1281; SSE2-NEXT:    psllw $2, %xmm4
1282; SSE2-NEXT:    pand %xmm8, %xmm4
1283; SSE2-NEXT:    pand %xmm9, %xmm2
1284; SSE2-NEXT:    psrlw $2, %xmm2
1285; SSE2-NEXT:    pand %xmm10, %xmm2
1286; SSE2-NEXT:    por %xmm4, %xmm2
1287; SSE2-NEXT:    movdqa %xmm2, %xmm4
1288; SSE2-NEXT:    pand %xmm6, %xmm4
1289; SSE2-NEXT:    psrlw $1, %xmm4
1290; SSE2-NEXT:    pand %xmm11, %xmm4
1291; SSE2-NEXT:    pand %xmm12, %xmm2
1292; SSE2-NEXT:    paddb %xmm2, %xmm2
1293; SSE2-NEXT:    por %xmm4, %xmm2
1294; SSE2-NEXT:    movdqa %xmm3, %xmm4
1295; SSE2-NEXT:    pand %xmm13, %xmm4
1296; SSE2-NEXT:    psllw $4, %xmm4
1297; SSE2-NEXT:    pand %xmm7, %xmm4
1298; SSE2-NEXT:    pand %xmm7, %xmm3
1299; SSE2-NEXT:    psrlw $4, %xmm3
1300; SSE2-NEXT:    pand %xmm13, %xmm3
1301; SSE2-NEXT:    por %xmm4, %xmm3
1302; SSE2-NEXT:    pand %xmm3, %xmm5
1303; SSE2-NEXT:    psllw $2, %xmm5
1304; SSE2-NEXT:    pand %xmm8, %xmm5
1305; SSE2-NEXT:    pand %xmm9, %xmm3
1306; SSE2-NEXT:    psrlw $2, %xmm3
1307; SSE2-NEXT:    pand %xmm10, %xmm3
1308; SSE2-NEXT:    por %xmm5, %xmm3
1309; SSE2-NEXT:    pand %xmm3, %xmm6
1310; SSE2-NEXT:    psrlw $1, %xmm6
1311; SSE2-NEXT:    pand %xmm11, %xmm6
1312; SSE2-NEXT:    pand %xmm12, %xmm3
1313; SSE2-NEXT:    paddb %xmm3, %xmm3
1314; SSE2-NEXT:    por %xmm6, %xmm3
1315; SSE2-NEXT:    retq
1316;
1317; SSSE3-LABEL: test_bitreverse_v64i8:
1318; SSSE3:       # BB#0:
1319; SSSE3-NEXT:    movdqa %xmm0, %xmm5
1320; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1321; SSSE3-NEXT:    pand %xmm8, %xmm0
1322; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1323; SSSE3-NEXT:    movdqa %xmm9, %xmm6
1324; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1325; SSSE3-NEXT:    psrlw $4, %xmm5
1326; SSSE3-NEXT:    pand %xmm8, %xmm5
1327; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1328; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1329; SSSE3-NEXT:    pshufb %xmm5, %xmm0
1330; SSSE3-NEXT:    por %xmm6, %xmm0
1331; SSSE3-NEXT:    movdqa %xmm1, %xmm5
1332; SSSE3-NEXT:    pand %xmm8, %xmm5
1333; SSSE3-NEXT:    movdqa %xmm9, %xmm6
1334; SSSE3-NEXT:    pshufb %xmm5, %xmm6
1335; SSSE3-NEXT:    psrlw $4, %xmm1
1336; SSSE3-NEXT:    pand %xmm8, %xmm1
1337; SSSE3-NEXT:    movdqa %xmm4, %xmm5
1338; SSSE3-NEXT:    pshufb %xmm1, %xmm5
1339; SSSE3-NEXT:    por %xmm6, %xmm5
1340; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1341; SSSE3-NEXT:    pand %xmm8, %xmm1
1342; SSSE3-NEXT:    movdqa %xmm9, %xmm7
1343; SSSE3-NEXT:    pshufb %xmm1, %xmm7
1344; SSSE3-NEXT:    psrlw $4, %xmm2
1345; SSSE3-NEXT:    pand %xmm8, %xmm2
1346; SSSE3-NEXT:    movdqa %xmm4, %xmm6
1347; SSSE3-NEXT:    pshufb %xmm2, %xmm6
1348; SSSE3-NEXT:    por %xmm7, %xmm6
1349; SSSE3-NEXT:    movdqa %xmm3, %xmm1
1350; SSSE3-NEXT:    pand %xmm8, %xmm1
1351; SSSE3-NEXT:    pshufb %xmm1, %xmm9
1352; SSSE3-NEXT:    psrlw $4, %xmm3
1353; SSSE3-NEXT:    pand %xmm8, %xmm3
1354; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1355; SSSE3-NEXT:    por %xmm9, %xmm4
1356; SSSE3-NEXT:    movdqa %xmm5, %xmm1
1357; SSSE3-NEXT:    movdqa %xmm6, %xmm2
1358; SSSE3-NEXT:    movdqa %xmm4, %xmm3
1359; SSSE3-NEXT:    retq
1360;
1361; AVX1-LABEL: test_bitreverse_v64i8:
1362; AVX1:       # BB#0:
1363; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1364; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1365; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm4
1366; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1367; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1368; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1369; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1370; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1371; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
1372; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
1373; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm4
1374; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1375; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1376; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1377; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1378; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
1379; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1380; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1381; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm4
1382; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1383; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1384; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1385; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
1386; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
1387; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm4
1388; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1389; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1390; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1391; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1392; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1393; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1394; AVX1-NEXT:    retq
1395;
1396; AVX2-LABEL: test_bitreverse_v64i8:
1397; AVX2:       # BB#0:
1398; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1399; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
1400; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1401; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
1402; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1403; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
1404; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1405; AVX2-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
1406; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
1407; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
1408; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
1409; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
1410; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
1411; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
1412; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
1413; AVX2-NEXT:    retq
1414;
1415; AVX512F-LABEL: test_bitreverse_v64i8:
1416; AVX512F:       # BB#0:
1417; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1418; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm3
1419; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1420; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
1421; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
1422; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
1423; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1424; AVX512F-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
1425; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm0
1426; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
1427; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
1428; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
1429; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
1430; AVX512F-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
1431; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
1432; AVX512F-NEXT:    retq
1433;
1434; AVX512BW-LABEL: test_bitreverse_v64i8:
1435; AVX512BW:       # BB#0:
1436; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1437; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
1438; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1439; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
1440; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
1441; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
1442; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1443; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
1444; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
1445; AVX512BW-NEXT:    retq
1446;
1447; XOPAVX1-LABEL: test_bitreverse_v64i8:
1448; XOPAVX1:       # BB#0:
1449; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1450; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1451; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1452; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
1453; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1454; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1455; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1456; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
1457; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1458; XOPAVX1-NEXT:    retq
1459;
1460; XOPAVX2-LABEL: test_bitreverse_v64i8:
1461; XOPAVX2:       # BB#0:
1462; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
1463; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1464; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1465; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
1466; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
1467; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
1468; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1469; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
1470; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
1471; XOPAVX2-NEXT:    retq
1472  %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
1473  ret <64 x i8> %b
1474}
1475
1476define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
1477; SSE2-LABEL: test_bitreverse_v32i16:
1478; SSE2:       # BB#0:
1479; SSE2-NEXT:    pxor %xmm14, %xmm14
1480; SSE2-NEXT:    movdqa %xmm0, %xmm4
1481; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
1482; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
1483; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
1484; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
1485; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
1486; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
1487; SSE2-NEXT:    packuswb %xmm4, %xmm0
1488; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1489; SSE2-NEXT:    movdqa %xmm0, %xmm5
1490; SSE2-NEXT:    pand %xmm8, %xmm5
1491; SSE2-NEXT:    psllw $4, %xmm5
1492; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1493; SSE2-NEXT:    pand %xmm4, %xmm5
1494; SSE2-NEXT:    pand %xmm4, %xmm0
1495; SSE2-NEXT:    psrlw $4, %xmm0
1496; SSE2-NEXT:    pand %xmm8, %xmm0
1497; SSE2-NEXT:    por %xmm5, %xmm0
1498; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1499; SSE2-NEXT:    movdqa %xmm0, %xmm7
1500; SSE2-NEXT:    pand %xmm5, %xmm7
1501; SSE2-NEXT:    psllw $2, %xmm7
1502; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
1503; SSE2-NEXT:    pand %xmm9, %xmm7
1504; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1505; SSE2-NEXT:    pand %xmm10, %xmm0
1506; SSE2-NEXT:    psrlw $2, %xmm0
1507; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
1508; SSE2-NEXT:    pand %xmm11, %xmm0
1509; SSE2-NEXT:    por %xmm7, %xmm0
1510; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1511; SSE2-NEXT:    movdqa %xmm0, %xmm6
1512; SSE2-NEXT:    pand %xmm7, %xmm6
1513; SSE2-NEXT:    psrlw $1, %xmm6
1514; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
1515; SSE2-NEXT:    pand %xmm12, %xmm6
1516; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1517; SSE2-NEXT:    pand %xmm13, %xmm0
1518; SSE2-NEXT:    paddb %xmm0, %xmm0
1519; SSE2-NEXT:    por %xmm6, %xmm0
1520; SSE2-NEXT:    movdqa %xmm1, %xmm6
1521; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
1522; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
1523; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
1524; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
1525; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1526; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
1527; SSE2-NEXT:    packuswb %xmm6, %xmm1
1528; SSE2-NEXT:    movdqa %xmm1, %xmm6
1529; SSE2-NEXT:    pand %xmm8, %xmm6
1530; SSE2-NEXT:    psllw $4, %xmm6
1531; SSE2-NEXT:    pand %xmm4, %xmm6
1532; SSE2-NEXT:    pand %xmm4, %xmm1
1533; SSE2-NEXT:    psrlw $4, %xmm1
1534; SSE2-NEXT:    pand %xmm8, %xmm1
1535; SSE2-NEXT:    por %xmm6, %xmm1
1536; SSE2-NEXT:    movdqa %xmm1, %xmm6
1537; SSE2-NEXT:    pand %xmm5, %xmm6
1538; SSE2-NEXT:    psllw $2, %xmm6
1539; SSE2-NEXT:    pand %xmm9, %xmm6
1540; SSE2-NEXT:    pand %xmm10, %xmm1
1541; SSE2-NEXT:    psrlw $2, %xmm1
1542; SSE2-NEXT:    pand %xmm11, %xmm1
1543; SSE2-NEXT:    por %xmm6, %xmm1
1544; SSE2-NEXT:    movdqa %xmm1, %xmm6
1545; SSE2-NEXT:    pand %xmm7, %xmm6
1546; SSE2-NEXT:    psrlw $1, %xmm6
1547; SSE2-NEXT:    pand %xmm12, %xmm6
1548; SSE2-NEXT:    pand %xmm13, %xmm1
1549; SSE2-NEXT:    paddb %xmm1, %xmm1
1550; SSE2-NEXT:    por %xmm6, %xmm1
1551; SSE2-NEXT:    movdqa %xmm2, %xmm6
1552; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
1553; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
1554; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
1555; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
1556; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
1557; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
1558; SSE2-NEXT:    packuswb %xmm6, %xmm2
1559; SSE2-NEXT:    movdqa %xmm2, %xmm6
1560; SSE2-NEXT:    pand %xmm8, %xmm6
1561; SSE2-NEXT:    psllw $4, %xmm6
1562; SSE2-NEXT:    pand %xmm4, %xmm6
1563; SSE2-NEXT:    pand %xmm4, %xmm2
1564; SSE2-NEXT:    psrlw $4, %xmm2
1565; SSE2-NEXT:    pand %xmm8, %xmm2
1566; SSE2-NEXT:    por %xmm6, %xmm2
1567; SSE2-NEXT:    movdqa %xmm2, %xmm6
1568; SSE2-NEXT:    pand %xmm5, %xmm6
1569; SSE2-NEXT:    psllw $2, %xmm6
1570; SSE2-NEXT:    pand %xmm9, %xmm6
1571; SSE2-NEXT:    pand %xmm10, %xmm2
1572; SSE2-NEXT:    psrlw $2, %xmm2
1573; SSE2-NEXT:    pand %xmm11, %xmm2
1574; SSE2-NEXT:    por %xmm6, %xmm2
1575; SSE2-NEXT:    movdqa %xmm2, %xmm6
1576; SSE2-NEXT:    pand %xmm7, %xmm6
1577; SSE2-NEXT:    psrlw $1, %xmm6
1578; SSE2-NEXT:    pand %xmm12, %xmm6
1579; SSE2-NEXT:    pand %xmm13, %xmm2
1580; SSE2-NEXT:    paddb %xmm2, %xmm2
1581; SSE2-NEXT:    por %xmm6, %xmm2
1582; SSE2-NEXT:    movdqa %xmm3, %xmm6
1583; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
1584; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
1585; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
1586; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
1587; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
1588; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
1589; SSE2-NEXT:    packuswb %xmm6, %xmm3
1590; SSE2-NEXT:    movdqa %xmm3, %xmm6
1591; SSE2-NEXT:    pand %xmm8, %xmm6
1592; SSE2-NEXT:    psllw $4, %xmm6
1593; SSE2-NEXT:    pand %xmm4, %xmm6
1594; SSE2-NEXT:    pand %xmm4, %xmm3
1595; SSE2-NEXT:    psrlw $4, %xmm3
1596; SSE2-NEXT:    pand %xmm8, %xmm3
1597; SSE2-NEXT:    por %xmm6, %xmm3
1598; SSE2-NEXT:    pand %xmm3, %xmm5
1599; SSE2-NEXT:    psllw $2, %xmm5
1600; SSE2-NEXT:    pand %xmm9, %xmm5
1601; SSE2-NEXT:    pand %xmm10, %xmm3
1602; SSE2-NEXT:    psrlw $2, %xmm3
1603; SSE2-NEXT:    pand %xmm11, %xmm3
1604; SSE2-NEXT:    por %xmm5, %xmm3
1605; SSE2-NEXT:    pand %xmm3, %xmm7
1606; SSE2-NEXT:    psrlw $1, %xmm7
1607; SSE2-NEXT:    pand %xmm12, %xmm7
1608; SSE2-NEXT:    pand %xmm13, %xmm3
1609; SSE2-NEXT:    paddb %xmm3, %xmm3
1610; SSE2-NEXT:    por %xmm7, %xmm3
1611; SSE2-NEXT:    retq
1612;
1613; SSSE3-LABEL: test_bitreverse_v32i16:
1614; SSSE3:       # BB#0:
1615; SSSE3-NEXT:    movdqa %xmm1, %xmm5
1616; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1617; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1618; SSSE3-NEXT:    pshufb %xmm8, %xmm1
1619; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1620; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1621; SSSE3-NEXT:    pand %xmm9, %xmm0
1622; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1623; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1624; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1625; SSSE3-NEXT:    psrlw $4, %xmm1
1626; SSSE3-NEXT:    pand %xmm9, %xmm1
1627; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1628; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1629; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1630; SSSE3-NEXT:    por %xmm6, %xmm0
1631; SSSE3-NEXT:    pshufb %xmm8, %xmm5
1632; SSSE3-NEXT:    movdqa %xmm5, %xmm1
1633; SSSE3-NEXT:    pand %xmm9, %xmm1
1634; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1635; SSSE3-NEXT:    pshufb %xmm1, %xmm6
1636; SSSE3-NEXT:    psrlw $4, %xmm5
1637; SSSE3-NEXT:    pand %xmm9, %xmm5
1638; SSSE3-NEXT:    movdqa %xmm4, %xmm1
1639; SSSE3-NEXT:    pshufb %xmm5, %xmm1
1640; SSSE3-NEXT:    por %xmm6, %xmm1
1641; SSSE3-NEXT:    pshufb %xmm8, %xmm2
1642; SSSE3-NEXT:    movdqa %xmm2, %xmm5
1643; SSSE3-NEXT:    pand %xmm9, %xmm5
1644; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1645; SSSE3-NEXT:    pshufb %xmm5, %xmm6
1646; SSSE3-NEXT:    psrlw $4, %xmm2
1647; SSSE3-NEXT:    pand %xmm9, %xmm2
1648; SSSE3-NEXT:    movdqa %xmm4, %xmm5
1649; SSSE3-NEXT:    pshufb %xmm2, %xmm5
1650; SSSE3-NEXT:    por %xmm6, %xmm5
1651; SSSE3-NEXT:    pshufb %xmm8, %xmm3
1652; SSSE3-NEXT:    movdqa %xmm3, %xmm2
1653; SSSE3-NEXT:    pand %xmm9, %xmm2
1654; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1655; SSSE3-NEXT:    psrlw $4, %xmm3
1656; SSSE3-NEXT:    pand %xmm9, %xmm3
1657; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1658; SSSE3-NEXT:    por %xmm7, %xmm4
1659; SSSE3-NEXT:    movdqa %xmm5, %xmm2
1660; SSSE3-NEXT:    movdqa %xmm4, %xmm3
1661; SSSE3-NEXT:    retq
1662;
1663; AVX1-LABEL: test_bitreverse_v32i16:
1664; AVX1:       # BB#0:
1665; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1666; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1667; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1668; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1669; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
1670; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1671; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1672; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1673; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
1674; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1675; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
1676; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
1677; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1678; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
1679; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1680; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1681; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
1682; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
1683; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
1684; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1685; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1686; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1687; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
1688; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1689; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1690; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
1691; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
1692; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
1693; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1694; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
1695; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
1696; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1697; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
1698; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
1699; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
1700; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1701; AVX1-NEXT:    retq
1702;
1703; AVX2-LABEL: test_bitreverse_v32i16:
1704; AVX2:       # BB#0:
1705; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1706; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1707; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1708; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
1709; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1710; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
1711; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1712; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
1713; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1714; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
1715; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
1716; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1717; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
1718; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
1719; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
1720; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
1721; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
1722; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
1723; AVX2-NEXT:    retq
1724;
1725; AVX512F-LABEL: test_bitreverse_v32i16:
1726; AVX512F:       # BB#0:
1727; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1728; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1729; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1730; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm4
1731; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1732; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
1733; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
1734; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
1735; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1736; AVX512F-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
1737; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
1738; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1739; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm2
1740; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
1741; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
1742; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
1743; AVX512F-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
1744; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
1745; AVX512F-NEXT:    retq
1746;
1747; AVX512BW-LABEL: test_bitreverse_v32i16:
1748; AVX512BW:       # BB#0:
1749; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
1750; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1751; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
1752; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1753; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
1754; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
1755; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
1756; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1757; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
1758; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
1759; AVX512BW-NEXT:    retq
1760;
1761; XOPAVX1-LABEL: test_bitreverse_v32i16:
1762; XOPAVX1:       # BB#0:
1763; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1764; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1765; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1766; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
1767; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1768; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1769; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1770; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
1771; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1772; XOPAVX1-NEXT:    retq
1773;
1774; XOPAVX2-LABEL: test_bitreverse_v32i16:
1775; XOPAVX2:       # BB#0:
1776; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
1777; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1778; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1779; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
1780; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
1781; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
1782; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1783; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
1784; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
1785; XOPAVX2-NEXT:    retq
1786  %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
1787  ret <32 x i16> %b
1788}
1789
1790define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
1791; SSE2-LABEL: test_bitreverse_v16i32:
1792; SSE2:       # BB#0:
1793; SSE2-NEXT:    pxor %xmm14, %xmm14
1794; SSE2-NEXT:    movdqa %xmm0, %xmm4
1795; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
1796; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
1797; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
1798; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
1799; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1800; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1801; SSE2-NEXT:    packuswb %xmm4, %xmm0
1802; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1803; SSE2-NEXT:    movdqa %xmm0, %xmm5
1804; SSE2-NEXT:    pand %xmm8, %xmm5
1805; SSE2-NEXT:    psllw $4, %xmm5
1806; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1807; SSE2-NEXT:    pand %xmm4, %xmm5
1808; SSE2-NEXT:    pand %xmm4, %xmm0
1809; SSE2-NEXT:    psrlw $4, %xmm0
1810; SSE2-NEXT:    pand %xmm8, %xmm0
1811; SSE2-NEXT:    por %xmm5, %xmm0
1812; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1813; SSE2-NEXT:    movdqa %xmm0, %xmm7
1814; SSE2-NEXT:    pand %xmm5, %xmm7
1815; SSE2-NEXT:    psllw $2, %xmm7
1816; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
1817; SSE2-NEXT:    pand %xmm9, %xmm7
1818; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1819; SSE2-NEXT:    pand %xmm10, %xmm0
1820; SSE2-NEXT:    psrlw $2, %xmm0
1821; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
1822; SSE2-NEXT:    pand %xmm11, %xmm0
1823; SSE2-NEXT:    por %xmm7, %xmm0
1824; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1825; SSE2-NEXT:    movdqa %xmm0, %xmm6
1826; SSE2-NEXT:    pand %xmm7, %xmm6
1827; SSE2-NEXT:    psrlw $1, %xmm6
1828; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
1829; SSE2-NEXT:    pand %xmm12, %xmm6
1830; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1831; SSE2-NEXT:    pand %xmm13, %xmm0
1832; SSE2-NEXT:    paddb %xmm0, %xmm0
1833; SSE2-NEXT:    por %xmm6, %xmm0
1834; SSE2-NEXT:    movdqa %xmm1, %xmm6
1835; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
1836; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1837; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1838; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
1839; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1840; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1841; SSE2-NEXT:    packuswb %xmm6, %xmm1
1842; SSE2-NEXT:    movdqa %xmm1, %xmm6
1843; SSE2-NEXT:    pand %xmm8, %xmm6
1844; SSE2-NEXT:    psllw $4, %xmm6
1845; SSE2-NEXT:    pand %xmm4, %xmm6
1846; SSE2-NEXT:    pand %xmm4, %xmm1
1847; SSE2-NEXT:    psrlw $4, %xmm1
1848; SSE2-NEXT:    pand %xmm8, %xmm1
1849; SSE2-NEXT:    por %xmm6, %xmm1
1850; SSE2-NEXT:    movdqa %xmm1, %xmm6
1851; SSE2-NEXT:    pand %xmm5, %xmm6
1852; SSE2-NEXT:    psllw $2, %xmm6
1853; SSE2-NEXT:    pand %xmm9, %xmm6
1854; SSE2-NEXT:    pand %xmm10, %xmm1
1855; SSE2-NEXT:    psrlw $2, %xmm1
1856; SSE2-NEXT:    pand %xmm11, %xmm1
1857; SSE2-NEXT:    por %xmm6, %xmm1
1858; SSE2-NEXT:    movdqa %xmm1, %xmm6
1859; SSE2-NEXT:    pand %xmm7, %xmm6
1860; SSE2-NEXT:    psrlw $1, %xmm6
1861; SSE2-NEXT:    pand %xmm12, %xmm6
1862; SSE2-NEXT:    pand %xmm13, %xmm1
1863; SSE2-NEXT:    paddb %xmm1, %xmm1
1864; SSE2-NEXT:    por %xmm6, %xmm1
1865; SSE2-NEXT:    movdqa %xmm2, %xmm6
1866; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
1867; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1868; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1869; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
1870; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1871; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1872; SSE2-NEXT:    packuswb %xmm6, %xmm2
1873; SSE2-NEXT:    movdqa %xmm2, %xmm6
1874; SSE2-NEXT:    pand %xmm8, %xmm6
1875; SSE2-NEXT:    psllw $4, %xmm6
1876; SSE2-NEXT:    pand %xmm4, %xmm6
1877; SSE2-NEXT:    pand %xmm4, %xmm2
1878; SSE2-NEXT:    psrlw $4, %xmm2
1879; SSE2-NEXT:    pand %xmm8, %xmm2
1880; SSE2-NEXT:    por %xmm6, %xmm2
1881; SSE2-NEXT:    movdqa %xmm2, %xmm6
1882; SSE2-NEXT:    pand %xmm5, %xmm6
1883; SSE2-NEXT:    psllw $2, %xmm6
1884; SSE2-NEXT:    pand %xmm9, %xmm6
1885; SSE2-NEXT:    pand %xmm10, %xmm2
1886; SSE2-NEXT:    psrlw $2, %xmm2
1887; SSE2-NEXT:    pand %xmm11, %xmm2
1888; SSE2-NEXT:    por %xmm6, %xmm2
1889; SSE2-NEXT:    movdqa %xmm2, %xmm6
1890; SSE2-NEXT:    pand %xmm7, %xmm6
1891; SSE2-NEXT:    psrlw $1, %xmm6
1892; SSE2-NEXT:    pand %xmm12, %xmm6
1893; SSE2-NEXT:    pand %xmm13, %xmm2
1894; SSE2-NEXT:    paddb %xmm2, %xmm2
1895; SSE2-NEXT:    por %xmm6, %xmm2
1896; SSE2-NEXT:    movdqa %xmm3, %xmm6
1897; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
1898; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1899; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1900; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
1901; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1902; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
1903; SSE2-NEXT:    packuswb %xmm6, %xmm3
1904; SSE2-NEXT:    movdqa %xmm3, %xmm6
1905; SSE2-NEXT:    pand %xmm8, %xmm6
1906; SSE2-NEXT:    psllw $4, %xmm6
1907; SSE2-NEXT:    pand %xmm4, %xmm6
1908; SSE2-NEXT:    pand %xmm4, %xmm3
1909; SSE2-NEXT:    psrlw $4, %xmm3
1910; SSE2-NEXT:    pand %xmm8, %xmm3
1911; SSE2-NEXT:    por %xmm6, %xmm3
1912; SSE2-NEXT:    pand %xmm3, %xmm5
1913; SSE2-NEXT:    psllw $2, %xmm5
1914; SSE2-NEXT:    pand %xmm9, %xmm5
1915; SSE2-NEXT:    pand %xmm10, %xmm3
1916; SSE2-NEXT:    psrlw $2, %xmm3
1917; SSE2-NEXT:    pand %xmm11, %xmm3
1918; SSE2-NEXT:    por %xmm5, %xmm3
1919; SSE2-NEXT:    pand %xmm3, %xmm7
1920; SSE2-NEXT:    psrlw $1, %xmm7
1921; SSE2-NEXT:    pand %xmm12, %xmm7
1922; SSE2-NEXT:    pand %xmm13, %xmm3
1923; SSE2-NEXT:    paddb %xmm3, %xmm3
1924; SSE2-NEXT:    por %xmm7, %xmm3
1925; SSE2-NEXT:    retq
1926;
1927; SSSE3-LABEL: test_bitreverse_v16i32:
1928; SSSE3:       # BB#0:
1929; SSSE3-NEXT:    movdqa %xmm1, %xmm5
1930; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1931; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1932; SSSE3-NEXT:    pshufb %xmm8, %xmm1
1933; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1934; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1935; SSSE3-NEXT:    pand %xmm9, %xmm0
1936; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1937; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1938; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1939; SSSE3-NEXT:    psrlw $4, %xmm1
1940; SSSE3-NEXT:    pand %xmm9, %xmm1
1941; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1942; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1943; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1944; SSSE3-NEXT:    por %xmm6, %xmm0
1945; SSSE3-NEXT:    pshufb %xmm8, %xmm5
1946; SSSE3-NEXT:    movdqa %xmm5, %xmm1
1947; SSSE3-NEXT:    pand %xmm9, %xmm1
1948; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1949; SSSE3-NEXT:    pshufb %xmm1, %xmm6
1950; SSSE3-NEXT:    psrlw $4, %xmm5
1951; SSSE3-NEXT:    pand %xmm9, %xmm5
1952; SSSE3-NEXT:    movdqa %xmm4, %xmm1
1953; SSSE3-NEXT:    pshufb %xmm5, %xmm1
1954; SSSE3-NEXT:    por %xmm6, %xmm1
1955; SSSE3-NEXT:    pshufb %xmm8, %xmm2
1956; SSSE3-NEXT:    movdqa %xmm2, %xmm5
1957; SSSE3-NEXT:    pand %xmm9, %xmm5
1958; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1959; SSSE3-NEXT:    pshufb %xmm5, %xmm6
1960; SSSE3-NEXT:    psrlw $4, %xmm2
1961; SSSE3-NEXT:    pand %xmm9, %xmm2
1962; SSSE3-NEXT:    movdqa %xmm4, %xmm5
1963; SSSE3-NEXT:    pshufb %xmm2, %xmm5
1964; SSSE3-NEXT:    por %xmm6, %xmm5
1965; SSSE3-NEXT:    pshufb %xmm8, %xmm3
1966; SSSE3-NEXT:    movdqa %xmm3, %xmm2
1967; SSSE3-NEXT:    pand %xmm9, %xmm2
1968; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1969; SSSE3-NEXT:    psrlw $4, %xmm3
1970; SSSE3-NEXT:    pand %xmm9, %xmm3
1971; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1972; SSSE3-NEXT:    por %xmm7, %xmm4
1973; SSSE3-NEXT:    movdqa %xmm5, %xmm2
1974; SSSE3-NEXT:    movdqa %xmm4, %xmm3
1975; SSSE3-NEXT:    retq
1976;
1977; AVX1-LABEL: test_bitreverse_v16i32:
1978; AVX1:       # BB#0:
1979; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1980; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1981; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1982; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1983; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
1984; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1985; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1986; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1987; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
1988; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1989; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
1990; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
1991; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1992; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
1993; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1994; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1995; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
1996; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
1997; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
1998; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1999; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2000; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2001; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2002; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2003; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2004; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2005; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2006; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2007; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2008; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
2009; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
2010; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
2011; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2012; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
2013; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
2014; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2015; AVX1-NEXT:    retq
2016;
2017; AVX2-LABEL: test_bitreverse_v16i32:
2018; AVX2:       # BB#0:
2019; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2020; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2021; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2022; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
2023; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2024; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2025; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2026; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
2027; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2028; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
2029; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
2030; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2031; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
2032; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2033; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
2034; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2035; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
2036; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
2037; AVX2-NEXT:    retq
2038;
2039; AVX512F-LABEL: test_bitreverse_v16i32:
2040; AVX512F:       # BB#0:
2041; AVX512F-NEXT:    vpsrld $24, %zmm0, %zmm1
2042; AVX512F-NEXT:    vpsrld $8, %zmm0, %zmm2
2043; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
2044; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
2045; AVX512F-NEXT:    vpslld $24, %zmm0, %zmm2
2046; AVX512F-NEXT:    vpslld $8, %zmm0, %zmm0
2047; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
2048; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
2049; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
2050; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
2051; AVX512F-NEXT:    vpslld $4, %zmm1, %zmm1
2052; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
2053; AVX512F-NEXT:    vpsrld $4, %zmm0, %zmm0
2054; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
2055; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
2056; AVX512F-NEXT:    vpslld $2, %zmm1, %zmm1
2057; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
2058; AVX512F-NEXT:    vpsrld $2, %zmm0, %zmm0
2059; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
2060; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
2061; AVX512F-NEXT:    vpslld $1, %zmm1, %zmm1
2062; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
2063; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0
2064; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
2065; AVX512F-NEXT:    retq
2066;
2067; AVX512BW-LABEL: test_bitreverse_v16i32:
2068; AVX512BW:       # BB#0:
2069; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
2070; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2071; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2072; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2073; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2074; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2075; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2076; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2077; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2078; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2079; AVX512BW-NEXT:    retq
2080;
2081; XOPAVX1-LABEL: test_bitreverse_v16i32:
2082; XOPAVX1:       # BB#0:
2083; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2084; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
2085; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2086; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2087; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2088; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2089; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2090; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2091; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2092; XOPAVX1-NEXT:    retq
2093;
2094; XOPAVX2-LABEL: test_bitreverse_v16i32:
2095; XOPAVX2:       # BB#0:
2096; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2097; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
2098; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2099; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2100; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2101; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2102; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2103; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2104; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2105; XOPAVX2-NEXT:    retq
2106  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
2107  ret <16 x i32> %b
2108}
2109
2110define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
2111; SSE2-LABEL: test_bitreverse_v8i64:
2112; SSE2:       # BB#0:
2113; SSE2-NEXT:    pxor %xmm14, %xmm14
2114; SSE2-NEXT:    movdqa %xmm0, %xmm4
2115; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
2116; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2117; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2118; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2119; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
2120; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2121; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2122; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2123; SSE2-NEXT:    packuswb %xmm4, %xmm0
2124; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2125; SSE2-NEXT:    movdqa %xmm0, %xmm5
2126; SSE2-NEXT:    pand %xmm8, %xmm5
2127; SSE2-NEXT:    psllw $4, %xmm5
2128; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
2129; SSE2-NEXT:    pand %xmm4, %xmm5
2130; SSE2-NEXT:    pand %xmm4, %xmm0
2131; SSE2-NEXT:    psrlw $4, %xmm0
2132; SSE2-NEXT:    pand %xmm8, %xmm0
2133; SSE2-NEXT:    por %xmm5, %xmm0
2134; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2135; SSE2-NEXT:    movdqa %xmm0, %xmm7
2136; SSE2-NEXT:    pand %xmm5, %xmm7
2137; SSE2-NEXT:    psllw $2, %xmm7
2138; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
2139; SSE2-NEXT:    pand %xmm9, %xmm7
2140; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
2141; SSE2-NEXT:    pand %xmm10, %xmm0
2142; SSE2-NEXT:    psrlw $2, %xmm0
2143; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
2144; SSE2-NEXT:    pand %xmm11, %xmm0
2145; SSE2-NEXT:    por %xmm7, %xmm0
2146; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
2147; SSE2-NEXT:    movdqa %xmm0, %xmm6
2148; SSE2-NEXT:    pand %xmm7, %xmm6
2149; SSE2-NEXT:    psrlw $1, %xmm6
2150; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
2151; SSE2-NEXT:    pand %xmm12, %xmm6
2152; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2153; SSE2-NEXT:    pand %xmm13, %xmm0
2154; SSE2-NEXT:    paddb %xmm0, %xmm0
2155; SSE2-NEXT:    por %xmm6, %xmm0
2156; SSE2-NEXT:    movdqa %xmm1, %xmm6
2157; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
2158; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
2159; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
2160; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
2161; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
2162; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2163; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2164; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2165; SSE2-NEXT:    packuswb %xmm6, %xmm1
2166; SSE2-NEXT:    movdqa %xmm1, %xmm6
2167; SSE2-NEXT:    pand %xmm8, %xmm6
2168; SSE2-NEXT:    psllw $4, %xmm6
2169; SSE2-NEXT:    pand %xmm4, %xmm6
2170; SSE2-NEXT:    pand %xmm4, %xmm1
2171; SSE2-NEXT:    psrlw $4, %xmm1
2172; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm11, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm11, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm2
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    por %xmm6, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    pand %xmm9, %xmm5
; SSE2-NEXT:    pand %xmm10, %xmm3
; SSE2-NEXT:    psrlw $2, %xmm3
; SSE2-NEXT:    pand %xmm11, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm7
; SSE2-NEXT:    psrlw $1, %xmm7
; SSE2-NEXT:    pand %xmm12, %xmm7
; SSE2-NEXT:    pand %xmm13, %xmm3
; SSE2-NEXT:    paddb %xmm3, %xmm3
; SSE2-NEXT:    por %xmm7, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i64:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm5
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    pshufb %xmm8, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm9, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    por %xmm6, %xmm0
; SSSE3-NEXT:    pshufb %xmm8, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm1, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm5, %xmm1
; SSSE3-NEXT:    por %xmm6, %xmm1
; SSSE3-NEXT:    pshufb %xmm8, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm5, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm5
; SSSE3-NEXT:    pshufb %xmm2, %xmm5
; SSSE3-NEXT:    por %xmm6, %xmm5
; SSSE3-NEXT:    pshufb %xmm8, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm3
; SSSE3-NEXT:    pand %xmm9, %xmm3
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    por %xmm7, %xmm4
; SSSE3-NEXT:    movdqa %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm3
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v8i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v8i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_bitreverse_v8i64:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpsrlq $56, %zmm0, %zmm1
; AVX512F-NEXT:    vpsrlq $40, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpsrlq $24, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vpsrlq $8, %zmm0, %zmm3
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vporq %zmm1, %zmm3, %zmm1
; AVX512F-NEXT:    vpsllq $8, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vpsllq $24, %zmm0, %zmm3
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
; AVX512F-NEXT:    vporq %zmm2, %zmm3, %zmm2
; AVX512F-NEXT:    vpsllq $56, %zmm0, %zmm3
; AVX512F-NEXT:    vpsllq $40, %zmm0, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; AVX512F-NEXT:    vpsllq $4, %zmm1, %zmm1
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrlq $4, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; AVX512F-NEXT:    vpsllq $2, %zmm1, %zmm1
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrlq $2, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; AVX512F-NEXT:    vpsllq $1, %zmm1, %zmm1
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrlq $1, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_bitreverse_v8i64:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
  ret <8 x i64> %b
}
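
; NOTE: The function below is an illustrative sketch, not part of the
; generated assertions; the name @bitreverse_i8_manual is made up and no RUN
; or CHECK line refers to it. The vector expansions checked above first
; reverse the bytes of each 64-bit element and then swap nibbles, bit pairs,
; and single bits with mask/shift/or steps (SSE2 on bytes, AVX512F on whole
; quadwords), or look up reversed nibbles with pshufb (SSSE3/AVX/AVX512BW),
; while XOP's vpperm reverses bytes and bits in one instruction. The IR below
; spells out the same swap-nibbles, swap-pairs, swap-bits idea for one i8.
define i8 @bitreverse_i8_manual(i8 %a) nounwind {
  ; Swap the two nibbles.
  %hi = lshr i8 %a, 4
  %lo = shl i8 %a, 4
  %n = or i8 %lo, %hi
  ; Swap bit pairs using the 0x33/0xCC masks.
  %p0 = and i8 %n, 51
  %p1 = shl i8 %p0, 2
  %p2 = and i8 %n, -52
  %p3 = lshr i8 %p2, 2
  %p = or i8 %p1, %p3
  ; Swap adjacent bits using the 0x55/0xAA masks.
  %q0 = and i8 %p, 85
  %q1 = shl i8 %q0, 1
  %q2 = and i8 %p, -86
  %q3 = lshr i8 %q2, 1
  %r = or i8 %q1, %q3
  ret i8 %r
}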

declare i8 @llvm.bitreverse.i8(i8) readnone
declare i16 @llvm.bitreverse.i16(i16) readnone
declare i32 @llvm.bitreverse.i32(i32) readnone
declare i64 @llvm.bitreverse.i64(i64) readnone

declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone

declare <32 x i8>  @llvm.bitreverse.v32i8(<32 x i8>) readnone
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
declare <8 x i32>  @llvm.bitreverse.v8i32(<8 x i32>) readnone
declare <4 x i64>  @llvm.bitreverse.v4i64(<4 x i64>) readnone

declare <64 x i8>  @llvm.bitreverse.v64i8(<64 x i8>) readnone
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
declare <8 x i64>  @llvm.bitreverse.v8i64(<8 x i64>) readnone
