; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2

; Make sure we don't crash with avx512bw and xop
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw

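; i8: no byte swap is needed, so the expected lowering just rotates the
; nibbles into place (rolb $4) and then swaps adjacent bit-pairs and adjacent
; bits with and/shift/or sequences. On XOP targets the whole reversal is a
; single VPPERM whose constant-pool selector uses the bit-reversed-source
; control encoding.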
define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE:       # %bb.0:
; SSE-NEXT:    rolb $4, %dil
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andb $51, %al
; SSE-NEXT:    shlb $2, %al
; SSE-NEXT:    andb $-52, %dil
; SSE-NEXT:    shrb $2, %dil
; SSE-NEXT:    orb %al, %dil
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andb $85, %al
; SSE-NEXT:    addb %al, %al
; SSE-NEXT:    andb $-86, %dil
; SSE-NEXT:    shrb %dil
; SSE-NEXT:    orb %al, %dil
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX:       # %bb.0:
; AVX-NEXT:    rolb $4, %dil
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andb $51, %al
; AVX-NEXT:    shlb $2, %al
; AVX-NEXT:    andb $-52, %dil
; AVX-NEXT:    shrb $2, %dil
; AVX-NEXT:    orb %al, %dil
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andb $85, %al
; AVX-NEXT:    addb %al, %al
; AVX-NEXT:    andb $-86, %dil
; AVX-NEXT:    shrb %dil
; AVX-NEXT:    orb %al, %dil
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vpextrb $0, %xmm0, %eax
; XOP-NEXT:    # kill: def $al killed $al killed $eax
; XOP-NEXT:    retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}

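; i16: the byte swap becomes a rolw $8; the nibble/pair/bit swaps use 16-bit
; mask immediates (0xF0F, 0x3333, 0x5555), with shift+or pairs folded into LEAs.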
define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    rolw $8, %di
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $3855, %eax # imm = 0xF0F
; SSE-NEXT:    shll $4, %eax
; SSE-NEXT:    andl $61680, %edi # imm = 0xF0F0
; SSE-NEXT:    shrl $4, %edi
; SSE-NEXT:    orl %eax, %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $13107, %eax # imm = 0x3333
; SSE-NEXT:    andl $52428, %edi # imm = 0xCCCC
; SSE-NEXT:    shrl $2, %edi
; SSE-NEXT:    leal (%rdi,%rax,4), %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $21845, %ecx # imm = 0x5555
; SSE-NEXT:    andl $43690, %eax # imm = 0xAAAA
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    leal (%rax,%rcx,2), %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    rolw $8, %di
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $3855, %eax # imm = 0xF0F
; AVX-NEXT:    shll $4, %eax
; AVX-NEXT:    andl $61680, %edi # imm = 0xF0F0
; AVX-NEXT:    shrl $4, %edi
; AVX-NEXT:    orl %eax, %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $13107, %eax # imm = 0x3333
; AVX-NEXT:    andl $52428, %edi # imm = 0xCCCC
; AVX-NEXT:    shrl $2, %edi
; AVX-NEXT:    leal (%rdi,%rax,4), %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $21845, %ecx # imm = 0x5555
; AVX-NEXT:    andl $43690, %eax # imm = 0xAAAA
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    leal (%rax,%rcx,2), %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    # kill: def $ax killed $ax killed $eax
; XOP-NEXT:    retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}

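; i32: same pattern with bswapl and 32-bit mask immediates.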
define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    bswapl %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; SSE-NEXT:    shll $4, %eax
; SSE-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
; SSE-NEXT:    shrl $4, %edi
; SSE-NEXT:    orl %eax, %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
; SSE-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
; SSE-NEXT:    shrl $2, %edi
; SSE-NEXT:    leal (%rdi,%rax,4), %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; SSE-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    leal (%rax,%rcx,2), %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    bswapl %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; AVX-NEXT:    shll $4, %eax
; AVX-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
; AVX-NEXT:    shrl $4, %edi
; AVX-NEXT:    orl %eax, %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
; AVX-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
; AVX-NEXT:    shrl $2, %edi
; AVX-NEXT:    leal (%rdi,%rax,4), %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; AVX-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    leal (%rax,%rcx,2), %eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}

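; i64: bswapq, with the 64-bit masks materialized through movabsq since they
; don't fit in and-immediates.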
define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    bswapq %rdi
; SSE-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; SSE-NEXT:    andq %rdi, %rax
; SSE-NEXT:    shlq $4, %rax
; SSE-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; SSE-NEXT:    andq %rdi, %rcx
; SSE-NEXT:    shrq $4, %rcx
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE-NEXT:    andq %rcx, %rax
; SSE-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; SSE-NEXT:    andq %rcx, %rdx
; SSE-NEXT:    shrq $2, %rdx
; SSE-NEXT:    leaq (%rdx,%rax,4), %rax
; SSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE-NEXT:    andq %rax, %rcx
; SSE-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; SSE-NEXT:    andq %rax, %rdx
; SSE-NEXT:    shrq %rdx
; SSE-NEXT:    leaq (%rdx,%rcx,2), %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    bswapq %rdi
; AVX-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; AVX-NEXT:    andq %rdi, %rax
; AVX-NEXT:    shlq $4, %rax
; AVX-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; AVX-NEXT:    andq %rdi, %rcx
; AVX-NEXT:    shrq $4, %rcx
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; AVX-NEXT:    andq %rcx, %rax
; AVX-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; AVX-NEXT:    andq %rcx, %rdx
; AVX-NEXT:    shrq $2, %rdx
; AVX-NEXT:    leaq (%rdx,%rax,4), %rax
; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; AVX-NEXT:    andq %rax, %rcx
; AVX-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; AVX-NEXT:    andq %rax, %rdx
; AVX-NEXT:    shrq %rdx
; AVX-NEXT:    leaq (%rdx,%rcx,2), %rax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovq %xmm0, %rax
; XOP-NEXT:    retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}

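; v16i8: no byte reordering is needed. SSE2 emulates per-byte shifts with
; psllw/psrlw plus masking; SSSE3/AVX combine two PSHUFB nibble lookup tables
; (one indexed by the low nibble, one by the high nibble) with a POR; XOP is
; a single VPPERM.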
define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psllw $4, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
  ret <16 x i8> %b
}

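; v8i16: the bytes within each element are swapped first (pshuflw/pshufhw on
; SSE2, one pshufb on SSSE3/AVX), then the per-byte reversal proceeds as for
; v16i8. XOP folds the byte swap into the VPPERM selector.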
define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psllw $4, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  ret <8 x i16> %b
}

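; v4i32: as v8i16, with a 4-bytes-per-element swap.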
define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psllw $4, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
  ret <4 x i32> %b
}

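; v2i64: the SSE2 byte swap additionally needs a pshufd to exchange the two
; 32-bit halves of each element.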
define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psllw $4, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
  ret <2 x i64> %b
}

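; 256-bit tests: SSE targets and AVX1/XOP, which lack 256-bit integer
; operations, process the two 128-bit halves separately; AVX2 and AVX512 use
; full-width ymm instructions.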
define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    psllw $4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm5, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    psllw $2, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm9, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT:    pand %xmm10, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    pand %xmm4, %xmm7
; SSE2-NEXT:    psrlw $1, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT:    pand %xmm11, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm6, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm7
; SSE2-NEXT:    pand %xmm2, %xmm7
; SSE2-NEXT:    psllw $4, %xmm7
; SSE2-NEXT:    pand %xmm5, %xmm7
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    por %xmm7, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    psllw $2, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    pand %xmm9, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm4
; SSE2-NEXT:    psrlw $1, %xmm4
; SSE2-NEXT:    pand %xmm11, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v32i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm5, %xmm6
; SSSE3-NEXT:    pshufb %xmm2, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm6, %xmm3
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm5
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
  ret <32 x i8> %b
}

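; v16i16: as v8i16, applied per 128-bit half where necessary.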
define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    psllw $4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm6, %xmm3
; SSE2-NEXT:    pand %xmm6, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm9, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT:    pand %xmm10, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    pand %xmm5, %xmm7
; SSE2-NEXT:    psrlw $1, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT:    pand %xmm11, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm12, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm7
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm7, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm2, %xmm4
; SSE2-NEXT:    psllw $4, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    psllw $2, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    pand %xmm9, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm5
; SSE2-NEXT:    psrlw $1, %xmm5
; SSE2-NEXT:    pand %xmm11, %xmm5
; SSE2-NEXT:    pand %xmm12, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm5, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm6, %xmm7
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm7, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm5, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
  ret <16 x i16> %b
}

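; v8i32: as v4i32, applied per 128-bit half where necessary.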
define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    psllw $4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm6, %xmm3
; SSE2-NEXT:    pand %xmm6, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm9, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT:    pand %xmm10, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    pand %xmm5, %xmm7
; SSE2-NEXT:    psrlw $1, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT:    pand %xmm11, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm12, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm7
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm7, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm2, %xmm4
; SSE2-NEXT:    psllw $4, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    psllw $2, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    pand %xmm9, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm5
; SSE2-NEXT:    psrlw $1, %xmm5
; SSE2-NEXT:    pand %xmm11, %xmm5
; SSE2-NEXT:    pand %xmm12, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm5, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm6, %xmm7
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm7, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm5, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
  ret <8 x i32> %b
}

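; v4i64: as v2i64, applied per 128-bit half where necessary.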
1036define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
1037; SSE2-LABEL: test_bitreverse_v4i64:
1038; SSE2:       # %bb.0:
1039; SSE2-NEXT:    pxor %xmm4, %xmm4
1040; SSE2-NEXT:    movdqa %xmm0, %xmm2
1041; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
1042; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
1043; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1044; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1045; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1046; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1047; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1048; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1049; SSE2-NEXT:    packuswb %xmm2, %xmm0
1050; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1051; SSE2-NEXT:    movdqa %xmm0, %xmm3
1052; SSE2-NEXT:    pand %xmm2, %xmm3
1053; SSE2-NEXT:    psllw $4, %xmm3
1054; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1055; SSE2-NEXT:    pand %xmm6, %xmm3
1056; SSE2-NEXT:    pand %xmm6, %xmm0
1057; SSE2-NEXT:    psrlw $4, %xmm0
1058; SSE2-NEXT:    pand %xmm2, %xmm0
1059; SSE2-NEXT:    por %xmm3, %xmm0
1060; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1061; SSE2-NEXT:    movdqa %xmm0, %xmm5
1062; SSE2-NEXT:    pand %xmm3, %xmm5
1063; SSE2-NEXT:    psllw $2, %xmm5
1064; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
1065; SSE2-NEXT:    pand %xmm8, %xmm5
1066; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1067; SSE2-NEXT:    pand %xmm9, %xmm0
1068; SSE2-NEXT:    psrlw $2, %xmm0
1069; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
1070; SSE2-NEXT:    pand %xmm10, %xmm0
1071; SSE2-NEXT:    por %xmm5, %xmm0
1072; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1073; SSE2-NEXT:    movdqa %xmm0, %xmm7
1074; SSE2-NEXT:    pand %xmm5, %xmm7
1075; SSE2-NEXT:    psrlw $1, %xmm7
1076; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
1077; SSE2-NEXT:    pand %xmm11, %xmm7
1078; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1079; SSE2-NEXT:    pand %xmm12, %xmm0
1080; SSE2-NEXT:    paddb %xmm0, %xmm0
1081; SSE2-NEXT:    por %xmm7, %xmm0
1082; SSE2-NEXT:    movdqa %xmm1, %xmm7
1083; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
1084; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,3,0,1]
1085; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7]
1086; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm7, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm2, %xmm4
; SSE2-NEXT:    psllw $4, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    psllw $2, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    pand %xmm9, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm5
; SSE2-NEXT:    psrlw $1, %xmm5
; SSE2-NEXT:    pand %xmm11, %xmm5
; SSE2-NEXT:    pand %xmm12, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v4i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm5, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm6, %xmm7
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm7, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm5, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
  ret <4 x i64> %b
}

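; v64i8 exercises the widest byte vector. Below AVX512BW there is no 512-bit
; byte shuffle, so the lowering repeats the 128-bit (SSE) or 256-bit (AVX2)
; pattern per register, while XOP needs just one vpperm per 128-bit half
; (its control bytes 0x50-0x5F select the bit-reversed source byte). A scalar
; sketch of the nibble-LUT scheme used from SSSE3 onwards (names are
; illustrative, not part of the test):
;   rev = LUT_HI[b & 0xF] | LUT_LO[b >> 4]
; where LUT_HI[i] = bitreverse4(i) << 4 and LUT_LO[i] = bitreverse4(i).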
define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v64i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pand %xmm13, %xmm5
; SSE2-NEXT:    psllw $4, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm7, %xmm5
; SSE2-NEXT:    pand %xmm7, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm13, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm9, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT:    pand %xmm10, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    psrlw $1, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT:    pand %xmm11, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm12, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm13, %xmm4
; SSE2-NEXT:    psllw $4, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm13, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm5, %xmm4
; SSE2-NEXT:    psllw $2, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    pand %xmm9, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    psrlw $1, %xmm4
; SSE2-NEXT:    pand %xmm11, %xmm4
; SSE2-NEXT:    pand %xmm12, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pand %xmm13, %xmm4
; SSE2-NEXT:    psllw $4, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm2
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm13, %xmm2
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pand %xmm5, %xmm4
; SSE2-NEXT:    psllw $2, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    pand %xmm9, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm10, %xmm2
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    psrlw $1, %xmm4
; SSE2-NEXT:    pand %xmm11, %xmm4
; SSE2-NEXT:    pand %xmm12, %xmm2
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pand %xmm13, %xmm4
; SSE2-NEXT:    psllw $4, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    pand %xmm13, %xmm3
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    pand %xmm9, %xmm3
; SSE2-NEXT:    psrlw $2, %xmm3
; SSE2-NEXT:    pand %xmm10, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm11, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm3
; SSE2-NEXT:    paddb %xmm3, %xmm3
; SSE2-NEXT:    por %xmm6, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v64i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm5
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    pand %xmm8, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm9, %xmm6
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm5
; SSSE3-NEXT:    pand %xmm8, %xmm5
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm5, %xmm0
; SSSE3-NEXT:    por %xmm6, %xmm0
; SSSE3-NEXT:    movdqa %xmm1, %xmm5
; SSSE3-NEXT:    pand %xmm8, %xmm5
; SSSE3-NEXT:    movdqa %xmm9, %xmm6
; SSSE3-NEXT:    pshufb %xmm5, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm8, %xmm1
; SSSE3-NEXT:    movdqa %xmm4, %xmm5
; SSSE3-NEXT:    pshufb %xmm1, %xmm5
; SSSE3-NEXT:    por %xmm6, %xmm5
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    pand %xmm8, %xmm1
; SSSE3-NEXT:    movdqa %xmm9, %xmm7
; SSSE3-NEXT:    pshufb %xmm1, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm8, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm6
; SSSE3-NEXT:    pshufb %xmm2, %xmm6
; SSSE3-NEXT:    por %xmm7, %xmm6
; SSSE3-NEXT:    movdqa %xmm3, %xmm1
; SSSE3-NEXT:    pand %xmm8, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm9
; SSSE3-NEXT:    psrlw $4, %xmm3
; SSSE3-NEXT:    pand %xmm8, %xmm3
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    por %xmm9, %xmm4
; SSSE3-NEXT:    movdqa %xmm5, %xmm1
; SSSE3-NEXT:    movdqa %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm3
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v64i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm4
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v64i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_bitreverse_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm0
; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_bitreverse_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v64i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v64i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
  %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
  ret <64 x i8> %b
}

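; v32i16 prepends a per-element byte swap to the common per-byte reversal:
; pshuflw/pshufhw word swaps after unpacking with zeros on SSE2, a single
; pshufb from SSSE3 onwards, and one vpperm on XOP.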
define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm14, %xmm14
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm4, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    psllw $4, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    pand %xmm5, %xmm7
; SSE2-NEXT:    psllw $2, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT:    pand %xmm9, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm10, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT:    pand %xmm11, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm13, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm11, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm11, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm2
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm6, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    por %xmm6, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    pand %xmm9, %xmm5
; SSE2-NEXT:    pand %xmm10, %xmm3
; SSE2-NEXT:    psrlw $2, %xmm3
; SSE2-NEXT:    pand %xmm11, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm7
; SSE2-NEXT:    psrlw $1, %xmm7
; SSE2-NEXT:    pand %xmm12, %xmm7
; SSE2-NEXT:    pand %xmm13, %xmm3
; SSE2-NEXT:    paddb %xmm3, %xmm3
; SSE2-NEXT:    por %xmm7, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v32i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm5
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    pshufb %xmm8, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm9, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    por %xmm6, %xmm0
; SSSE3-NEXT:    pshufb %xmm8, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm1, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm5, %xmm1
; SSSE3-NEXT:    por %xmm6, %xmm1
; SSSE3-NEXT:    pshufb %xmm8, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm5, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm5
; SSSE3-NEXT:    pshufb %xmm2, %xmm5
; SSSE3-NEXT:    por %xmm6, %xmm5
; SSSE3-NEXT:    pshufb %xmm8, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm3
; SSSE3-NEXT:    pand %xmm9, %xmm3
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    por %xmm7, %xmm4
; SSSE3-NEXT:    movdqa %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm3
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v32i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_bitreverse_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm2
; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_bitreverse_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
  %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
  ret <32 x i16> %b
}

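; v16i32 follows the same shape with a 4-byte-per-element swap. The AVX512F
; lowering below is the outlier: vpshufb on zmm requires AVX512BW, so it
; builds the 32-bit bswap from shifts and broadcast masks and then runs the
; three shift-and-mask bit-swap stages on dword lanes instead.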
define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm14, %xmm14
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm4, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    psllw $4, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    pand %xmm5, %xmm7
; SSE2-NEXT:    psllw $2, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT:    pand %xmm9, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm10, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT:    pand %xmm11, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm13, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm11, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm11, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm2
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    por %xmm6, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    pand %xmm9, %xmm5
; SSE2-NEXT:    pand %xmm10, %xmm3
; SSE2-NEXT:    psrlw $2, %xmm3
; SSE2-NEXT:    pand %xmm11, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm7
; SSE2-NEXT:    psrlw $1, %xmm7
; SSE2-NEXT:    pand %xmm12, %xmm7
; SSE2-NEXT:    pand %xmm13, %xmm3
; SSE2-NEXT:    paddb %xmm3, %xmm3
; SSE2-NEXT:    por %xmm7, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm5
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    pshufb %xmm8, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm9, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    por %xmm6, %xmm0
; SSSE3-NEXT:    pshufb %xmm8, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm1, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm5, %xmm1
; SSSE3-NEXT:    por %xmm6, %xmm1
; SSSE3-NEXT:    pshufb %xmm8, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm5, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm5
; SSSE3-NEXT:    pshufb %xmm2, %xmm5
; SSSE3-NEXT:    por %xmm6, %xmm5
; SSSE3-NEXT:    pshufb %xmm8, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm3
; SSSE3-NEXT:    pand %xmm9, %xmm3
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    por %xmm7, %xmm4
; SSSE3-NEXT:    movdqa %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm3
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_bitreverse_v16i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrld $24, %zmm0, %zmm1
; AVX512F-NEXT:    vpsrld $8, %zmm0, %zmm2
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpslld $24, %zmm0, %zmm2
; AVX512F-NEXT:    vpslld $8, %zmm0, %zmm0
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
; AVX512F-NEXT:    vpslld $4, %zmm1, %zmm1
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrld $4, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
; AVX512F-NEXT:    vpslld $2, %zmm1, %zmm1
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrld $2, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
; AVX512F-NEXT:    vpslld $1, %zmm1, %zmm1
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_bitreverse_v16i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
  ret <16 x i32> %b
}

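; v8i64 swaps 8 bytes per element; on SSE2 that takes an extra pshufd to swap
; the 32-bit halves on top of the pshuflw/pshufhw word reversals used above.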
2113define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
2114; SSE2-LABEL: test_bitreverse_v8i64:
2115; SSE2:       # %bb.0:
2116; SSE2-NEXT:    pxor %xmm14, %xmm14
2117; SSE2-NEXT:    movdqa %xmm0, %xmm4
2118; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
2119; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2120; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2121; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2122; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
2123; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2124; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2125; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2126; SSE2-NEXT:    packuswb %xmm4, %xmm0
2127; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2128; SSE2-NEXT:    movdqa %xmm0, %xmm5
2129; SSE2-NEXT:    pand %xmm8, %xmm5
2130; SSE2-NEXT:    psllw $4, %xmm5
2131; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
2132; SSE2-NEXT:    pand %xmm4, %xmm5
2133; SSE2-NEXT:    pand %xmm4, %xmm0
2134; SSE2-NEXT:    psrlw $4, %xmm0
2135; SSE2-NEXT:    pand %xmm8, %xmm0
2136; SSE2-NEXT:    por %xmm5, %xmm0
2137; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2138; SSE2-NEXT:    movdqa %xmm0, %xmm7
2139; SSE2-NEXT:    pand %xmm5, %xmm7
2140; SSE2-NEXT:    psllw $2, %xmm7
2141; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
2142; SSE2-NEXT:    pand %xmm9, %xmm7
2143; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
2144; SSE2-NEXT:    pand %xmm10, %xmm0
2145; SSE2-NEXT:    psrlw $2, %xmm0
2146; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
2147; SSE2-NEXT:    pand %xmm11, %xmm0
2148; SSE2-NEXT:    por %xmm7, %xmm0
2149; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
2150; SSE2-NEXT:    movdqa %xmm0, %xmm6
2151; SSE2-NEXT:    pand %xmm7, %xmm6
2152; SSE2-NEXT:    psrlw $1, %xmm6
2153; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
2154; SSE2-NEXT:    pand %xmm12, %xmm6
2155; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2156; SSE2-NEXT:    pand %xmm13, %xmm0
2157; SSE2-NEXT:    paddb %xmm0, %xmm0
2158; SSE2-NEXT:    por %xmm6, %xmm0
2159; SSE2-NEXT:    movdqa %xmm1, %xmm6
2160; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
2161; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
2162; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
2163; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
2164; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
2165; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2166; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2167; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2168; SSE2-NEXT:    packuswb %xmm6, %xmm1
2169; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm11, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm11, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm2
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    por %xmm6, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    pand %xmm9, %xmm5
; SSE2-NEXT:    pand %xmm10, %xmm3
; SSE2-NEXT:    psrlw $2, %xmm3
; SSE2-NEXT:    pand %xmm11, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm7
; SSE2-NEXT:    psrlw $1, %xmm7
; SSE2-NEXT:    pand %xmm12, %xmm7
; SSE2-NEXT:    pand %xmm13, %xmm3
; SSE2-NEXT:    paddb %xmm3, %xmm3
; SSE2-NEXT:    por %xmm7, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm5
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    pshufb %xmm8, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm9, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    por %xmm6, %xmm0
; SSSE3-NEXT:    pshufb %xmm8, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm1, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm5, %xmm1
; SSSE3-NEXT:    por %xmm6, %xmm1
; SSSE3-NEXT:    pshufb %xmm8, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm5, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm5
; SSSE3-NEXT:    pshufb %xmm2, %xmm5
; SSSE3-NEXT:    por %xmm6, %xmm5
; SSSE3-NEXT:    pshufb %xmm8, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm3
; SSSE3-NEXT:    pand %xmm9, %xmm3
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    por %xmm7, %xmm4
; SSSE3-NEXT:    movdqa %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm3
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_bitreverse_v8i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlq $56, %zmm0, %zmm1
; AVX512F-NEXT:    vpsrlq $40, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpsrlq $24, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpsrlq $8, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpsllq $8, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vpsllq $24, %zmm0, %zmm3
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
; AVX512F-NEXT:    vporq %zmm2, %zmm3, %zmm2
; AVX512F-NEXT:    vpsllq $56, %zmm0, %zmm3
; AVX512F-NEXT:    vpsllq $40, %zmm0, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; AVX512F-NEXT:    vpsllq $4, %zmm1, %zmm1
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrlq $4, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; AVX512F-NEXT:    vpsllq $2, %zmm1, %zmm1
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrlq $2, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; AVX512F-NEXT:    vpsllq $1, %zmm1, %zmm1
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrlq $1, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_bitreverse_v8i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
  ret <8 x i64> %b
}
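
; In the SSSE3/AVX lowerings of test_bitreverse_v8i64 above, each i64 is
; byte-swapped with (v)pshufb and each byte is then bit-reversed via two
; 16-entry nibble lookup tables ([0,128,64,192,...] for the low nibble,
; [0,8,4,12,...] for the high nibble). The XOP lowerings use vpperm instead:
; its selector bytes in the 0x50-0x5F range (80-95) return the selected
; source byte with its bits reversed, so a single vpperm per 128-bit lane
; performs both the per-qword byte swap and the per-byte bit reversal.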

;
; Constant Folding
;

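; bitreverse(0xFF00FF00) is 0x00FF00FF (16711935), so this call should fold
; to a single immediate move; the vector cases below should likewise fold to
; plain constant loads with no bit-manipulation code.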
define i32 @fold_bitreverse_i32() nounwind {
; ALL-LABEL: fold_bitreverse_i32:
; ALL:       # %bb.0:
; ALL-NEXT:    movl $16711935, %eax # imm = 0xFF00FF
; ALL-NEXT:    retq
  %b = call i32 @llvm.bitreverse.i32(i32 4278255360)
  ret i32 %b
}

define <16 x i8> @fold_bitreverse_v16i8() nounwind {
; SSE-LABEL: fold_bitreverse_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_bitreverse_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; AVX-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; XOP-NEXT:    retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
  ret <16 x i8> %b
}

define <16 x i16> @fold_bitreverse_v16i16() nounwind {
; SSE-LABEL: fold_bitreverse_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_bitreverse_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; AVX-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; XOP-NEXT:    retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
  ret <16 x i16> %b
}

define <16 x i32> @fold_bitreverse_v16i32() nounwind {
; SSE-LABEL: fold_bitreverse_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
; SSE-NEXT:    retq
;
; AVX1-LABEL: fold_bitreverse_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: fold_bitreverse_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX2-NEXT:    retq
;
; AVX512-LABEL: fold_bitreverse_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX512-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; XOP-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; XOP-NEXT:    retq
  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
  ret <16 x i32> %b
}

declare i8 @llvm.bitreverse.i8(i8) readnone
declare i16 @llvm.bitreverse.i16(i16) readnone
declare i32 @llvm.bitreverse.i32(i32) readnone
declare i64 @llvm.bitreverse.i64(i64) readnone

declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone

declare <32 x i8>  @llvm.bitreverse.v32i8(<32 x i8>) readnone
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
declare <8 x i32>  @llvm.bitreverse.v8i32(<8 x i32>) readnone
declare <4 x i64>  @llvm.bitreverse.v4i64(<4 x i64>) readnone

declare <64 x i8>  @llvm.bitreverse.v64i8(<64 x i8>) readnone
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
declare <8 x i64>  @llvm.bitreverse.v8i64(<8 x i64>) readnone
