; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2

; Make sure we don't crash when both avx512bw and xop are enabled.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw

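; The scalar tests below expect the generic SWAR lowering: swap the bytes
; first (rolw $8 / bswap), then exchange nibbles with the 0x0F/0xF0 masks,
; 2-bit pairs with the 0x33/0xCC masks, and adjacent bits with the 0x55/0xAA
; masks (i8 starts with rolb $4 since it holds only two nibbles). On XOP
; targets a single VPPERM with a constant-pool control vector performs the
; whole bit reversal.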
define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    rolb $4, %al
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andb $51, %cl
; SSE-NEXT:    shlb $2, %cl
; SSE-NEXT:    andb $-52, %al
; SSE-NEXT:    shrb $2, %al
; SSE-NEXT:    orb %cl, %al
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andb $85, %cl
; SSE-NEXT:    addb %cl, %cl
; SSE-NEXT:    andb $-86, %al
; SSE-NEXT:    shrb %al
; SSE-NEXT:    orb %cl, %al
; SSE-NEXT:    # kill: def $al killed $al killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX:       # %bb.0:
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    rolb $4, %al
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andb $51, %cl
; AVX-NEXT:    shlb $2, %cl
; AVX-NEXT:    andb $-52, %al
; AVX-NEXT:    shrb $2, %al
; AVX-NEXT:    orb %cl, %al
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andb $85, %cl
; AVX-NEXT:    addb %cl, %cl
; AVX-NEXT:    andb $-86, %al
; AVX-NEXT:    shrb %al
; AVX-NEXT:    orb %cl, %al
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vpextrb $0, %xmm0, %eax
; XOP-NEXT:    # kill: def $al killed $al killed $eax
; XOP-NEXT:    retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}

define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    rolw $8, %di
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $3855, %eax # imm = 0xF0F
; SSE-NEXT:    shll $4, %eax
; SSE-NEXT:    andl $61680, %edi # imm = 0xF0F0
; SSE-NEXT:    shrl $4, %edi
; SSE-NEXT:    orl %eax, %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $13107, %eax # imm = 0x3333
; SSE-NEXT:    andl $52428, %edi # imm = 0xCCCC
; SSE-NEXT:    shrl $2, %edi
; SSE-NEXT:    leal (%rdi,%rax,4), %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $21845, %ecx # imm = 0x5555
; SSE-NEXT:    andl $43690, %eax # imm = 0xAAAA
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    leal (%rax,%rcx,2), %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    rolw $8, %di
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $3855, %eax # imm = 0xF0F
; AVX-NEXT:    shll $4, %eax
; AVX-NEXT:    andl $61680, %edi # imm = 0xF0F0
; AVX-NEXT:    shrl $4, %edi
; AVX-NEXT:    orl %eax, %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $13107, %eax # imm = 0x3333
; AVX-NEXT:    andl $52428, %edi # imm = 0xCCCC
; AVX-NEXT:    shrl $2, %edi
; AVX-NEXT:    leal (%rdi,%rax,4), %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $21845, %ecx # imm = 0x5555
; AVX-NEXT:    andl $43690, %eax # imm = 0xAAAA
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    leal (%rax,%rcx,2), %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    # kill: def $ax killed $ax killed $eax
; XOP-NEXT:    retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}

define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    bswapl %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; SSE-NEXT:    shll $4, %eax
; SSE-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
; SSE-NEXT:    shrl $4, %edi
; SSE-NEXT:    orl %eax, %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
; SSE-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
; SSE-NEXT:    shrl $2, %edi
; SSE-NEXT:    leal (%rdi,%rax,4), %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; SSE-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    leal (%rax,%rcx,2), %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    bswapl %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; AVX-NEXT:    shll $4, %eax
; AVX-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
; AVX-NEXT:    shrl $4, %edi
; AVX-NEXT:    orl %eax, %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
; AVX-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
; AVX-NEXT:    shrl $2, %edi
; AVX-NEXT:    leal (%rdi,%rax,4), %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; AVX-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    leal (%rax,%rcx,2), %eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}

define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    bswapq %rdi
; SSE-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; SSE-NEXT:    andq %rdi, %rax
; SSE-NEXT:    shlq $4, %rax
; SSE-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; SSE-NEXT:    andq %rdi, %rcx
; SSE-NEXT:    shrq $4, %rcx
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE-NEXT:    andq %rcx, %rax
; SSE-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; SSE-NEXT:    andq %rcx, %rdx
; SSE-NEXT:    shrq $2, %rdx
; SSE-NEXT:    leaq (%rdx,%rax,4), %rax
; SSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE-NEXT:    andq %rax, %rcx
; SSE-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; SSE-NEXT:    andq %rax, %rdx
; SSE-NEXT:    shrq %rdx
; SSE-NEXT:    leaq (%rdx,%rcx,2), %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    bswapq %rdi
; AVX-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; AVX-NEXT:    andq %rdi, %rax
; AVX-NEXT:    shlq $4, %rax
; AVX-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; AVX-NEXT:    andq %rdi, %rcx
; AVX-NEXT:    shrq $4, %rcx
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; AVX-NEXT:    andq %rcx, %rax
; AVX-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; AVX-NEXT:    andq %rcx, %rdx
; AVX-NEXT:    shrq $2, %rdx
; AVX-NEXT:    leaq (%rdx,%rax,4), %rax
; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; AVX-NEXT:    andq %rax, %rcx
; AVX-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; AVX-NEXT:    andq %rax, %rdx
; AVX-NEXT:    shrq %rdx
; AVX-NEXT:    leaq (%rdx,%rcx,2), %rax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovq %xmm0, %rax
; XOP-NEXT:    retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}

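; The 128-bit vector tests expect a per-nibble lookup on SSSE3 and later: the
; low and high nibbles of each byte index two 16-entry PSHUFB tables of
; bit-reversed nibbles, and the two results are ORed together. SSE2 has no
; byte shuffle, so it falls back to the shift-and-mask sequence (with
; PSHUFLW/PSHUFHW/PACKUSWB byte swaps for the wider element types), while XOP
; again folds everything into one VPPERM.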
define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psllw $4, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
  ret <16 x i8> %b
}

define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psllw $4, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  ret <8 x i16> %b
}

define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psllw $4, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
  ret <4 x i32> %b
}

define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psllw $4, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
  ret <2 x i64> %b
}

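; For the 256-bit tests, AVX2 and AVX512 apply the same nibble-lookup idiom to
; a full ymm register, while AVX1 and XOP split the vector with
; vextractf128/vextracti128, handle each xmm half separately, and recombine
; with vinsertf128/vinserti128.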
define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    psllw $4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm5, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    psllw $2, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm9, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT:    pand %xmm10, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    pand %xmm4, %xmm7
; SSE2-NEXT:    psrlw $1, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT:    pand %xmm11, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm6, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm7
; SSE2-NEXT:    pand %xmm2, %xmm7
; SSE2-NEXT:    psllw $4, %xmm7
; SSE2-NEXT:    pand %xmm5, %xmm7
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    por %xmm7, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    psllw $2, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    pand %xmm9, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm4
; SSE2-NEXT:    psrlw $1, %xmm4
; SSE2-NEXT:    pand %xmm11, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v32i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm5, %xmm6
; SSSE3-NEXT:    pshufb %xmm2, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm6, %xmm3
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm5
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
  ret <32 x i8> %b
}

define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    psllw $4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm6, %xmm3
; SSE2-NEXT:    pand %xmm6, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm9, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT:    pand %xmm10, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    pand %xmm5, %xmm7
; SSE2-NEXT:    psrlw $1, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT:    pand %xmm11, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm12, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm7
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm7, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm2, %xmm4
; SSE2-NEXT:    psllw $4, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    psllw $2, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    pand %xmm9, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm5
; SSE2-NEXT:    psrlw $1, %xmm5
; SSE2-NEXT:    pand %xmm11, %xmm5
; SSE2-NEXT:    pand %xmm12, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm5, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm6, %xmm7
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm7, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm5, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
  ret <16 x i16> %b
}

define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    psllw $4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm6, %xmm3
; SSE2-NEXT:    pand %xmm6, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm9, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT:    pand %xmm10, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    pand %xmm5, %xmm7
; SSE2-NEXT:    psrlw $1, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT:    pand %xmm11, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm12, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm7
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm7, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm2, %xmm4
; SSE2-NEXT:    psllw $4, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    psllw $2, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    pand %xmm9, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm5
; SSE2-NEXT:    psrlw $1, %xmm5
; SSE2-NEXT:    pand %xmm11, %xmm5
; SSE2-NEXT:    pand %xmm12, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm5, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm6, %xmm7
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm7, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm5, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
  ret <8 x i32> %b
}

1038define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
1039; SSE2-LABEL: test_bitreverse_v4i64:
1040; SSE2:       # %bb.0:
1041; SSE2-NEXT:    pxor %xmm4, %xmm4
1042; SSE2-NEXT:    movdqa %xmm0, %xmm2
1043; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
1044; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
1045; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1046; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1047; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1048; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1049; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1050; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1051; SSE2-NEXT:    packuswb %xmm2, %xmm0
1052; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1053; SSE2-NEXT:    movdqa %xmm0, %xmm3
1054; SSE2-NEXT:    pand %xmm2, %xmm3
1055; SSE2-NEXT:    psllw $4, %xmm3
1056; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1057; SSE2-NEXT:    pand %xmm6, %xmm3
1058; SSE2-NEXT:    pand %xmm6, %xmm0
1059; SSE2-NEXT:    psrlw $4, %xmm0
1060; SSE2-NEXT:    pand %xmm2, %xmm0
1061; SSE2-NEXT:    por %xmm3, %xmm0
1062; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1063; SSE2-NEXT:    movdqa %xmm0, %xmm5
1064; SSE2-NEXT:    pand %xmm3, %xmm5
1065; SSE2-NEXT:    psllw $2, %xmm5
1066; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
1067; SSE2-NEXT:    pand %xmm8, %xmm5
1068; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1069; SSE2-NEXT:    pand %xmm9, %xmm0
1070; SSE2-NEXT:    psrlw $2, %xmm0
1071; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
1072; SSE2-NEXT:    pand %xmm10, %xmm0
1073; SSE2-NEXT:    por %xmm5, %xmm0
1074; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1075; SSE2-NEXT:    movdqa %xmm0, %xmm7
1076; SSE2-NEXT:    pand %xmm5, %xmm7
1077; SSE2-NEXT:    psrlw $1, %xmm7
1078; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
1079; SSE2-NEXT:    pand %xmm11, %xmm7
1080; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1081; SSE2-NEXT:    pand %xmm12, %xmm0
1082; SSE2-NEXT:    paddb %xmm0, %xmm0
1083; SSE2-NEXT:    por %xmm7, %xmm0
1084; SSE2-NEXT:    movdqa %xmm1, %xmm7
1085; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
1086; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,3,0,1]
1087; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7]
1088; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm7, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm2, %xmm4
; SSE2-NEXT:    psllw $4, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    psllw $2, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    pand %xmm9, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm5
; SSE2-NEXT:    psrlw $1, %xmm5
; SSE2-NEXT:    pand %xmm11, %xmm5
; SSE2-NEXT:    pand %xmm12, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v4i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm5, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm6, %xmm7
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm7, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm5, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
  ret <4 x i64> %b
}

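; 512 bits of i8: no byte swap is needed, only the per-byte bit reversal.
; SSE2 does three mask/shift/or rounds (nibbles, 2-bit pairs, single bits)
; per 128-bit chunk; SSSE3/AVX/AVX512BW split each byte into nibbles and
; reverse both halves through pshufb lookup tables; XOP's vpperm selector
; bytes (0x50+i) request the bit-reversed source byte directly, so a single
; shuffle per 128-bit lane suffices.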
define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v64i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pand %xmm13, %xmm5
; SSE2-NEXT:    psllw $4, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm7, %xmm5
; SSE2-NEXT:    pand %xmm7, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm13, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm9, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT:    pand %xmm10, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    psrlw $1, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT:    pand %xmm11, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm12, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm13, %xmm4
; SSE2-NEXT:    psllw $4, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm13, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm5, %xmm4
; SSE2-NEXT:    psllw $2, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    pand %xmm9, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    psrlw $1, %xmm4
; SSE2-NEXT:    pand %xmm11, %xmm4
; SSE2-NEXT:    pand %xmm12, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pand %xmm13, %xmm4
; SSE2-NEXT:    psllw $4, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm2
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm13, %xmm2
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pand %xmm5, %xmm4
; SSE2-NEXT:    psllw $2, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    pand %xmm9, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm10, %xmm2
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    psrlw $1, %xmm4
; SSE2-NEXT:    pand %xmm11, %xmm4
; SSE2-NEXT:    pand %xmm12, %xmm2
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pand %xmm13, %xmm4
; SSE2-NEXT:    psllw $4, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    pand %xmm13, %xmm3
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    pand %xmm9, %xmm3
; SSE2-NEXT:    psrlw $2, %xmm3
; SSE2-NEXT:    pand %xmm10, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm11, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm3
; SSE2-NEXT:    paddb %xmm3, %xmm3
; SSE2-NEXT:    por %xmm6, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v64i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm5
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    pand %xmm8, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm9, %xmm6
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm5
; SSSE3-NEXT:    pand %xmm8, %xmm5
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm5, %xmm0
; SSSE3-NEXT:    por %xmm6, %xmm0
; SSSE3-NEXT:    movdqa %xmm1, %xmm5
; SSSE3-NEXT:    pand %xmm8, %xmm5
; SSSE3-NEXT:    movdqa %xmm9, %xmm6
; SSSE3-NEXT:    pshufb %xmm5, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm8, %xmm1
; SSSE3-NEXT:    movdqa %xmm4, %xmm5
; SSSE3-NEXT:    pshufb %xmm1, %xmm5
; SSSE3-NEXT:    por %xmm6, %xmm5
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    pand %xmm8, %xmm1
; SSSE3-NEXT:    movdqa %xmm9, %xmm7
; SSSE3-NEXT:    pshufb %xmm1, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm8, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm6
; SSSE3-NEXT:    pshufb %xmm2, %xmm6
; SSSE3-NEXT:    por %xmm7, %xmm6
; SSSE3-NEXT:    movdqa %xmm3, %xmm1
; SSSE3-NEXT:    pand %xmm8, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm9
; SSSE3-NEXT:    psrlw $4, %xmm3
; SSSE3-NEXT:    pand %xmm8, %xmm3
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    por %xmm9, %xmm4
; SSSE3-NEXT:    movdqa %xmm5, %xmm1
; SSSE3-NEXT:    movdqa %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm3
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v64i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm4
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v64i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_bitreverse_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm0
; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_bitreverse_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v64i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v64i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
  %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
  ret <64 x i8> %b
}

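; 512 bits of i16: the same per-byte bit reversal as above, preceded by a
; byte swap within each i16 element (pshuflw/pshufhw word swaps after
; unpacking on SSE2, a pshufb byte shuffle on SSSE3/AVX, or vpperm selectors
; that fold the swap into the bit reversal on XOP).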
define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm14, %xmm14
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm4, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    psllw $4, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    pand %xmm5, %xmm7
; SSE2-NEXT:    psllw $2, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT:    pand %xmm9, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm10, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT:    pand %xmm11, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm13, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm11, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm11, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm2
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm6, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    por %xmm6, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    pand %xmm9, %xmm5
; SSE2-NEXT:    pand %xmm10, %xmm3
; SSE2-NEXT:    psrlw $2, %xmm3
; SSE2-NEXT:    pand %xmm11, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm7
; SSE2-NEXT:    psrlw $1, %xmm7
; SSE2-NEXT:    pand %xmm12, %xmm7
; SSE2-NEXT:    pand %xmm13, %xmm3
; SSE2-NEXT:    paddb %xmm3, %xmm3
; SSE2-NEXT:    por %xmm7, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v32i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm5
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    pshufb %xmm8, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm9, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    por %xmm6, %xmm0
; SSSE3-NEXT:    pshufb %xmm8, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm1, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm5, %xmm1
; SSSE3-NEXT:    por %xmm6, %xmm1
; SSSE3-NEXT:    pshufb %xmm8, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm5, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm5
; SSSE3-NEXT:    pshufb %xmm2, %xmm5
; SSSE3-NEXT:    por %xmm6, %xmm5
; SSSE3-NEXT:    pshufb %xmm8, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm3
; SSSE3-NEXT:    pand %xmm9, %xmm3
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    por %xmm7, %xmm4
; SSSE3-NEXT:    movdqa %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm3
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v32i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_bitreverse_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm2
; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_bitreverse_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
  %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
  ret <32 x i16> %b
}

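; 512 bits of i32: the element byte swap widens to four bytes. Note that the
; AVX512F lowering stays entirely in zmm registers, building both the byte
; swap and the bit reversal out of dword shifts and broadcast masks,
; presumably because vpshufb on zmm requires AVX512BW.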
define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm14, %xmm14
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm4, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    psllw $4, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    pand %xmm5, %xmm7
; SSE2-NEXT:    psllw $2, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT:    pand %xmm9, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm10, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT:    pand %xmm11, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm13, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm11, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm11, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm2
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    por %xmm6, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    pand %xmm9, %xmm5
; SSE2-NEXT:    pand %xmm10, %xmm3
; SSE2-NEXT:    psrlw $2, %xmm3
; SSE2-NEXT:    pand %xmm11, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm7
; SSE2-NEXT:    psrlw $1, %xmm7
; SSE2-NEXT:    pand %xmm12, %xmm7
; SSE2-NEXT:    pand %xmm13, %xmm3
; SSE2-NEXT:    paddb %xmm3, %xmm3
; SSE2-NEXT:    por %xmm7, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm5
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    pshufb %xmm8, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm9, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    por %xmm6, %xmm0
; SSSE3-NEXT:    pshufb %xmm8, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm1, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm5, %xmm1
; SSSE3-NEXT:    por %xmm6, %xmm1
; SSSE3-NEXT:    pshufb %xmm8, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm5, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm5
; SSSE3-NEXT:    pshufb %xmm2, %xmm5
; SSSE3-NEXT:    por %xmm6, %xmm5
; SSSE3-NEXT:    pshufb %xmm8, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm3
; SSSE3-NEXT:    pand %xmm9, %xmm3
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    por %xmm7, %xmm4
; SSSE3-NEXT:    movdqa %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm3
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_bitreverse_v16i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrld $24, %zmm0, %zmm1
; AVX512F-NEXT:    vpsrld $8, %zmm0, %zmm2
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; AVX512F-NEXT:    vpord %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpslld $24, %zmm0, %zmm2
; AVX512F-NEXT:    vpslld $8, %zmm0, %zmm0
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpord %zmm0, %zmm2, %zmm0
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
; AVX512F-NEXT:    vpslld $4, %zmm1, %zmm1
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrld $4, %zmm0, %zmm0
; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
; AVX512F-NEXT:    vpslld $2, %zmm1, %zmm1
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrld $2, %zmm0, %zmm0
; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
; AVX512F-NEXT:    vpslld $1, %zmm1, %zmm1
; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0
; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_bitreverse_v16i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
  ret <16 x i32> %b
}

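; 512 bits of i64: as above, but the byte swap now spans eight bytes, so the
; SSE2 lowering needs an extra pshufd to exchange the two dwords of each
; element before the pshuflw/pshufhw word swaps.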
2115define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
2116; SSE2-LABEL: test_bitreverse_v8i64:
2117; SSE2:       # %bb.0:
2118; SSE2-NEXT:    pxor %xmm14, %xmm14
2119; SSE2-NEXT:    movdqa %xmm0, %xmm4
2120; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
2121; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2122; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2123; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2124; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
2125; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2126; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2127; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2128; SSE2-NEXT:    packuswb %xmm4, %xmm0
2129; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2130; SSE2-NEXT:    movdqa %xmm0, %xmm5
2131; SSE2-NEXT:    pand %xmm8, %xmm5
2132; SSE2-NEXT:    psllw $4, %xmm5
2133; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
2134; SSE2-NEXT:    pand %xmm4, %xmm5
2135; SSE2-NEXT:    pand %xmm4, %xmm0
2136; SSE2-NEXT:    psrlw $4, %xmm0
2137; SSE2-NEXT:    pand %xmm8, %xmm0
2138; SSE2-NEXT:    por %xmm5, %xmm0
2139; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2140; SSE2-NEXT:    movdqa %xmm0, %xmm7
2141; SSE2-NEXT:    pand %xmm5, %xmm7
2142; SSE2-NEXT:    psllw $2, %xmm7
2143; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
2144; SSE2-NEXT:    pand %xmm9, %xmm7
2145; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
2146; SSE2-NEXT:    pand %xmm10, %xmm0
2147; SSE2-NEXT:    psrlw $2, %xmm0
2148; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
2149; SSE2-NEXT:    pand %xmm11, %xmm0
2150; SSE2-NEXT:    por %xmm7, %xmm0
2151; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
2152; SSE2-NEXT:    movdqa %xmm0, %xmm6
2153; SSE2-NEXT:    pand %xmm7, %xmm6
2154; SSE2-NEXT:    psrlw $1, %xmm6
2155; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
2156; SSE2-NEXT:    pand %xmm12, %xmm6
2157; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2158; SSE2-NEXT:    pand %xmm13, %xmm0
2159; SSE2-NEXT:    paddb %xmm0, %xmm0
2160; SSE2-NEXT:    por %xmm6, %xmm0
2161; SSE2-NEXT:    movdqa %xmm1, %xmm6
2162; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
2163; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
2164; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
2165; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
2166; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
2167; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2168; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2169; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm11, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm11, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm2
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    por %xmm6, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    pand %xmm9, %xmm5
; SSE2-NEXT:    pand %xmm10, %xmm3
; SSE2-NEXT:    psrlw $2, %xmm3
; SSE2-NEXT:    pand %xmm11, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm7
; SSE2-NEXT:    psrlw $1, %xmm7
; SSE2-NEXT:    pand %xmm12, %xmm7
; SSE2-NEXT:    pand %xmm13, %xmm3
; SSE2-NEXT:    paddb %xmm3, %xmm3
; SSE2-NEXT:    por %xmm7, %xmm3
; SSE2-NEXT:    retq
;
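; The SSE2 lowering above has no byte shuffle, so it reverses the bytes of
; each i64 element by unpacking to words and reordering with
; pshufd/pshuflw/pshufhw before repacking, then swaps bit groups in three
; mask-shift-or rounds (4, 2, then 1 bit at a time). The SSSE3 and later
; lowerings below byte-reverse with a single pshufb and combine two 16-entry
; nibble LUTs instead: [0,128,64,192,...] maps a low nibble to its
; bit-reversal placed in the high nibble, and [0,8,4,12,...] maps the
; shifted-down high nibble to its bit-reversal in the low nibble.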
; SSSE3-LABEL: test_bitreverse_v8i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm5
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    pshufb %xmm8, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm9, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    por %xmm6, %xmm0
; SSSE3-NEXT:    pshufb %xmm8, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm1, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm5, %xmm1
; SSSE3-NEXT:    por %xmm6, %xmm1
; SSSE3-NEXT:    pshufb %xmm8, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm5, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm5
; SSSE3-NEXT:    pshufb %xmm2, %xmm5
; SSSE3-NEXT:    por %xmm6, %xmm5
; SSSE3-NEXT:    pshufb %xmm8, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm3
; SSSE3-NEXT:    pand %xmm9, %xmm3
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    por %xmm7, %xmm4
; SSSE3-NEXT:    movdqa %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm3
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
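; Without AVX512BW there is no 512-bit vpshufb, so the AVX512F lowering below
; byte-swaps each i64 with vpsllq/vpsrlq by 8, 24, 40 and 56 plus broadcast
; masks, then runs three 4/2/1-bit mask-and-shift swap rounds on the
; byte-reversed value, mirroring the scalar lowering.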
; AVX512F-LABEL: test_bitreverse_v8i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlq $56, %zmm0, %zmm1
; AVX512F-NEXT:    vpsrlq $40, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpsrlq $24, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpsrlq $8, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpsllq $8, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vpsllq $24, %zmm0, %zmm3
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
; AVX512F-NEXT:    vporq %zmm2, %zmm3, %zmm2
; AVX512F-NEXT:    vpsllq $56, %zmm0, %zmm3
; AVX512F-NEXT:    vpsllq $40, %zmm0, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; AVX512F-NEXT:    vpsllq $4, %zmm1, %zmm1
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrlq $4, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; AVX512F-NEXT:    vpsllq $2, %zmm1, %zmm1
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrlq $2, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; AVX512F-NEXT:    vpsllq $1, %zmm1, %zmm1
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrlq $1, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_bitreverse_v8i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
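; XOP's vpperm can do this in one shot: selector bytes 0x50+n (the 80..95
; constants below) select source byte n and apply the instruction's
; bit-reverse operation to it, so each 128-bit half needs only a single
; vpperm besides the extract/insert shuffles. (Selector op-field semantics
; per the AMD XOP reference manual; stated here informally.)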
; XOPAVX1-LABEL: test_bitreverse_v8i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
  ret <8 x i64> %b
}

;
; Constant Folding
;

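; Sanity check for the fold below: the input 4278255360 is 0xFF00FF00, whose
; 32-bit reversal is 0x00FF00FF = 16711935.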
define i32 @fold_bitreverse_i32() nounwind {
; ALL-LABEL: fold_bitreverse_i32:
; ALL:       # %bb.0:
; ALL-NEXT:    movl $16711935, %eax # imm = 0xFF00FF
; ALL-NEXT:    retq
  %b = call i32 @llvm.bitreverse.i32(i32 4278255360)
  ret i32 %b
}

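; Worked per-element examples for the vector below: i8 2 = 0b00000010
; reverses to 0b01000000 = 64, and i8 -3 = 0xFD reverses to 0xBF = 191.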
define <16 x i8> @fold_bitreverse_v16i8() nounwind {
; SSE-LABEL: fold_bitreverse_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_bitreverse_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; AVX-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; XOP-NEXT:    retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
  ret <16 x i8> %b
}

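; Same pattern at i16: 2 reverses to 0x4000 = 16384, and -3 = 0xFFFD
; reverses to 0xBFFF = 49151.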
define <16 x i16> @fold_bitreverse_v16i16() nounwind {
; SSE-LABEL: fold_bitreverse_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_bitreverse_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; AVX-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; XOP-NEXT:    retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
  ret <16 x i16> %b
}

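; And at i32: 2 reverses to 0x40000000 = 1073741824, and -3 reverses to
; 0xBFFFFFFF = 3221225471; without AVX512 the 512-bit result is simply split
; across several registers.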
define <16 x i32> @fold_bitreverse_v16i32() nounwind {
; SSE-LABEL: fold_bitreverse_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
; SSE-NEXT:    retq
;
; AVX1-LABEL: fold_bitreverse_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: fold_bitreverse_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX2-NEXT:    retq
;
; AVX512-LABEL: fold_bitreverse_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX512-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; XOP-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; XOP-NEXT:    retq
  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
  ret <16 x i32> %b
}

declare i8 @llvm.bitreverse.i8(i8) readnone
declare i16 @llvm.bitreverse.i16(i16) readnone
declare i32 @llvm.bitreverse.i32(i32) readnone
declare i64 @llvm.bitreverse.i64(i64) readnone

declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone

declare <32 x i8>  @llvm.bitreverse.v32i8(<32 x i8>) readnone
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
declare <8 x i32>  @llvm.bitreverse.v8i32(<8 x i32>) readnone
declare <4 x i64>  @llvm.bitreverse.v4i64(<4 x i64>) readnone

declare <64 x i8>  @llvm.bitreverse.v64i8(<64 x i8>) readnone
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
declare <8 x i64>  @llvm.bitreverse.v8i64(<8 x i64>) readnone
