1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512F
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512BW
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
10
11; Make sure we don't crash with avx512bw and xop
12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw
13
14define i8 @test_bitreverse_i8(i8 %a) nounwind {
15; SSE-LABEL: test_bitreverse_i8:
16; SSE:       # %bb.0:
17; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
18; SSE-NEXT:    rolb $4, %dil
19; SSE-NEXT:    movl %edi, %eax
20; SSE-NEXT:    andb $51, %al
21; SSE-NEXT:    shlb $2, %al
22; SSE-NEXT:    andb $-52, %dil
23; SSE-NEXT:    shrb $2, %dil
24; SSE-NEXT:    orb %al, %dil
25; SSE-NEXT:    movl %edi, %eax
26; SSE-NEXT:    andb $85, %al
27; SSE-NEXT:    addb %al, %al
28; SSE-NEXT:    andb $-86, %dil
29; SSE-NEXT:    shrb %dil
30; SSE-NEXT:    addl %edi, %eax
31; SSE-NEXT:    # kill: def $al killed $al killed $eax
32; SSE-NEXT:    retq
33;
34; AVX-LABEL: test_bitreverse_i8:
35; AVX:       # %bb.0:
36; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
37; AVX-NEXT:    rolb $4, %dil
38; AVX-NEXT:    movl %edi, %eax
39; AVX-NEXT:    andb $51, %al
40; AVX-NEXT:    shlb $2, %al
41; AVX-NEXT:    andb $-52, %dil
42; AVX-NEXT:    shrb $2, %dil
43; AVX-NEXT:    orb %al, %dil
44; AVX-NEXT:    movl %edi, %eax
45; AVX-NEXT:    andb $85, %al
46; AVX-NEXT:    addb %al, %al
47; AVX-NEXT:    andb $-86, %dil
48; AVX-NEXT:    shrb %dil
49; AVX-NEXT:    addl %edi, %eax
50; AVX-NEXT:    # kill: def $al killed $al killed $eax
51; AVX-NEXT:    retq
52;
53; XOP-LABEL: test_bitreverse_i8:
54; XOP:       # %bb.0:
55; XOP-NEXT:    vmovd %edi, %xmm0
56; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
57; XOP-NEXT:    vmovd %xmm0, %eax
58; XOP-NEXT:    # kill: def $al killed $al killed $eax
59; XOP-NEXT:    retq
60  %b = call i8 @llvm.bitreverse.i8(i8 %a)
61  ret i8 %b
62}
63
64define i16 @test_bitreverse_i16(i16 %a) nounwind {
65; SSE-LABEL: test_bitreverse_i16:
66; SSE:       # %bb.0:
67; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
68; SSE-NEXT:    rolw $8, %di
69; SSE-NEXT:    movl %edi, %eax
70; SSE-NEXT:    andl $3855, %eax # imm = 0xF0F
71; SSE-NEXT:    shll $4, %eax
72; SSE-NEXT:    andl $61680, %edi # imm = 0xF0F0
73; SSE-NEXT:    shrl $4, %edi
74; SSE-NEXT:    orl %eax, %edi
75; SSE-NEXT:    movl %edi, %eax
76; SSE-NEXT:    andl $13107, %eax # imm = 0x3333
77; SSE-NEXT:    andl $52428, %edi # imm = 0xCCCC
78; SSE-NEXT:    shrl $2, %edi
79; SSE-NEXT:    leal (%rdi,%rax,4), %eax
80; SSE-NEXT:    movl %eax, %ecx
81; SSE-NEXT:    andl $21845, %ecx # imm = 0x5555
82; SSE-NEXT:    andl $43690, %eax # imm = 0xAAAA
83; SSE-NEXT:    shrl %eax
84; SSE-NEXT:    leal (%rax,%rcx,2), %eax
85; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
86; SSE-NEXT:    retq
87;
88; AVX-LABEL: test_bitreverse_i16:
89; AVX:       # %bb.0:
90; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
91; AVX-NEXT:    rolw $8, %di
92; AVX-NEXT:    movl %edi, %eax
93; AVX-NEXT:    andl $3855, %eax # imm = 0xF0F
94; AVX-NEXT:    shll $4, %eax
95; AVX-NEXT:    andl $61680, %edi # imm = 0xF0F0
96; AVX-NEXT:    shrl $4, %edi
97; AVX-NEXT:    orl %eax, %edi
98; AVX-NEXT:    movl %edi, %eax
99; AVX-NEXT:    andl $13107, %eax # imm = 0x3333
100; AVX-NEXT:    andl $52428, %edi # imm = 0xCCCC
101; AVX-NEXT:    shrl $2, %edi
102; AVX-NEXT:    leal (%rdi,%rax,4), %eax
103; AVX-NEXT:    movl %eax, %ecx
104; AVX-NEXT:    andl $21845, %ecx # imm = 0x5555
105; AVX-NEXT:    andl $43690, %eax # imm = 0xAAAA
106; AVX-NEXT:    shrl %eax
107; AVX-NEXT:    leal (%rax,%rcx,2), %eax
108; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
109; AVX-NEXT:    retq
110;
111; XOP-LABEL: test_bitreverse_i16:
112; XOP:       # %bb.0:
113; XOP-NEXT:    vmovd %edi, %xmm0
114; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
115; XOP-NEXT:    vmovd %xmm0, %eax
116; XOP-NEXT:    # kill: def $ax killed $ax killed $eax
117; XOP-NEXT:    retq
118  %b = call i16 @llvm.bitreverse.i16(i16 %a)
119  ret i16 %b
120}
121
122define i32 @test_bitreverse_i32(i32 %a) nounwind {
123; SSE-LABEL: test_bitreverse_i32:
124; SSE:       # %bb.0:
125; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
126; SSE-NEXT:    bswapl %edi
127; SSE-NEXT:    movl %edi, %eax
128; SSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
129; SSE-NEXT:    shll $4, %eax
130; SSE-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
131; SSE-NEXT:    shrl $4, %edi
132; SSE-NEXT:    orl %eax, %edi
133; SSE-NEXT:    movl %edi, %eax
134; SSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
135; SSE-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
136; SSE-NEXT:    shrl $2, %edi
137; SSE-NEXT:    leal (%rdi,%rax,4), %eax
138; SSE-NEXT:    movl %eax, %ecx
139; SSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
140; SSE-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
141; SSE-NEXT:    shrl %eax
142; SSE-NEXT:    leal (%rax,%rcx,2), %eax
143; SSE-NEXT:    retq
144;
145; AVX-LABEL: test_bitreverse_i32:
146; AVX:       # %bb.0:
147; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
148; AVX-NEXT:    bswapl %edi
149; AVX-NEXT:    movl %edi, %eax
150; AVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
151; AVX-NEXT:    shll $4, %eax
152; AVX-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
153; AVX-NEXT:    shrl $4, %edi
154; AVX-NEXT:    orl %eax, %edi
155; AVX-NEXT:    movl %edi, %eax
156; AVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
157; AVX-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
158; AVX-NEXT:    shrl $2, %edi
159; AVX-NEXT:    leal (%rdi,%rax,4), %eax
160; AVX-NEXT:    movl %eax, %ecx
161; AVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
162; AVX-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
163; AVX-NEXT:    shrl %eax
164; AVX-NEXT:    leal (%rax,%rcx,2), %eax
165; AVX-NEXT:    retq
166;
167; XOP-LABEL: test_bitreverse_i32:
168; XOP:       # %bb.0:
169; XOP-NEXT:    vmovd %edi, %xmm0
170; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
171; XOP-NEXT:    vmovd %xmm0, %eax
172; XOP-NEXT:    retq
173  %b = call i32 @llvm.bitreverse.i32(i32 %a)
174  ret i32 %b
175}
176
177define i64 @test_bitreverse_i64(i64 %a) nounwind {
178; SSE-LABEL: test_bitreverse_i64:
179; SSE:       # %bb.0:
180; SSE-NEXT:    bswapq %rdi
181; SSE-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
182; SSE-NEXT:    andq %rdi, %rax
183; SSE-NEXT:    shlq $4, %rax
184; SSE-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
185; SSE-NEXT:    andq %rdi, %rcx
186; SSE-NEXT:    shrq $4, %rcx
187; SSE-NEXT:    orq %rax, %rcx
188; SSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
189; SSE-NEXT:    andq %rcx, %rax
190; SSE-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
191; SSE-NEXT:    andq %rcx, %rdx
192; SSE-NEXT:    shrq $2, %rdx
193; SSE-NEXT:    leaq (%rdx,%rax,4), %rax
194; SSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
195; SSE-NEXT:    andq %rax, %rcx
196; SSE-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
197; SSE-NEXT:    andq %rax, %rdx
198; SSE-NEXT:    shrq %rdx
199; SSE-NEXT:    leaq (%rdx,%rcx,2), %rax
200; SSE-NEXT:    retq
201;
202; AVX-LABEL: test_bitreverse_i64:
203; AVX:       # %bb.0:
204; AVX-NEXT:    bswapq %rdi
205; AVX-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
206; AVX-NEXT:    andq %rdi, %rax
207; AVX-NEXT:    shlq $4, %rax
208; AVX-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
209; AVX-NEXT:    andq %rdi, %rcx
210; AVX-NEXT:    shrq $4, %rcx
211; AVX-NEXT:    orq %rax, %rcx
212; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
213; AVX-NEXT:    andq %rcx, %rax
214; AVX-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
215; AVX-NEXT:    andq %rcx, %rdx
216; AVX-NEXT:    shrq $2, %rdx
217; AVX-NEXT:    leaq (%rdx,%rax,4), %rax
218; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
219; AVX-NEXT:    andq %rax, %rcx
220; AVX-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
221; AVX-NEXT:    andq %rax, %rdx
222; AVX-NEXT:    shrq %rdx
223; AVX-NEXT:    leaq (%rdx,%rcx,2), %rax
224; AVX-NEXT:    retq
225;
226; XOP-LABEL: test_bitreverse_i64:
227; XOP:       # %bb.0:
228; XOP-NEXT:    vmovq %rdi, %xmm0
229; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
230; XOP-NEXT:    vmovq %xmm0, %rax
231; XOP-NEXT:    retq
232  %b = call i64 @llvm.bitreverse.i64(i64 %a)
233  ret i64 %b
234}
235
236define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
237; SSE2-LABEL: test_bitreverse_v16i8:
238; SSE2:       # %bb.0:
239; SSE2-NEXT:    movdqa %xmm0, %xmm1
240; SSE2-NEXT:    psllw $4, %xmm1
241; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
242; SSE2-NEXT:    psrlw $4, %xmm0
243; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
244; SSE2-NEXT:    por %xmm1, %xmm0
245; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
246; SSE2-NEXT:    pand %xmm0, %xmm1
247; SSE2-NEXT:    psllw $2, %xmm1
248; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
249; SSE2-NEXT:    psrlw $2, %xmm0
250; SSE2-NEXT:    por %xmm1, %xmm0
251; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
252; SSE2-NEXT:    pand %xmm0, %xmm1
253; SSE2-NEXT:    paddb %xmm1, %xmm1
254; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
255; SSE2-NEXT:    psrlw $1, %xmm0
256; SSE2-NEXT:    por %xmm1, %xmm0
257; SSE2-NEXT:    retq
258;
259; SSSE3-LABEL: test_bitreverse_v16i8:
260; SSSE3:       # %bb.0:
261; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
262; SSSE3-NEXT:    movdqa %xmm0, %xmm2
263; SSSE3-NEXT:    pand %xmm1, %xmm2
264; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
265; SSSE3-NEXT:    pshufb %xmm2, %xmm3
266; SSSE3-NEXT:    psrlw $4, %xmm0
267; SSSE3-NEXT:    pand %xmm1, %xmm0
268; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
269; SSSE3-NEXT:    pshufb %xmm0, %xmm1
270; SSSE3-NEXT:    por %xmm3, %xmm1
271; SSSE3-NEXT:    movdqa %xmm1, %xmm0
272; SSSE3-NEXT:    retq
273;
274; AVX-LABEL: test_bitreverse_v16i8:
275; AVX:       # %bb.0:
276; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
277; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
278; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
279; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
280; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
281; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
282; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
283; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
284; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
285; AVX-NEXT:    retq
286;
287; XOP-LABEL: test_bitreverse_v16i8:
288; XOP:       # %bb.0:
289; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
290; XOP-NEXT:    retq
291  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
292  ret <16 x i8> %b
293}
294
295define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
296; SSE2-LABEL: test_bitreverse_v8i16:
297; SSE2:       # %bb.0:
298; SSE2-NEXT:    movdqa %xmm0, %xmm1
299; SSE2-NEXT:    psrlw $8, %xmm1
300; SSE2-NEXT:    psllw $8, %xmm0
301; SSE2-NEXT:    por %xmm1, %xmm0
302; SSE2-NEXT:    movdqa %xmm0, %xmm1
303; SSE2-NEXT:    psllw $4, %xmm1
304; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
305; SSE2-NEXT:    psrlw $4, %xmm0
306; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
307; SSE2-NEXT:    por %xmm1, %xmm0
308; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
309; SSE2-NEXT:    pand %xmm0, %xmm1
310; SSE2-NEXT:    psllw $2, %xmm1
311; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
312; SSE2-NEXT:    psrlw $2, %xmm0
313; SSE2-NEXT:    por %xmm1, %xmm0
314; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
315; SSE2-NEXT:    pand %xmm0, %xmm1
316; SSE2-NEXT:    paddb %xmm1, %xmm1
317; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
318; SSE2-NEXT:    psrlw $1, %xmm0
319; SSE2-NEXT:    por %xmm1, %xmm0
320; SSE2-NEXT:    retq
321;
322; SSSE3-LABEL: test_bitreverse_v8i16:
323; SSSE3:       # %bb.0:
324; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
325; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
326; SSSE3-NEXT:    movdqa %xmm0, %xmm2
327; SSSE3-NEXT:    pand %xmm1, %xmm2
328; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
329; SSSE3-NEXT:    pshufb %xmm2, %xmm3
330; SSSE3-NEXT:    psrlw $4, %xmm0
331; SSSE3-NEXT:    pand %xmm1, %xmm0
332; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
333; SSSE3-NEXT:    pshufb %xmm0, %xmm1
334; SSSE3-NEXT:    por %xmm3, %xmm1
335; SSSE3-NEXT:    movdqa %xmm1, %xmm0
336; SSSE3-NEXT:    retq
337;
338; AVX-LABEL: test_bitreverse_v8i16:
339; AVX:       # %bb.0:
340; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
341; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
342; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
343; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
344; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
345; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
346; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
347; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
348; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
349; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
350; AVX-NEXT:    retq
351;
352; XOP-LABEL: test_bitreverse_v8i16:
353; XOP:       # %bb.0:
354; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
355; XOP-NEXT:    retq
356  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
357  ret <8 x i16> %b
358}
359
360define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
361; SSE2-LABEL: test_bitreverse_v4i32:
362; SSE2:       # %bb.0:
363; SSE2-NEXT:    pxor %xmm1, %xmm1
364; SSE2-NEXT:    movdqa %xmm0, %xmm2
365; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
366; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
367; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
368; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
369; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
370; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
371; SSE2-NEXT:    packuswb %xmm2, %xmm0
372; SSE2-NEXT:    movdqa %xmm0, %xmm1
373; SSE2-NEXT:    psllw $4, %xmm1
374; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
375; SSE2-NEXT:    psrlw $4, %xmm0
376; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
377; SSE2-NEXT:    por %xmm1, %xmm0
378; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
379; SSE2-NEXT:    pand %xmm0, %xmm1
380; SSE2-NEXT:    psllw $2, %xmm1
381; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
382; SSE2-NEXT:    psrlw $2, %xmm0
383; SSE2-NEXT:    por %xmm1, %xmm0
384; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
385; SSE2-NEXT:    pand %xmm0, %xmm1
386; SSE2-NEXT:    paddb %xmm1, %xmm1
387; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
388; SSE2-NEXT:    psrlw $1, %xmm0
389; SSE2-NEXT:    por %xmm1, %xmm0
390; SSE2-NEXT:    retq
391;
392; SSSE3-LABEL: test_bitreverse_v4i32:
393; SSSE3:       # %bb.0:
394; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
395; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
396; SSSE3-NEXT:    movdqa %xmm0, %xmm2
397; SSSE3-NEXT:    pand %xmm1, %xmm2
398; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
399; SSSE3-NEXT:    pshufb %xmm2, %xmm3
400; SSSE3-NEXT:    psrlw $4, %xmm0
401; SSSE3-NEXT:    pand %xmm1, %xmm0
402; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
403; SSSE3-NEXT:    pshufb %xmm0, %xmm1
404; SSSE3-NEXT:    por %xmm3, %xmm1
405; SSSE3-NEXT:    movdqa %xmm1, %xmm0
406; SSSE3-NEXT:    retq
407;
408; AVX-LABEL: test_bitreverse_v4i32:
409; AVX:       # %bb.0:
410; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
411; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
412; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
413; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
414; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
415; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
416; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
417; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
418; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
419; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
420; AVX-NEXT:    retq
421;
422; XOP-LABEL: test_bitreverse_v4i32:
423; XOP:       # %bb.0:
424; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
425; XOP-NEXT:    retq
426  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
427  ret <4 x i32> %b
428}
429
430define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
431; SSE2-LABEL: test_bitreverse_v2i64:
432; SSE2:       # %bb.0:
433; SSE2-NEXT:    pxor %xmm1, %xmm1
434; SSE2-NEXT:    movdqa %xmm0, %xmm2
435; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
436; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
437; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
438; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
439; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
440; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
441; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
442; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
443; SSE2-NEXT:    packuswb %xmm2, %xmm0
444; SSE2-NEXT:    movdqa %xmm0, %xmm1
445; SSE2-NEXT:    psllw $4, %xmm1
446; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
447; SSE2-NEXT:    psrlw $4, %xmm0
448; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
449; SSE2-NEXT:    por %xmm1, %xmm0
450; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
451; SSE2-NEXT:    pand %xmm0, %xmm1
452; SSE2-NEXT:    psllw $2, %xmm1
453; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
454; SSE2-NEXT:    psrlw $2, %xmm0
455; SSE2-NEXT:    por %xmm1, %xmm0
456; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
457; SSE2-NEXT:    pand %xmm0, %xmm1
458; SSE2-NEXT:    paddb %xmm1, %xmm1
459; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
460; SSE2-NEXT:    psrlw $1, %xmm0
461; SSE2-NEXT:    por %xmm1, %xmm0
462; SSE2-NEXT:    retq
463;
464; SSSE3-LABEL: test_bitreverse_v2i64:
465; SSSE3:       # %bb.0:
466; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
467; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
468; SSSE3-NEXT:    movdqa %xmm0, %xmm2
469; SSSE3-NEXT:    pand %xmm1, %xmm2
470; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
471; SSSE3-NEXT:    pshufb %xmm2, %xmm3
472; SSSE3-NEXT:    psrlw $4, %xmm0
473; SSSE3-NEXT:    pand %xmm1, %xmm0
474; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
475; SSSE3-NEXT:    pshufb %xmm0, %xmm1
476; SSSE3-NEXT:    por %xmm3, %xmm1
477; SSSE3-NEXT:    movdqa %xmm1, %xmm0
478; SSSE3-NEXT:    retq
479;
480; AVX-LABEL: test_bitreverse_v2i64:
481; AVX:       # %bb.0:
482; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
483; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
484; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
485; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
486; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
487; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
488; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
489; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
490; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
491; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
492; AVX-NEXT:    retq
493;
494; XOP-LABEL: test_bitreverse_v2i64:
495; XOP:       # %bb.0:
496; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
497; XOP-NEXT:    retq
498  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
499  ret <2 x i64> %b
500}
501
502define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
503; SSE2-LABEL: test_bitreverse_v32i8:
504; SSE2:       # %bb.0:
505; SSE2-NEXT:    movdqa %xmm1, %xmm2
506; SSE2-NEXT:    movdqa %xmm0, %xmm3
507; SSE2-NEXT:    psllw $4, %xmm3
508; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
509; SSE2-NEXT:    movdqa %xmm1, %xmm4
510; SSE2-NEXT:    pandn %xmm3, %xmm4
511; SSE2-NEXT:    psrlw $4, %xmm0
512; SSE2-NEXT:    pand %xmm1, %xmm0
513; SSE2-NEXT:    por %xmm4, %xmm0
514; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
515; SSE2-NEXT:    movdqa %xmm0, %xmm4
516; SSE2-NEXT:    pand %xmm3, %xmm4
517; SSE2-NEXT:    psllw $2, %xmm4
518; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
519; SSE2-NEXT:    pand %xmm5, %xmm0
520; SSE2-NEXT:    psrlw $2, %xmm0
521; SSE2-NEXT:    por %xmm4, %xmm0
522; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
523; SSE2-NEXT:    movdqa %xmm0, %xmm6
524; SSE2-NEXT:    pand %xmm4, %xmm6
525; SSE2-NEXT:    paddb %xmm6, %xmm6
526; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
527; SSE2-NEXT:    pand %xmm7, %xmm0
528; SSE2-NEXT:    psrlw $1, %xmm0
529; SSE2-NEXT:    por %xmm6, %xmm0
530; SSE2-NEXT:    movdqa %xmm2, %xmm6
531; SSE2-NEXT:    psllw $4, %xmm6
532; SSE2-NEXT:    psrlw $4, %xmm2
533; SSE2-NEXT:    pand %xmm1, %xmm2
534; SSE2-NEXT:    pandn %xmm6, %xmm1
535; SSE2-NEXT:    por %xmm2, %xmm1
536; SSE2-NEXT:    pand %xmm1, %xmm3
537; SSE2-NEXT:    psllw $2, %xmm3
538; SSE2-NEXT:    pand %xmm5, %xmm1
539; SSE2-NEXT:    psrlw $2, %xmm1
540; SSE2-NEXT:    por %xmm3, %xmm1
541; SSE2-NEXT:    pand %xmm1, %xmm4
542; SSE2-NEXT:    paddb %xmm4, %xmm4
543; SSE2-NEXT:    pand %xmm7, %xmm1
544; SSE2-NEXT:    psrlw $1, %xmm1
545; SSE2-NEXT:    por %xmm4, %xmm1
546; SSE2-NEXT:    retq
547;
548; SSSE3-LABEL: test_bitreverse_v32i8:
549; SSSE3:       # %bb.0:
550; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
551; SSSE3-NEXT:    movdqa %xmm0, %xmm2
552; SSSE3-NEXT:    pand %xmm4, %xmm2
553; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
554; SSSE3-NEXT:    movdqa %xmm5, %xmm6
555; SSSE3-NEXT:    pshufb %xmm2, %xmm6
556; SSSE3-NEXT:    psrlw $4, %xmm0
557; SSSE3-NEXT:    pand %xmm4, %xmm0
558; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
559; SSSE3-NEXT:    movdqa %xmm2, %xmm3
560; SSSE3-NEXT:    pshufb %xmm0, %xmm3
561; SSSE3-NEXT:    por %xmm6, %xmm3
562; SSSE3-NEXT:    movdqa %xmm1, %xmm0
563; SSSE3-NEXT:    pand %xmm4, %xmm0
564; SSSE3-NEXT:    pshufb %xmm0, %xmm5
565; SSSE3-NEXT:    psrlw $4, %xmm1
566; SSSE3-NEXT:    pand %xmm4, %xmm1
567; SSSE3-NEXT:    pshufb %xmm1, %xmm2
568; SSSE3-NEXT:    por %xmm5, %xmm2
569; SSSE3-NEXT:    movdqa %xmm3, %xmm0
570; SSSE3-NEXT:    movdqa %xmm2, %xmm1
571; SSSE3-NEXT:    retq
572;
573; AVX1-LABEL: test_bitreverse_v32i8:
574; AVX1:       # %bb.0:
575; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
576; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
577; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
578; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
579; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
580; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
581; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
582; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
583; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
584; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
585; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
586; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
587; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
588; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
589; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
590; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
591; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
592; AVX1-NEXT:    retq
593;
594; AVX2-LABEL: test_bitreverse_v32i8:
595; AVX2:       # %bb.0:
596; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
597; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
598; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
599; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
600; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
601; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
602; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
603; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
604; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
605; AVX2-NEXT:    retq
606;
607; AVX512-LABEL: test_bitreverse_v32i8:
608; AVX512:       # %bb.0:
609; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
610; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
611; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
612; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
613; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
614; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
615; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
616; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
617; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
618; AVX512-NEXT:    retq
619;
620; XOPAVX1-LABEL: test_bitreverse_v32i8:
621; XOPAVX1:       # %bb.0:
622; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
623; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
624; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
625; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
626; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
627; XOPAVX1-NEXT:    retq
628;
629; XOPAVX2-LABEL: test_bitreverse_v32i8:
630; XOPAVX2:       # %bb.0:
631; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
632; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
633; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
634; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
635; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
636; XOPAVX2-NEXT:    retq
637  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
638  ret <32 x i8> %b
639}
640
641define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
642; SSE2-LABEL: test_bitreverse_v16i16:
643; SSE2:       # %bb.0:
644; SSE2-NEXT:    movdqa %xmm1, %xmm2
645; SSE2-NEXT:    movdqa %xmm0, %xmm1
646; SSE2-NEXT:    psrlw $8, %xmm1
647; SSE2-NEXT:    psllw $8, %xmm0
648; SSE2-NEXT:    por %xmm1, %xmm0
649; SSE2-NEXT:    movdqa %xmm0, %xmm3
650; SSE2-NEXT:    psllw $4, %xmm3
651; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
652; SSE2-NEXT:    movdqa %xmm1, %xmm4
653; SSE2-NEXT:    pandn %xmm3, %xmm4
654; SSE2-NEXT:    psrlw $4, %xmm0
655; SSE2-NEXT:    pand %xmm1, %xmm0
656; SSE2-NEXT:    por %xmm4, %xmm0
657; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
658; SSE2-NEXT:    movdqa %xmm0, %xmm4
659; SSE2-NEXT:    pand %xmm3, %xmm4
660; SSE2-NEXT:    psllw $2, %xmm4
661; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
662; SSE2-NEXT:    pand %xmm5, %xmm0
663; SSE2-NEXT:    psrlw $2, %xmm0
664; SSE2-NEXT:    por %xmm4, %xmm0
665; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
666; SSE2-NEXT:    movdqa %xmm0, %xmm7
667; SSE2-NEXT:    pand %xmm4, %xmm7
668; SSE2-NEXT:    paddb %xmm7, %xmm7
669; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
670; SSE2-NEXT:    pand %xmm6, %xmm0
671; SSE2-NEXT:    psrlw $1, %xmm0
672; SSE2-NEXT:    por %xmm7, %xmm0
673; SSE2-NEXT:    movdqa %xmm2, %xmm7
674; SSE2-NEXT:    psrlw $8, %xmm7
675; SSE2-NEXT:    psllw $8, %xmm2
676; SSE2-NEXT:    por %xmm7, %xmm2
677; SSE2-NEXT:    movdqa %xmm2, %xmm7
678; SSE2-NEXT:    psllw $4, %xmm7
679; SSE2-NEXT:    psrlw $4, %xmm2
680; SSE2-NEXT:    pand %xmm1, %xmm2
681; SSE2-NEXT:    pandn %xmm7, %xmm1
682; SSE2-NEXT:    por %xmm2, %xmm1
683; SSE2-NEXT:    pand %xmm1, %xmm3
684; SSE2-NEXT:    psllw $2, %xmm3
685; SSE2-NEXT:    pand %xmm5, %xmm1
686; SSE2-NEXT:    psrlw $2, %xmm1
687; SSE2-NEXT:    por %xmm3, %xmm1
688; SSE2-NEXT:    pand %xmm1, %xmm4
689; SSE2-NEXT:    paddb %xmm4, %xmm4
690; SSE2-NEXT:    pand %xmm6, %xmm1
691; SSE2-NEXT:    psrlw $1, %xmm1
692; SSE2-NEXT:    por %xmm4, %xmm1
693; SSE2-NEXT:    retq
694;
695; SSSE3-LABEL: test_bitreverse_v16i16:
696; SSSE3:       # %bb.0:
697; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
698; SSSE3-NEXT:    pshufb %xmm4, %xmm0
699; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
700; SSSE3-NEXT:    movdqa %xmm0, %xmm2
701; SSSE3-NEXT:    pand %xmm5, %xmm2
702; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
703; SSSE3-NEXT:    movdqa %xmm6, %xmm7
704; SSSE3-NEXT:    pshufb %xmm2, %xmm7
705; SSSE3-NEXT:    psrlw $4, %xmm0
706; SSSE3-NEXT:    pand %xmm5, %xmm0
707; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
708; SSSE3-NEXT:    movdqa %xmm2, %xmm3
709; SSSE3-NEXT:    pshufb %xmm0, %xmm3
710; SSSE3-NEXT:    por %xmm7, %xmm3
711; SSSE3-NEXT:    pshufb %xmm4, %xmm1
712; SSSE3-NEXT:    movdqa %xmm1, %xmm0
713; SSSE3-NEXT:    pand %xmm5, %xmm0
714; SSSE3-NEXT:    pshufb %xmm0, %xmm6
715; SSSE3-NEXT:    psrlw $4, %xmm1
716; SSSE3-NEXT:    pand %xmm5, %xmm1
717; SSSE3-NEXT:    pshufb %xmm1, %xmm2
718; SSSE3-NEXT:    por %xmm6, %xmm2
719; SSSE3-NEXT:    movdqa %xmm3, %xmm0
720; SSSE3-NEXT:    movdqa %xmm2, %xmm1
721; SSSE3-NEXT:    retq
722;
723; AVX1-LABEL: test_bitreverse_v16i16:
724; AVX1:       # %bb.0:
725; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
726; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
727; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
728; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
729; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
730; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
731; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
732; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
733; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
734; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
735; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
736; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
737; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
738; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
739; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
740; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
741; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
742; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
743; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
744; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
745; AVX1-NEXT:    retq
746;
747; AVX2-LABEL: test_bitreverse_v16i16:
748; AVX2:       # %bb.0:
749; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
750; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
751; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
752; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
753; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
754; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
755; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
756; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
757; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
758; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
759; AVX2-NEXT:    retq
760;
761; AVX512-LABEL: test_bitreverse_v16i16:
762; AVX512:       # %bb.0:
763; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
764; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
765; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
766; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
767; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
768; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
769; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
770; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
771; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
772; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
773; AVX512-NEXT:    retq
774;
775; XOPAVX1-LABEL: test_bitreverse_v16i16:
776; XOPAVX1:       # %bb.0:
777; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
778; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
779; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
780; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
781; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
782; XOPAVX1-NEXT:    retq
783;
784; XOPAVX2-LABEL: test_bitreverse_v16i16:
785; XOPAVX2:       # %bb.0:
786; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
787; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
788; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
789; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
790; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
791; XOPAVX2-NEXT:    retq
792  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
793  ret <16 x i16> %b
794}
795
796define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
797; SSE2-LABEL: test_bitreverse_v8i32:
798; SSE2:       # %bb.0:
799; SSE2-NEXT:    movdqa %xmm1, %xmm2
800; SSE2-NEXT:    pxor %xmm4, %xmm4
801; SSE2-NEXT:    movdqa %xmm0, %xmm1
802; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
803; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
804; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
805; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
806; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
807; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
808; SSE2-NEXT:    packuswb %xmm1, %xmm0
809; SSE2-NEXT:    movdqa %xmm0, %xmm3
810; SSE2-NEXT:    psllw $4, %xmm3
811; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
812; SSE2-NEXT:    movdqa %xmm1, %xmm5
813; SSE2-NEXT:    pandn %xmm3, %xmm5
814; SSE2-NEXT:    psrlw $4, %xmm0
815; SSE2-NEXT:    pand %xmm1, %xmm0
816; SSE2-NEXT:    por %xmm5, %xmm0
817; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
818; SSE2-NEXT:    movdqa %xmm0, %xmm5
819; SSE2-NEXT:    pand %xmm3, %xmm5
820; SSE2-NEXT:    psllw $2, %xmm5
821; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
822; SSE2-NEXT:    pand %xmm8, %xmm0
823; SSE2-NEXT:    psrlw $2, %xmm0
824; SSE2-NEXT:    por %xmm5, %xmm0
825; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
826; SSE2-NEXT:    movdqa %xmm0, %xmm6
827; SSE2-NEXT:    pand %xmm5, %xmm6
828; SSE2-NEXT:    paddb %xmm6, %xmm6
829; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
830; SSE2-NEXT:    pand %xmm7, %xmm0
831; SSE2-NEXT:    psrlw $1, %xmm0
832; SSE2-NEXT:    por %xmm6, %xmm0
833; SSE2-NEXT:    movdqa %xmm2, %xmm6
834; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
835; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
836; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
837; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
838; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
839; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
840; SSE2-NEXT:    packuswb %xmm6, %xmm2
841; SSE2-NEXT:    movdqa %xmm2, %xmm4
842; SSE2-NEXT:    psllw $4, %xmm4
843; SSE2-NEXT:    psrlw $4, %xmm2
844; SSE2-NEXT:    pand %xmm1, %xmm2
845; SSE2-NEXT:    pandn %xmm4, %xmm1
846; SSE2-NEXT:    por %xmm2, %xmm1
847; SSE2-NEXT:    pand %xmm1, %xmm3
848; SSE2-NEXT:    psllw $2, %xmm3
849; SSE2-NEXT:    pand %xmm8, %xmm1
850; SSE2-NEXT:    psrlw $2, %xmm1
851; SSE2-NEXT:    por %xmm3, %xmm1
852; SSE2-NEXT:    pand %xmm1, %xmm5
853; SSE2-NEXT:    paddb %xmm5, %xmm5
854; SSE2-NEXT:    pand %xmm7, %xmm1
855; SSE2-NEXT:    psrlw $1, %xmm1
856; SSE2-NEXT:    por %xmm5, %xmm1
857; SSE2-NEXT:    retq
858;
859; SSSE3-LABEL: test_bitreverse_v8i32:
860; SSSE3:       # %bb.0:
861; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
862; SSSE3-NEXT:    pshufb %xmm4, %xmm0
863; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
864; SSSE3-NEXT:    movdqa %xmm0, %xmm2
865; SSSE3-NEXT:    pand %xmm5, %xmm2
866; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
867; SSSE3-NEXT:    movdqa %xmm6, %xmm7
868; SSSE3-NEXT:    pshufb %xmm2, %xmm7
869; SSSE3-NEXT:    psrlw $4, %xmm0
870; SSSE3-NEXT:    pand %xmm5, %xmm0
871; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
872; SSSE3-NEXT:    movdqa %xmm2, %xmm3
873; SSSE3-NEXT:    pshufb %xmm0, %xmm3
874; SSSE3-NEXT:    por %xmm7, %xmm3
875; SSSE3-NEXT:    pshufb %xmm4, %xmm1
876; SSSE3-NEXT:    movdqa %xmm1, %xmm0
877; SSSE3-NEXT:    pand %xmm5, %xmm0
878; SSSE3-NEXT:    pshufb %xmm0, %xmm6
879; SSSE3-NEXT:    psrlw $4, %xmm1
880; SSSE3-NEXT:    pand %xmm5, %xmm1
881; SSSE3-NEXT:    pshufb %xmm1, %xmm2
882; SSSE3-NEXT:    por %xmm6, %xmm2
883; SSSE3-NEXT:    movdqa %xmm3, %xmm0
884; SSSE3-NEXT:    movdqa %xmm2, %xmm1
885; SSSE3-NEXT:    retq
886;
887; AVX1-LABEL: test_bitreverse_v8i32:
888; AVX1:       # %bb.0:
889; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
890; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
891; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
892; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
893; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
894; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
895; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
896; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
897; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
898; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
899; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
900; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
901; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
902; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
903; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
904; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
905; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
906; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
907; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
908; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
909; AVX1-NEXT:    retq
910;
911; AVX2-LABEL: test_bitreverse_v8i32:
912; AVX2:       # %bb.0:
913; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
914; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
915; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
916; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
917; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
918; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
919; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
920; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
921; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
922; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
923; AVX2-NEXT:    retq
924;
925; AVX512-LABEL: test_bitreverse_v8i32:
926; AVX512:       # %bb.0:
927; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
928; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
929; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
930; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
931; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
932; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
933; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
934; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
935; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
936; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
937; AVX512-NEXT:    retq
938;
939; XOPAVX1-LABEL: test_bitreverse_v8i32:
940; XOPAVX1:       # %bb.0:
941; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
942; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
943; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
944; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
945; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
946; XOPAVX1-NEXT:    retq
947;
948; XOPAVX2-LABEL: test_bitreverse_v8i32:
949; XOPAVX2:       # %bb.0:
950; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
951; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
952; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
953; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
954; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
955; XOPAVX2-NEXT:    retq
956  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
957  ret <8 x i32> %b
958}
959
960define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
961; SSE2-LABEL: test_bitreverse_v4i64:
962; SSE2:       # %bb.0:
963; SSE2-NEXT:    movdqa %xmm1, %xmm2
964; SSE2-NEXT:    pxor %xmm4, %xmm4
965; SSE2-NEXT:    movdqa %xmm0, %xmm1
966; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
967; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
968; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
969; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
970; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
971; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
972; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
973; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
974; SSE2-NEXT:    packuswb %xmm1, %xmm0
975; SSE2-NEXT:    movdqa %xmm0, %xmm3
976; SSE2-NEXT:    psllw $4, %xmm3
977; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
978; SSE2-NEXT:    movdqa %xmm1, %xmm5
979; SSE2-NEXT:    pandn %xmm3, %xmm5
980; SSE2-NEXT:    psrlw $4, %xmm0
981; SSE2-NEXT:    pand %xmm1, %xmm0
982; SSE2-NEXT:    por %xmm5, %xmm0
983; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
984; SSE2-NEXT:    movdqa %xmm0, %xmm5
985; SSE2-NEXT:    pand %xmm3, %xmm5
986; SSE2-NEXT:    psllw $2, %xmm5
987; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
988; SSE2-NEXT:    pand %xmm8, %xmm0
989; SSE2-NEXT:    psrlw $2, %xmm0
990; SSE2-NEXT:    por %xmm5, %xmm0
991; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
992; SSE2-NEXT:    movdqa %xmm0, %xmm6
993; SSE2-NEXT:    pand %xmm5, %xmm6
994; SSE2-NEXT:    paddb %xmm6, %xmm6
995; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
996; SSE2-NEXT:    pand %xmm7, %xmm0
997; SSE2-NEXT:    psrlw $1, %xmm0
998; SSE2-NEXT:    por %xmm6, %xmm0
999; SSE2-NEXT:    movdqa %xmm2, %xmm6
1000; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
1001; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
1002; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1003; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1004; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1005; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
1006; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1007; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1008; SSE2-NEXT:    packuswb %xmm6, %xmm2
1009; SSE2-NEXT:    movdqa %xmm2, %xmm4
1010; SSE2-NEXT:    psllw $4, %xmm4
1011; SSE2-NEXT:    psrlw $4, %xmm2
1012; SSE2-NEXT:    pand %xmm1, %xmm2
1013; SSE2-NEXT:    pandn %xmm4, %xmm1
1014; SSE2-NEXT:    por %xmm2, %xmm1
1015; SSE2-NEXT:    pand %xmm1, %xmm3
1016; SSE2-NEXT:    psllw $2, %xmm3
1017; SSE2-NEXT:    pand %xmm8, %xmm1
1018; SSE2-NEXT:    psrlw $2, %xmm1
1019; SSE2-NEXT:    por %xmm3, %xmm1
1020; SSE2-NEXT:    pand %xmm1, %xmm5
1021; SSE2-NEXT:    paddb %xmm5, %xmm5
1022; SSE2-NEXT:    pand %xmm7, %xmm1
1023; SSE2-NEXT:    psrlw $1, %xmm1
1024; SSE2-NEXT:    por %xmm5, %xmm1
1025; SSE2-NEXT:    retq
1026;
1027; SSSE3-LABEL: test_bitreverse_v4i64:
1028; SSSE3:       # %bb.0:
1029; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1030; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1031; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1032; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1033; SSSE3-NEXT:    pand %xmm5, %xmm2
1034; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1035; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1036; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1037; SSSE3-NEXT:    psrlw $4, %xmm0
1038; SSSE3-NEXT:    pand %xmm5, %xmm0
1039; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1040; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1041; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1042; SSSE3-NEXT:    por %xmm7, %xmm3
1043; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1044; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1045; SSSE3-NEXT:    pand %xmm5, %xmm0
1046; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1047; SSSE3-NEXT:    psrlw $4, %xmm1
1048; SSSE3-NEXT:    pand %xmm5, %xmm1
1049; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1050; SSSE3-NEXT:    por %xmm6, %xmm2
1051; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1052; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1053; SSSE3-NEXT:    retq
1054;
1055; AVX1-LABEL: test_bitreverse_v4i64:
1056; AVX1:       # %bb.0:
1057; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1058; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1059; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1060; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1061; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1062; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1063; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1064; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1065; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1066; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1067; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1068; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1069; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1070; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1071; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1072; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1073; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1074; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1075; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1076; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1077; AVX1-NEXT:    retq
1078;
1079; AVX2-LABEL: test_bitreverse_v4i64:
1080; AVX2:       # %bb.0:
1081; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1082; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1083; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1084; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1085; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1086; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1087; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1088; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1089; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1090; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1091; AVX2-NEXT:    retq
1092;
1093; AVX512-LABEL: test_bitreverse_v4i64:
1094; AVX512:       # %bb.0:
1095; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1096; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1097; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1098; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1099; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1100; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1101; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1102; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1103; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1104; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1105; AVX512-NEXT:    retq
1106;
1107; XOPAVX1-LABEL: test_bitreverse_v4i64:
1108; XOPAVX1:       # %bb.0:
1109; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1110; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1111; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1112; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1113; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1114; XOPAVX1-NEXT:    retq
1115;
1116; XOPAVX2-LABEL: test_bitreverse_v4i64:
1117; XOPAVX2:       # %bb.0:
1118; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1119; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1120; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1121; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1122; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1123; XOPAVX2-NEXT:    retq
1124  %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
1125  ret <4 x i64> %b
1126}
1127
1128define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
1129; SSE2-LABEL: test_bitreverse_v64i8:
1130; SSE2:       # %bb.0:
1131; SSE2-NEXT:    movdqa %xmm3, %xmm10
1132; SSE2-NEXT:    movdqa %xmm0, %xmm5
1133; SSE2-NEXT:    psllw $4, %xmm5
1134; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1135; SSE2-NEXT:    movdqa %xmm3, %xmm6
1136; SSE2-NEXT:    pandn %xmm5, %xmm6
1137; SSE2-NEXT:    psrlw $4, %xmm0
1138; SSE2-NEXT:    pand %xmm3, %xmm0
1139; SSE2-NEXT:    por %xmm6, %xmm0
1140; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1141; SSE2-NEXT:    movdqa %xmm0, %xmm6
1142; SSE2-NEXT:    pand %xmm5, %xmm6
1143; SSE2-NEXT:    psllw $2, %xmm6
1144; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1145; SSE2-NEXT:    pand %xmm8, %xmm0
1146; SSE2-NEXT:    psrlw $2, %xmm0
1147; SSE2-NEXT:    por %xmm6, %xmm0
1148; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1149; SSE2-NEXT:    movdqa %xmm0, %xmm7
1150; SSE2-NEXT:    pand %xmm6, %xmm7
1151; SSE2-NEXT:    paddb %xmm7, %xmm7
1152; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1153; SSE2-NEXT:    pand %xmm9, %xmm0
1154; SSE2-NEXT:    psrlw $1, %xmm0
1155; SSE2-NEXT:    por %xmm7, %xmm0
1156; SSE2-NEXT:    movdqa %xmm1, %xmm7
1157; SSE2-NEXT:    psllw $4, %xmm7
1158; SSE2-NEXT:    movdqa %xmm3, %xmm4
1159; SSE2-NEXT:    pandn %xmm7, %xmm4
1160; SSE2-NEXT:    psrlw $4, %xmm1
1161; SSE2-NEXT:    pand %xmm3, %xmm1
1162; SSE2-NEXT:    por %xmm4, %xmm1
1163; SSE2-NEXT:    movdqa %xmm1, %xmm4
1164; SSE2-NEXT:    pand %xmm5, %xmm4
1165; SSE2-NEXT:    psllw $2, %xmm4
1166; SSE2-NEXT:    pand %xmm8, %xmm1
1167; SSE2-NEXT:    psrlw $2, %xmm1
1168; SSE2-NEXT:    por %xmm4, %xmm1
1169; SSE2-NEXT:    movdqa %xmm1, %xmm4
1170; SSE2-NEXT:    pand %xmm6, %xmm4
1171; SSE2-NEXT:    paddb %xmm4, %xmm4
1172; SSE2-NEXT:    pand %xmm9, %xmm1
1173; SSE2-NEXT:    psrlw $1, %xmm1
1174; SSE2-NEXT:    por %xmm4, %xmm1
1175; SSE2-NEXT:    movdqa %xmm2, %xmm4
1176; SSE2-NEXT:    psllw $4, %xmm4
1177; SSE2-NEXT:    movdqa %xmm3, %xmm7
1178; SSE2-NEXT:    pandn %xmm4, %xmm7
1179; SSE2-NEXT:    psrlw $4, %xmm2
1180; SSE2-NEXT:    pand %xmm3, %xmm2
1181; SSE2-NEXT:    por %xmm7, %xmm2
1182; SSE2-NEXT:    movdqa %xmm2, %xmm4
1183; SSE2-NEXT:    pand %xmm5, %xmm4
1184; SSE2-NEXT:    psllw $2, %xmm4
1185; SSE2-NEXT:    pand %xmm8, %xmm2
1186; SSE2-NEXT:    psrlw $2, %xmm2
1187; SSE2-NEXT:    por %xmm4, %xmm2
1188; SSE2-NEXT:    movdqa %xmm2, %xmm4
1189; SSE2-NEXT:    pand %xmm6, %xmm4
1190; SSE2-NEXT:    paddb %xmm4, %xmm4
1191; SSE2-NEXT:    pand %xmm9, %xmm2
1192; SSE2-NEXT:    psrlw $1, %xmm2
1193; SSE2-NEXT:    por %xmm4, %xmm2
1194; SSE2-NEXT:    movdqa %xmm10, %xmm4
1195; SSE2-NEXT:    psllw $4, %xmm4
1196; SSE2-NEXT:    psrlw $4, %xmm10
1197; SSE2-NEXT:    pand %xmm3, %xmm10
1198; SSE2-NEXT:    pandn %xmm4, %xmm3
1199; SSE2-NEXT:    por %xmm10, %xmm3
1200; SSE2-NEXT:    pand %xmm3, %xmm5
1201; SSE2-NEXT:    psllw $2, %xmm5
1202; SSE2-NEXT:    pand %xmm8, %xmm3
1203; SSE2-NEXT:    psrlw $2, %xmm3
1204; SSE2-NEXT:    por %xmm5, %xmm3
1205; SSE2-NEXT:    pand %xmm3, %xmm6
1206; SSE2-NEXT:    paddb %xmm6, %xmm6
1207; SSE2-NEXT:    pand %xmm9, %xmm3
1208; SSE2-NEXT:    psrlw $1, %xmm3
1209; SSE2-NEXT:    por %xmm6, %xmm3
1210; SSE2-NEXT:    retq
1211;
1212; SSSE3-LABEL: test_bitreverse_v64i8:
1213; SSSE3:       # %bb.0:
1214; SSSE3-NEXT:    movdqa %xmm0, %xmm5
1215; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1216; SSSE3-NEXT:    pand %xmm8, %xmm0
1217; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1218; SSSE3-NEXT:    movdqa %xmm9, %xmm6
1219; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1220; SSSE3-NEXT:    psrlw $4, %xmm5
1221; SSSE3-NEXT:    pand %xmm8, %xmm5
1222; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1223; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1224; SSSE3-NEXT:    pshufb %xmm5, %xmm0
1225; SSSE3-NEXT:    por %xmm6, %xmm0
1226; SSSE3-NEXT:    movdqa %xmm1, %xmm5
1227; SSSE3-NEXT:    pand %xmm8, %xmm5
1228; SSSE3-NEXT:    movdqa %xmm9, %xmm6
1229; SSSE3-NEXT:    pshufb %xmm5, %xmm6
1230; SSSE3-NEXT:    psrlw $4, %xmm1
1231; SSSE3-NEXT:    pand %xmm8, %xmm1
1232; SSSE3-NEXT:    movdqa %xmm4, %xmm5
1233; SSSE3-NEXT:    pshufb %xmm1, %xmm5
1234; SSSE3-NEXT:    por %xmm6, %xmm5
1235; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1236; SSSE3-NEXT:    pand %xmm8, %xmm1
1237; SSSE3-NEXT:    movdqa %xmm9, %xmm7
1238; SSSE3-NEXT:    pshufb %xmm1, %xmm7
1239; SSSE3-NEXT:    psrlw $4, %xmm2
1240; SSSE3-NEXT:    pand %xmm8, %xmm2
1241; SSSE3-NEXT:    movdqa %xmm4, %xmm6
1242; SSSE3-NEXT:    pshufb %xmm2, %xmm6
1243; SSSE3-NEXT:    por %xmm7, %xmm6
1244; SSSE3-NEXT:    movdqa %xmm3, %xmm1
1245; SSSE3-NEXT:    pand %xmm8, %xmm1
1246; SSSE3-NEXT:    pshufb %xmm1, %xmm9
1247; SSSE3-NEXT:    psrlw $4, %xmm3
1248; SSSE3-NEXT:    pand %xmm8, %xmm3
1249; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1250; SSSE3-NEXT:    por %xmm9, %xmm4
1251; SSSE3-NEXT:    movdqa %xmm5, %xmm1
1252; SSSE3-NEXT:    movdqa %xmm6, %xmm2
1253; SSSE3-NEXT:    movdqa %xmm4, %xmm3
1254; SSSE3-NEXT:    retq
1255;
1256; AVX1-LABEL: test_bitreverse_v64i8:
1257; AVX1:       # %bb.0:
1258; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1259; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1260; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
1261; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1262; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1263; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1264; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1265; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1266; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
1267; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
1268; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm4
1269; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1270; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1271; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1272; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1273; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
1274; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1275; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1276; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
1277; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1278; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1279; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1280; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
1281; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
1282; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1283; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1284; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1285; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1286; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1287; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1288; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1289; AVX1-NEXT:    retq
1290;
1291; AVX2-LABEL: test_bitreverse_v64i8:
1292; AVX2:       # %bb.0:
1293; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1294; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
1295; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1296; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
1297; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1298; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
1299; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1300; AVX2-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
1301; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
1302; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
1303; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
1304; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
1305; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
1306; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
1307; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
1308; AVX2-NEXT:    retq
1309;
1310; AVX512F-LABEL: test_bitreverse_v64i8:
1311; AVX512F:       # %bb.0:
1312; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1313; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1314; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
1315; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1316; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
1317; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm5
1318; AVX512F-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
1319; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
1320; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
1321; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
1322; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1323; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
1324; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
1325; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
1326; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
1327; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1328; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm0
1329; AVX512F-NEXT:    retq
1330;
1331; AVX512BW-LABEL: test_bitreverse_v64i8:
1332; AVX512BW:       # %bb.0:
1333; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1334; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
1335; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1336; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
1337; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
1338; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
1339; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1340; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
1341; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
1342; AVX512BW-NEXT:    retq
1343;
1344; XOPAVX1-LABEL: test_bitreverse_v64i8:
1345; XOPAVX1:       # %bb.0:
1346; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1347; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1348; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1349; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
1350; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1351; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1352; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1353; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
1354; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1355; XOPAVX1-NEXT:    retq
1356;
1357; XOPAVX2-LABEL: test_bitreverse_v64i8:
1358; XOPAVX2:       # %bb.0:
1359; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
1360; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1361; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1362; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
1363; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
1364; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
1365; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1366; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
1367; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
1368; XOPAVX2-NEXT:    retq
1369  %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
1370  ret <64 x i8> %b
1371}
1372
1373define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
1374; SSE2-LABEL: test_bitreverse_v32i16:
1375; SSE2:       # %bb.0:
1376; SSE2-NEXT:    movdqa %xmm3, %xmm4
1377; SSE2-NEXT:    movdqa %xmm0, %xmm3
1378; SSE2-NEXT:    psrlw $8, %xmm3
1379; SSE2-NEXT:    psllw $8, %xmm0
1380; SSE2-NEXT:    por %xmm3, %xmm0
1381; SSE2-NEXT:    movdqa %xmm0, %xmm5
1382; SSE2-NEXT:    psllw $4, %xmm5
1383; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1384; SSE2-NEXT:    movdqa %xmm3, %xmm6
1385; SSE2-NEXT:    pandn %xmm5, %xmm6
1386; SSE2-NEXT:    psrlw $4, %xmm0
1387; SSE2-NEXT:    pand %xmm3, %xmm0
1388; SSE2-NEXT:    por %xmm6, %xmm0
1389; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1390; SSE2-NEXT:    movdqa %xmm0, %xmm6
1391; SSE2-NEXT:    pand %xmm10, %xmm6
1392; SSE2-NEXT:    psllw $2, %xmm6
1393; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1394; SSE2-NEXT:    pand %xmm8, %xmm0
1395; SSE2-NEXT:    psrlw $2, %xmm0
1396; SSE2-NEXT:    por %xmm6, %xmm0
1397; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1398; SSE2-NEXT:    movdqa %xmm0, %xmm7
1399; SSE2-NEXT:    pand %xmm6, %xmm7
1400; SSE2-NEXT:    paddb %xmm7, %xmm7
1401; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1402; SSE2-NEXT:    pand %xmm9, %xmm0
1403; SSE2-NEXT:    psrlw $1, %xmm0
1404; SSE2-NEXT:    por %xmm7, %xmm0
1405; SSE2-NEXT:    movdqa %xmm1, %xmm7
1406; SSE2-NEXT:    psrlw $8, %xmm7
1407; SSE2-NEXT:    psllw $8, %xmm1
1408; SSE2-NEXT:    por %xmm7, %xmm1
1409; SSE2-NEXT:    movdqa %xmm1, %xmm7
1410; SSE2-NEXT:    psllw $4, %xmm7
1411; SSE2-NEXT:    movdqa %xmm3, %xmm5
1412; SSE2-NEXT:    pandn %xmm7, %xmm5
1413; SSE2-NEXT:    psrlw $4, %xmm1
1414; SSE2-NEXT:    pand %xmm3, %xmm1
1415; SSE2-NEXT:    por %xmm5, %xmm1
1416; SSE2-NEXT:    movdqa %xmm1, %xmm5
1417; SSE2-NEXT:    pand %xmm10, %xmm5
1418; SSE2-NEXT:    psllw $2, %xmm5
1419; SSE2-NEXT:    pand %xmm8, %xmm1
1420; SSE2-NEXT:    psrlw $2, %xmm1
1421; SSE2-NEXT:    por %xmm5, %xmm1
1422; SSE2-NEXT:    movdqa %xmm1, %xmm5
1423; SSE2-NEXT:    pand %xmm6, %xmm5
1424; SSE2-NEXT:    paddb %xmm5, %xmm5
1425; SSE2-NEXT:    pand %xmm9, %xmm1
1426; SSE2-NEXT:    psrlw $1, %xmm1
1427; SSE2-NEXT:    por %xmm5, %xmm1
1428; SSE2-NEXT:    movdqa %xmm2, %xmm5
1429; SSE2-NEXT:    psrlw $8, %xmm5
1430; SSE2-NEXT:    psllw $8, %xmm2
1431; SSE2-NEXT:    por %xmm5, %xmm2
1432; SSE2-NEXT:    movdqa %xmm2, %xmm5
1433; SSE2-NEXT:    psllw $4, %xmm5
1434; SSE2-NEXT:    movdqa %xmm3, %xmm7
1435; SSE2-NEXT:    pandn %xmm5, %xmm7
1436; SSE2-NEXT:    psrlw $4, %xmm2
1437; SSE2-NEXT:    pand %xmm3, %xmm2
1438; SSE2-NEXT:    por %xmm7, %xmm2
1439; SSE2-NEXT:    movdqa %xmm2, %xmm5
1440; SSE2-NEXT:    pand %xmm10, %xmm5
1441; SSE2-NEXT:    psllw $2, %xmm5
1442; SSE2-NEXT:    pand %xmm8, %xmm2
1443; SSE2-NEXT:    psrlw $2, %xmm2
1444; SSE2-NEXT:    por %xmm5, %xmm2
1445; SSE2-NEXT:    movdqa %xmm2, %xmm5
1446; SSE2-NEXT:    pand %xmm6, %xmm5
1447; SSE2-NEXT:    paddb %xmm5, %xmm5
1448; SSE2-NEXT:    pand %xmm9, %xmm2
1449; SSE2-NEXT:    psrlw $1, %xmm2
1450; SSE2-NEXT:    por %xmm5, %xmm2
1451; SSE2-NEXT:    movdqa %xmm4, %xmm5
1452; SSE2-NEXT:    psrlw $8, %xmm5
1453; SSE2-NEXT:    psllw $8, %xmm4
1454; SSE2-NEXT:    por %xmm5, %xmm4
1455; SSE2-NEXT:    movdqa %xmm4, %xmm5
1456; SSE2-NEXT:    psllw $4, %xmm5
1457; SSE2-NEXT:    psrlw $4, %xmm4
1458; SSE2-NEXT:    pand %xmm3, %xmm4
1459; SSE2-NEXT:    pandn %xmm5, %xmm3
1460; SSE2-NEXT:    por %xmm4, %xmm3
1461; SSE2-NEXT:    pand %xmm3, %xmm10
1462; SSE2-NEXT:    psllw $2, %xmm10
1463; SSE2-NEXT:    pand %xmm8, %xmm3
1464; SSE2-NEXT:    psrlw $2, %xmm3
1465; SSE2-NEXT:    por %xmm10, %xmm3
1466; SSE2-NEXT:    pand %xmm3, %xmm6
1467; SSE2-NEXT:    paddb %xmm6, %xmm6
1468; SSE2-NEXT:    pand %xmm9, %xmm3
1469; SSE2-NEXT:    psrlw $1, %xmm3
1470; SSE2-NEXT:    por %xmm6, %xmm3
1471; SSE2-NEXT:    retq
1472;
1473; SSSE3-LABEL: test_bitreverse_v32i16:
1474; SSSE3:       # %bb.0:
1475; SSSE3-NEXT:    movdqa %xmm1, %xmm5
1476; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1477; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1478; SSSE3-NEXT:    pshufb %xmm8, %xmm1
1479; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1480; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1481; SSSE3-NEXT:    pand %xmm9, %xmm0
1482; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1483; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1484; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1485; SSSE3-NEXT:    psrlw $4, %xmm1
1486; SSSE3-NEXT:    pand %xmm9, %xmm1
1487; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1488; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1489; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1490; SSSE3-NEXT:    por %xmm6, %xmm0
1491; SSSE3-NEXT:    pshufb %xmm8, %xmm5
1492; SSSE3-NEXT:    movdqa %xmm5, %xmm1
1493; SSSE3-NEXT:    pand %xmm9, %xmm1
1494; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1495; SSSE3-NEXT:    pshufb %xmm1, %xmm6
1496; SSSE3-NEXT:    psrlw $4, %xmm5
1497; SSSE3-NEXT:    pand %xmm9, %xmm5
1498; SSSE3-NEXT:    movdqa %xmm4, %xmm1
1499; SSSE3-NEXT:    pshufb %xmm5, %xmm1
1500; SSSE3-NEXT:    por %xmm6, %xmm1
1501; SSSE3-NEXT:    pshufb %xmm8, %xmm2
1502; SSSE3-NEXT:    movdqa %xmm2, %xmm5
1503; SSSE3-NEXT:    pand %xmm9, %xmm5
1504; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1505; SSSE3-NEXT:    pshufb %xmm5, %xmm6
1506; SSSE3-NEXT:    psrlw $4, %xmm2
1507; SSSE3-NEXT:    pand %xmm9, %xmm2
1508; SSSE3-NEXT:    movdqa %xmm4, %xmm5
1509; SSSE3-NEXT:    pshufb %xmm2, %xmm5
1510; SSSE3-NEXT:    por %xmm6, %xmm5
1511; SSSE3-NEXT:    pshufb %xmm8, %xmm3
1512; SSSE3-NEXT:    movdqa %xmm3, %xmm2
1513; SSSE3-NEXT:    pand %xmm9, %xmm2
1514; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1515; SSSE3-NEXT:    psrlw $4, %xmm3
1516; SSSE3-NEXT:    pand %xmm9, %xmm3
1517; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1518; SSSE3-NEXT:    por %xmm7, %xmm4
1519; SSSE3-NEXT:    movdqa %xmm5, %xmm2
1520; SSSE3-NEXT:    movdqa %xmm4, %xmm3
1521; SSSE3-NEXT:    retq
1522;
1523; AVX1-LABEL: test_bitreverse_v32i16:
1524; AVX1:       # %bb.0:
1525; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1526; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1527; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1528; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1529; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
1530; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1531; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1532; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1533; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
1534; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1535; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
1536; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
1537; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1538; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
1539; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1540; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1541; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
1542; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
1543; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
1544; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1545; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1546; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1547; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
1548; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1549; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1550; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
1551; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
1552; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
1553; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1554; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
1555; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
1556; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1557; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
1558; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
1559; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
1560; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1561; AVX1-NEXT:    retq
1562;
1563; AVX2-LABEL: test_bitreverse_v32i16:
1564; AVX2:       # %bb.0:
1565; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1566; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1567; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1568; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
1569; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1570; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
1571; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1572; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
1573; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1574; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
1575; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
1576; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1577; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
1578; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
1579; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
1580; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
1581; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
1582; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
1583; AVX2-NEXT:    retq
1584;
1585; AVX512F-LABEL: test_bitreverse_v32i16:
1586; AVX512F:       # %bb.0:
1587; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1588; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1589; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1590; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1591; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
1592; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1593; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
1594; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1595; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
1596; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
1597; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
1598; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
1599; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
1600; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1601; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
1602; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
1603; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
1604; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
1605; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1606; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
1607; AVX512F-NEXT:    retq
1608;
1609; AVX512BW-LABEL: test_bitreverse_v32i16:
1610; AVX512BW:       # %bb.0:
1611; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
1612; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1613; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
1614; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1615; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
1616; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
1617; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
1618; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1619; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
1620; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
1621; AVX512BW-NEXT:    retq
1622;
1623; XOPAVX1-LABEL: test_bitreverse_v32i16:
1624; XOPAVX1:       # %bb.0:
1625; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1626; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1627; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1628; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
1629; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1630; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1631; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1632; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
1633; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1634; XOPAVX1-NEXT:    retq
1635;
1636; XOPAVX2-LABEL: test_bitreverse_v32i16:
1637; XOPAVX2:       # %bb.0:
1638; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
1639; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1640; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1641; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
1642; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
1643; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
1644; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1645; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
1646; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
1647; XOPAVX2-NEXT:    retq
1648  %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
1649  ret <32 x i16> %b
1650}
1651
1652define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
1653; SSE2-LABEL: test_bitreverse_v16i32:
1654; SSE2:       # %bb.0:
1655; SSE2-NEXT:    movdqa %xmm3, %xmm11
1656; SSE2-NEXT:    pxor %xmm10, %xmm10
1657; SSE2-NEXT:    movdqa %xmm0, %xmm3
1658; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
1659; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1660; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
1661; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
1662; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1663; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1664; SSE2-NEXT:    packuswb %xmm3, %xmm0
1665; SSE2-NEXT:    movdqa %xmm0, %xmm5
1666; SSE2-NEXT:    psllw $4, %xmm5
1667; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1668; SSE2-NEXT:    movdqa %xmm3, %xmm7
1669; SSE2-NEXT:    pandn %xmm5, %xmm7
1670; SSE2-NEXT:    psrlw $4, %xmm0
1671; SSE2-NEXT:    pand %xmm3, %xmm0
1672; SSE2-NEXT:    por %xmm7, %xmm0
1673; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1674; SSE2-NEXT:    movdqa %xmm0, %xmm7
1675; SSE2-NEXT:    pand %xmm5, %xmm7
1676; SSE2-NEXT:    psllw $2, %xmm7
1677; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1678; SSE2-NEXT:    pand %xmm8, %xmm0
1679; SSE2-NEXT:    psrlw $2, %xmm0
1680; SSE2-NEXT:    por %xmm7, %xmm0
1681; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1682; SSE2-NEXT:    movdqa %xmm0, %xmm6
1683; SSE2-NEXT:    pand %xmm7, %xmm6
1684; SSE2-NEXT:    paddb %xmm6, %xmm6
1685; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1686; SSE2-NEXT:    pand %xmm9, %xmm0
1687; SSE2-NEXT:    psrlw $1, %xmm0
1688; SSE2-NEXT:    por %xmm6, %xmm0
1689; SSE2-NEXT:    movdqa %xmm1, %xmm6
1690; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
1691; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1692; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1693; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
1694; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1695; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1696; SSE2-NEXT:    packuswb %xmm6, %xmm1
1697; SSE2-NEXT:    movdqa %xmm1, %xmm6
1698; SSE2-NEXT:    psllw $4, %xmm6
1699; SSE2-NEXT:    movdqa %xmm3, %xmm4
1700; SSE2-NEXT:    pandn %xmm6, %xmm4
1701; SSE2-NEXT:    psrlw $4, %xmm1
1702; SSE2-NEXT:    pand %xmm3, %xmm1
1703; SSE2-NEXT:    por %xmm4, %xmm1
1704; SSE2-NEXT:    movdqa %xmm1, %xmm4
1705; SSE2-NEXT:    pand %xmm5, %xmm4
1706; SSE2-NEXT:    psllw $2, %xmm4
1707; SSE2-NEXT:    pand %xmm8, %xmm1
1708; SSE2-NEXT:    psrlw $2, %xmm1
1709; SSE2-NEXT:    por %xmm4, %xmm1
1710; SSE2-NEXT:    movdqa %xmm1, %xmm4
1711; SSE2-NEXT:    pand %xmm7, %xmm4
1712; SSE2-NEXT:    paddb %xmm4, %xmm4
1713; SSE2-NEXT:    pand %xmm9, %xmm1
1714; SSE2-NEXT:    psrlw $1, %xmm1
1715; SSE2-NEXT:    por %xmm4, %xmm1
1716; SSE2-NEXT:    movdqa %xmm2, %xmm4
1717; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
1718; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
1719; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
1720; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
1721; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1722; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1723; SSE2-NEXT:    packuswb %xmm4, %xmm2
1724; SSE2-NEXT:    movdqa %xmm2, %xmm4
1725; SSE2-NEXT:    psllw $4, %xmm4
1726; SSE2-NEXT:    movdqa %xmm3, %xmm6
1727; SSE2-NEXT:    pandn %xmm4, %xmm6
1728; SSE2-NEXT:    psrlw $4, %xmm2
1729; SSE2-NEXT:    pand %xmm3, %xmm2
1730; SSE2-NEXT:    por %xmm6, %xmm2
1731; SSE2-NEXT:    movdqa %xmm2, %xmm4
1732; SSE2-NEXT:    pand %xmm5, %xmm4
1733; SSE2-NEXT:    psllw $2, %xmm4
1734; SSE2-NEXT:    pand %xmm8, %xmm2
1735; SSE2-NEXT:    psrlw $2, %xmm2
1736; SSE2-NEXT:    por %xmm4, %xmm2
1737; SSE2-NEXT:    movdqa %xmm2, %xmm4
1738; SSE2-NEXT:    pand %xmm7, %xmm4
1739; SSE2-NEXT:    paddb %xmm4, %xmm4
1740; SSE2-NEXT:    pand %xmm9, %xmm2
1741; SSE2-NEXT:    psrlw $1, %xmm2
1742; SSE2-NEXT:    por %xmm4, %xmm2
1743; SSE2-NEXT:    movdqa %xmm11, %xmm4
1744; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
1745; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
1746; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
1747; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
1748; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm11[3,2,1,0,4,5,6,7]
1749; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1750; SSE2-NEXT:    packuswb %xmm4, %xmm6
1751; SSE2-NEXT:    movdqa %xmm6, %xmm4
1752; SSE2-NEXT:    psllw $4, %xmm4
1753; SSE2-NEXT:    psrlw $4, %xmm6
1754; SSE2-NEXT:    pand %xmm3, %xmm6
1755; SSE2-NEXT:    pandn %xmm4, %xmm3
1756; SSE2-NEXT:    por %xmm6, %xmm3
1757; SSE2-NEXT:    pand %xmm3, %xmm5
1758; SSE2-NEXT:    psllw $2, %xmm5
1759; SSE2-NEXT:    pand %xmm8, %xmm3
1760; SSE2-NEXT:    psrlw $2, %xmm3
1761; SSE2-NEXT:    por %xmm5, %xmm3
1762; SSE2-NEXT:    pand %xmm3, %xmm7
1763; SSE2-NEXT:    paddb %xmm7, %xmm7
1764; SSE2-NEXT:    pand %xmm9, %xmm3
1765; SSE2-NEXT:    psrlw $1, %xmm3
1766; SSE2-NEXT:    por %xmm7, %xmm3
1767; SSE2-NEXT:    retq
1768;
1769; SSSE3-LABEL: test_bitreverse_v16i32:
1770; SSSE3:       # %bb.0:
1771; SSSE3-NEXT:    movdqa %xmm1, %xmm5
1772; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1773; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1774; SSSE3-NEXT:    pshufb %xmm8, %xmm1
1775; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1776; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1777; SSSE3-NEXT:    pand %xmm9, %xmm0
1778; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1779; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1780; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1781; SSSE3-NEXT:    psrlw $4, %xmm1
1782; SSSE3-NEXT:    pand %xmm9, %xmm1
1783; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1784; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1785; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1786; SSSE3-NEXT:    por %xmm6, %xmm0
1787; SSSE3-NEXT:    pshufb %xmm8, %xmm5
1788; SSSE3-NEXT:    movdqa %xmm5, %xmm1
1789; SSSE3-NEXT:    pand %xmm9, %xmm1
1790; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1791; SSSE3-NEXT:    pshufb %xmm1, %xmm6
1792; SSSE3-NEXT:    psrlw $4, %xmm5
1793; SSSE3-NEXT:    pand %xmm9, %xmm5
1794; SSSE3-NEXT:    movdqa %xmm4, %xmm1
1795; SSSE3-NEXT:    pshufb %xmm5, %xmm1
1796; SSSE3-NEXT:    por %xmm6, %xmm1
1797; SSSE3-NEXT:    pshufb %xmm8, %xmm2
1798; SSSE3-NEXT:    movdqa %xmm2, %xmm5
1799; SSSE3-NEXT:    pand %xmm9, %xmm5
1800; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1801; SSSE3-NEXT:    pshufb %xmm5, %xmm6
1802; SSSE3-NEXT:    psrlw $4, %xmm2
1803; SSSE3-NEXT:    pand %xmm9, %xmm2
1804; SSSE3-NEXT:    movdqa %xmm4, %xmm5
1805; SSSE3-NEXT:    pshufb %xmm2, %xmm5
1806; SSSE3-NEXT:    por %xmm6, %xmm5
1807; SSSE3-NEXT:    pshufb %xmm8, %xmm3
1808; SSSE3-NEXT:    movdqa %xmm3, %xmm2
1809; SSSE3-NEXT:    pand %xmm9, %xmm2
1810; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1811; SSSE3-NEXT:    psrlw $4, %xmm3
1812; SSSE3-NEXT:    pand %xmm9, %xmm3
1813; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1814; SSSE3-NEXT:    por %xmm7, %xmm4
1815; SSSE3-NEXT:    movdqa %xmm5, %xmm2
1816; SSSE3-NEXT:    movdqa %xmm4, %xmm3
1817; SSSE3-NEXT:    retq
1818;
1819; AVX1-LABEL: test_bitreverse_v16i32:
1820; AVX1:       # %bb.0:
1821; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1822; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1823; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1824; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1825; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
1826; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1827; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1828; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1829; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
1830; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1831; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
1832; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
1833; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1834; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
1835; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1836; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1837; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
1838; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
1839; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
1840; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1841; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1842; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1843; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
1844; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1845; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1846; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
1847; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
1848; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
1849; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1850; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
1851; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
1852; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1853; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
1854; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
1855; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
1856; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1857; AVX1-NEXT:    retq
1858;
1859; AVX2-LABEL: test_bitreverse_v16i32:
1860; AVX2:       # %bb.0:
1861; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1862; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1863; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1864; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
1865; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1866; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
1867; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1868; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
1869; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1870; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
1871; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
1872; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1873; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
1874; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
1875; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
1876; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
1877; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
1878; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
1879; AVX2-NEXT:    retq
1880;
1881; AVX512F-LABEL: test_bitreverse_v16i32:
1882; AVX512F:       # %bb.0:
1883; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1884; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1885; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1886; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1887; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
1888; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1889; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
1890; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1891; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
1892; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
1893; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
1894; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
1895; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
1896; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1897; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
1898; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
1899; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
1900; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
1901; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1902; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
1903; AVX512F-NEXT:    retq
1904;
1905; AVX512BW-LABEL: test_bitreverse_v16i32:
1906; AVX512BW:       # %bb.0:
1907; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
1908; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1909; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
1910; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1911; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
1912; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
1913; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
1914; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1915; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
1916; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
1917; AVX512BW-NEXT:    retq
1918;
1919; XOPAVX1-LABEL: test_bitreverse_v16i32:
1920; XOPAVX1:       # %bb.0:
1921; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1922; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1923; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1924; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
1925; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1926; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1927; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1928; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
1929; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1930; XOPAVX1-NEXT:    retq
1931;
1932; XOPAVX2-LABEL: test_bitreverse_v16i32:
1933; XOPAVX2:       # %bb.0:
1934; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
1935; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1936; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1937; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
1938; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
1939; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
1940; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1941; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
1942; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
1943; XOPAVX2-NEXT:    retq
1944  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
1945  ret <16 x i32> %b
1946}
1947
1948define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
1949; SSE2-LABEL: test_bitreverse_v8i64:
1950; SSE2:       # %bb.0:
1951; SSE2-NEXT:    movdqa %xmm3, %xmm11
1952; SSE2-NEXT:    pxor %xmm10, %xmm10
1953; SSE2-NEXT:    movdqa %xmm0, %xmm3
1954; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
1955; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
1956; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1957; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
1958; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
1959; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1960; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1961; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1962; SSE2-NEXT:    packuswb %xmm3, %xmm0
1963; SSE2-NEXT:    movdqa %xmm0, %xmm5
1964; SSE2-NEXT:    psllw $4, %xmm5
1965; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1966; SSE2-NEXT:    movdqa %xmm3, %xmm7
1967; SSE2-NEXT:    pandn %xmm5, %xmm7
1968; SSE2-NEXT:    psrlw $4, %xmm0
1969; SSE2-NEXT:    pand %xmm3, %xmm0
1970; SSE2-NEXT:    por %xmm7, %xmm0
1971; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1972; SSE2-NEXT:    movdqa %xmm0, %xmm7
1973; SSE2-NEXT:    pand %xmm5, %xmm7
1974; SSE2-NEXT:    psllw $2, %xmm7
1975; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1976; SSE2-NEXT:    pand %xmm8, %xmm0
1977; SSE2-NEXT:    psrlw $2, %xmm0
1978; SSE2-NEXT:    por %xmm7, %xmm0
1979; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1980; SSE2-NEXT:    movdqa %xmm0, %xmm6
1981; SSE2-NEXT:    pand %xmm7, %xmm6
1982; SSE2-NEXT:    paddb %xmm6, %xmm6
1983; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1984; SSE2-NEXT:    pand %xmm9, %xmm0
1985; SSE2-NEXT:    psrlw $1, %xmm0
1986; SSE2-NEXT:    por %xmm6, %xmm0
1987; SSE2-NEXT:    movdqa %xmm1, %xmm6
1988; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
1989; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
1990; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1991; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1992; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
1993; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1994; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1995; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1996; SSE2-NEXT:    packuswb %xmm6, %xmm1
1997; SSE2-NEXT:    movdqa %xmm1, %xmm6
1998; SSE2-NEXT:    psllw $4, %xmm6
1999; SSE2-NEXT:    movdqa %xmm3, %xmm4
2000; SSE2-NEXT:    pandn %xmm6, %xmm4
2001; SSE2-NEXT:    psrlw $4, %xmm1
2002; SSE2-NEXT:    pand %xmm3, %xmm1
2003; SSE2-NEXT:    por %xmm4, %xmm1
2004; SSE2-NEXT:    movdqa %xmm1, %xmm4
2005; SSE2-NEXT:    pand %xmm5, %xmm4
2006; SSE2-NEXT:    psllw $2, %xmm4
2007; SSE2-NEXT:    pand %xmm8, %xmm1
2008; SSE2-NEXT:    psrlw $2, %xmm1
2009; SSE2-NEXT:    por %xmm4, %xmm1
2010; SSE2-NEXT:    movdqa %xmm1, %xmm4
2011; SSE2-NEXT:    pand %xmm7, %xmm4
2012; SSE2-NEXT:    paddb %xmm4, %xmm4
2013; SSE2-NEXT:    pand %xmm9, %xmm1
2014; SSE2-NEXT:    psrlw $1, %xmm1
2015; SSE2-NEXT:    por %xmm4, %xmm1
2016; SSE2-NEXT:    movdqa %xmm2, %xmm4
2017; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
2018; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2019; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2020; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2021; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
2022; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
2023; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
2024; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
2025; SSE2-NEXT:    packuswb %xmm4, %xmm2
2026; SSE2-NEXT:    movdqa %xmm2, %xmm4
2027; SSE2-NEXT:    psllw $4, %xmm4
2028; SSE2-NEXT:    movdqa %xmm3, %xmm6
2029; SSE2-NEXT:    pandn %xmm4, %xmm6
2030; SSE2-NEXT:    psrlw $4, %xmm2
2031; SSE2-NEXT:    pand %xmm3, %xmm2
2032; SSE2-NEXT:    por %xmm6, %xmm2
2033; SSE2-NEXT:    movdqa %xmm2, %xmm4
2034; SSE2-NEXT:    pand %xmm5, %xmm4
2035; SSE2-NEXT:    psllw $2, %xmm4
2036; SSE2-NEXT:    pand %xmm8, %xmm2
2037; SSE2-NEXT:    psrlw $2, %xmm2
2038; SSE2-NEXT:    por %xmm4, %xmm2
2039; SSE2-NEXT:    movdqa %xmm2, %xmm4
2040; SSE2-NEXT:    pand %xmm7, %xmm4
2041; SSE2-NEXT:    paddb %xmm4, %xmm4
2042; SSE2-NEXT:    pand %xmm9, %xmm2
2043; SSE2-NEXT:    psrlw $1, %xmm2
2044; SSE2-NEXT:    por %xmm4, %xmm2
2045; SSE2-NEXT:    movdqa %xmm11, %xmm4
2046; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
2047; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2048; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2049; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2050; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
2051; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm11[2,3,0,1]
2052; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
2053; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
2054; SSE2-NEXT:    packuswb %xmm4, %xmm6
2055; SSE2-NEXT:    movdqa %xmm6, %xmm4
2056; SSE2-NEXT:    psllw $4, %xmm4
2057; SSE2-NEXT:    psrlw $4, %xmm6
2058; SSE2-NEXT:    pand %xmm3, %xmm6
2059; SSE2-NEXT:    pandn %xmm4, %xmm3
2060; SSE2-NEXT:    por %xmm6, %xmm3
2061; SSE2-NEXT:    pand %xmm3, %xmm5
2062; SSE2-NEXT:    psllw $2, %xmm5
2063; SSE2-NEXT:    pand %xmm8, %xmm3
2064; SSE2-NEXT:    psrlw $2, %xmm3
2065; SSE2-NEXT:    por %xmm5, %xmm3
2066; SSE2-NEXT:    pand %xmm3, %xmm7
2067; SSE2-NEXT:    paddb %xmm7, %xmm7
2068; SSE2-NEXT:    pand %xmm9, %xmm3
2069; SSE2-NEXT:    psrlw $1, %xmm3
2070; SSE2-NEXT:    por %xmm7, %xmm3
2071; SSE2-NEXT:    retq
2072;
2073; SSSE3-LABEL: test_bitreverse_v8i64:
2074; SSSE3:       # %bb.0:
2075; SSSE3-NEXT:    movdqa %xmm1, %xmm5
2076; SSSE3-NEXT:    movdqa %xmm0, %xmm1
2077; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2078; SSSE3-NEXT:    pshufb %xmm8, %xmm1
2079; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2080; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2081; SSSE3-NEXT:    pand %xmm9, %xmm0
2082; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2083; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2084; SSSE3-NEXT:    pshufb %xmm0, %xmm6
2085; SSSE3-NEXT:    psrlw $4, %xmm1
2086; SSSE3-NEXT:    pand %xmm9, %xmm1
2087; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2088; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2089; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2090; SSSE3-NEXT:    por %xmm6, %xmm0
2091; SSSE3-NEXT:    pshufb %xmm8, %xmm5
2092; SSSE3-NEXT:    movdqa %xmm5, %xmm1
2093; SSSE3-NEXT:    pand %xmm9, %xmm1
2094; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2095; SSSE3-NEXT:    pshufb %xmm1, %xmm6
2096; SSSE3-NEXT:    psrlw $4, %xmm5
2097; SSSE3-NEXT:    pand %xmm9, %xmm5
2098; SSSE3-NEXT:    movdqa %xmm4, %xmm1
2099; SSSE3-NEXT:    pshufb %xmm5, %xmm1
2100; SSSE3-NEXT:    por %xmm6, %xmm1
2101; SSSE3-NEXT:    pshufb %xmm8, %xmm2
2102; SSSE3-NEXT:    movdqa %xmm2, %xmm5
2103; SSSE3-NEXT:    pand %xmm9, %xmm5
2104; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2105; SSSE3-NEXT:    pshufb %xmm5, %xmm6
2106; SSSE3-NEXT:    psrlw $4, %xmm2
2107; SSSE3-NEXT:    pand %xmm9, %xmm2
2108; SSSE3-NEXT:    movdqa %xmm4, %xmm5
2109; SSSE3-NEXT:    pshufb %xmm2, %xmm5
2110; SSSE3-NEXT:    por %xmm6, %xmm5
2111; SSSE3-NEXT:    pshufb %xmm8, %xmm3
2112; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2113; SSSE3-NEXT:    pand %xmm9, %xmm2
2114; SSSE3-NEXT:    pshufb %xmm2, %xmm7
2115; SSSE3-NEXT:    psrlw $4, %xmm3
2116; SSSE3-NEXT:    pand %xmm9, %xmm3
2117; SSSE3-NEXT:    pshufb %xmm3, %xmm4
2118; SSSE3-NEXT:    por %xmm7, %xmm4
2119; SSSE3-NEXT:    movdqa %xmm5, %xmm2
2120; SSSE3-NEXT:    movdqa %xmm4, %xmm3
2121; SSSE3-NEXT:    retq
2122;
2123; AVX1-LABEL: test_bitreverse_v8i64:
2124; AVX1:       # %bb.0:
2125; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2126; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2127; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2128; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2129; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2130; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2131; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2132; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2133; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2134; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2135; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2136; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2137; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2138; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
2139; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2140; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
2141; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2142; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
2143; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
2144; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2145; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2146; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2147; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2148; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2149; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2150; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2151; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2152; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2153; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2154; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
2155; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
2156; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
2157; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2158; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
2159; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
2160; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2161; AVX1-NEXT:    retq
2162;
2163; AVX2-LABEL: test_bitreverse_v8i64:
2164; AVX2:       # %bb.0:
2165; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2166; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2167; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2168; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
2169; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2170; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2171; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2172; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
2173; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2174; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
2175; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
2176; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2177; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
2178; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2179; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
2180; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2181; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
2182; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
2183; AVX2-NEXT:    retq
2184;
2185; AVX512F-LABEL: test_bitreverse_v8i64:
2186; AVX512F:       # %bb.0:
2187; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2188; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2189; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2190; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2191; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
2192; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2193; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2194; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2195; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
2196; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2197; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
2198; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
2199; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
2200; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2201; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
2202; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
2203; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
2204; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
2205; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2206; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
2207; AVX512F-NEXT:    retq
2208;
2209; AVX512BW-LABEL: test_bitreverse_v8i64:
2210; AVX512BW:       # %bb.0:
2211; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
2212; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2213; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2214; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2215; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2216; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2217; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2218; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2219; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2220; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2221; AVX512BW-NEXT:    retq
2222;
2223; XOPAVX1-LABEL: test_bitreverse_v8i64:
2224; XOPAVX1:       # %bb.0:
2225; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2226; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
2227; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2228; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2229; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2230; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2231; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2232; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2233; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2234; XOPAVX1-NEXT:    retq
2235;
2236; XOPAVX2-LABEL: test_bitreverse_v8i64:
2237; XOPAVX2:       # %bb.0:
2238; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2239; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
2240; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2241; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2242; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2243; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2244; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2245; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2246; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2247; XOPAVX2-NEXT:    retq
2248  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
2249  ret <8 x i64> %b
2250}
2251
2252;
2253; Constant Folding
2254;
2255
2256define i32 @fold_bitreverse_i32() nounwind {
2257; ALL-LABEL: fold_bitreverse_i32:
2258; ALL:       # %bb.0:
2259; ALL-NEXT:    movl $16711935, %eax # imm = 0xFF00FF
2260; ALL-NEXT:    retq
2261  %b = call i32 @llvm.bitreverse.i32(i32 4278255360)
2262  ret i32 %b
2263}
2264
2265define <16 x i8> @fold_bitreverse_v16i8() nounwind {
2266; SSE-LABEL: fold_bitreverse_v16i8:
2267; SSE:       # %bb.0:
2268; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2269; SSE-NEXT:    retq
2270;
2271; AVX-LABEL: fold_bitreverse_v16i8:
2272; AVX:       # %bb.0:
2273; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2274; AVX-NEXT:    retq
2275;
2276; XOP-LABEL: fold_bitreverse_v16i8:
2277; XOP:       # %bb.0:
2278; XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2279; XOP-NEXT:    retq
2280  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
2281  ret <16 x i8> %b
2282}
2283
2284define <16 x i16> @fold_bitreverse_v16i16() nounwind {
2285; SSE-LABEL: fold_bitreverse_v16i16:
2286; SSE:       # %bb.0:
2287; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
2288; SSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
2289; SSE-NEXT:    retq
2290;
2291; AVX-LABEL: fold_bitreverse_v16i16:
2292; AVX:       # %bb.0:
2293; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
2294; AVX-NEXT:    retq
2295;
2296; XOP-LABEL: fold_bitreverse_v16i16:
2297; XOP:       # %bb.0:
2298; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
2299; XOP-NEXT:    retq
2300  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
2301  ret <16 x i16> %b
2302}
2303
2304define <16 x i32> @fold_bitreverse_v16i32() nounwind {
2305; SSE-LABEL: fold_bitreverse_v16i32:
2306; SSE:       # %bb.0:
2307; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
2308; SSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
2309; SSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
2310; SSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
2311; SSE-NEXT:    retq
2312;
2313; AVX1-LABEL: fold_bitreverse_v16i32:
2314; AVX1:       # %bb.0:
2315; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
2316; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
2317; AVX1-NEXT:    retq
2318;
2319; AVX2-LABEL: fold_bitreverse_v16i32:
2320; AVX2:       # %bb.0:
2321; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
2322; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
2323; AVX2-NEXT:    retq
2324;
2325; AVX512-LABEL: fold_bitreverse_v16i32:
2326; AVX512:       # %bb.0:
2327; AVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
2328; AVX512-NEXT:    retq
2329;
2330; XOP-LABEL: fold_bitreverse_v16i32:
2331; XOP:       # %bb.0:
2332; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
2333; XOP-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
2334; XOP-NEXT:    retq
2335  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
2336  ret <16 x i32> %b
2337}
2338
2339declare i8 @llvm.bitreverse.i8(i8) readnone
2340declare i16 @llvm.bitreverse.i16(i16) readnone
2341declare i32 @llvm.bitreverse.i32(i32) readnone
2342declare i64 @llvm.bitreverse.i64(i64) readnone
2343
2344declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
2345declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
2346declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
2347declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone
2348
2349declare <32 x i8>  @llvm.bitreverse.v32i8(<32 x i8>) readnone
2350declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
2351declare <8 x i32>  @llvm.bitreverse.v8i32(<8 x i32>) readnone
2352declare <4 x i64>  @llvm.bitreverse.v4i64(<4 x i64>) readnone
2353
2354declare <64 x i8>  @llvm.bitreverse.v64i8(<64 x i8>) readnone
2355declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
2356declare <16 x i32>  @llvm.bitreverse.v16i32(<16 x i32>) readnone
2357declare <8 x i64>  @llvm.bitreverse.v8i64(<8 x i64>) readnone
2358