1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512F
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512BW
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNISSE
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX
12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX2
13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512F
14; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512BW
15
16; Make sure we don't crash with avx512bw and xop
17; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw
18
19define i8 @test_bitreverse_i8(i8 %a) nounwind {
20; SSE-LABEL: test_bitreverse_i8:
21; SSE:       # %bb.0:
22; SSE-NEXT:    rolb $4, %dil
23; SSE-NEXT:    movl %edi, %eax
24; SSE-NEXT:    andb $51, %al
25; SSE-NEXT:    shlb $2, %al
26; SSE-NEXT:    shrb $2, %dil
27; SSE-NEXT:    andb $51, %dil
28; SSE-NEXT:    orb %dil, %al
29; SSE-NEXT:    movl %eax, %ecx
30; SSE-NEXT:    andb $85, %cl
31; SSE-NEXT:    addb %cl, %cl
32; SSE-NEXT:    shrb %al
33; SSE-NEXT:    andb $85, %al
34; SSE-NEXT:    orb %cl, %al
35; SSE-NEXT:    retq
36;
37; AVX-LABEL: test_bitreverse_i8:
38; AVX:       # %bb.0:
39; AVX-NEXT:    rolb $4, %dil
40; AVX-NEXT:    movl %edi, %eax
41; AVX-NEXT:    andb $51, %al
42; AVX-NEXT:    shlb $2, %al
43; AVX-NEXT:    shrb $2, %dil
44; AVX-NEXT:    andb $51, %dil
45; AVX-NEXT:    orb %dil, %al
46; AVX-NEXT:    movl %eax, %ecx
47; AVX-NEXT:    andb $85, %cl
48; AVX-NEXT:    addb %cl, %cl
49; AVX-NEXT:    shrb %al
50; AVX-NEXT:    andb $85, %al
51; AVX-NEXT:    orb %cl, %al
52; AVX-NEXT:    retq
53;
54; XOP-LABEL: test_bitreverse_i8:
55; XOP:       # %bb.0:
56; XOP-NEXT:    vmovd %edi, %xmm0
57; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
58; XOP-NEXT:    vmovd %xmm0, %eax
59; XOP-NEXT:    # kill: def $al killed $al killed $eax
60; XOP-NEXT:    retq
61;
62; GFNISSE-LABEL: test_bitreverse_i8:
63; GFNISSE:       # %bb.0:
64; GFNISSE-NEXT:    rolb $4, %dil
65; GFNISSE-NEXT:    movl %edi, %eax
66; GFNISSE-NEXT:    andb $51, %al
67; GFNISSE-NEXT:    shlb $2, %al
68; GFNISSE-NEXT:    shrb $2, %dil
69; GFNISSE-NEXT:    andb $51, %dil
70; GFNISSE-NEXT:    orb %dil, %al
71; GFNISSE-NEXT:    movl %eax, %ecx
72; GFNISSE-NEXT:    andb $85, %cl
73; GFNISSE-NEXT:    addb %cl, %cl
74; GFNISSE-NEXT:    shrb %al
75; GFNISSE-NEXT:    andb $85, %al
76; GFNISSE-NEXT:    orb %cl, %al
77; GFNISSE-NEXT:    retq
78;
79; GFNIAVX-LABEL: test_bitreverse_i8:
80; GFNIAVX:       # %bb.0:
81; GFNIAVX-NEXT:    rolb $4, %dil
82; GFNIAVX-NEXT:    movl %edi, %eax
83; GFNIAVX-NEXT:    andb $51, %al
84; GFNIAVX-NEXT:    shlb $2, %al
85; GFNIAVX-NEXT:    shrb $2, %dil
86; GFNIAVX-NEXT:    andb $51, %dil
87; GFNIAVX-NEXT:    orb %dil, %al
88; GFNIAVX-NEXT:    movl %eax, %ecx
89; GFNIAVX-NEXT:    andb $85, %cl
90; GFNIAVX-NEXT:    addb %cl, %cl
91; GFNIAVX-NEXT:    shrb %al
92; GFNIAVX-NEXT:    andb $85, %al
93; GFNIAVX-NEXT:    orb %cl, %al
94; GFNIAVX-NEXT:    retq
95;
96; GFNIAVX2-LABEL: test_bitreverse_i8:
97; GFNIAVX2:       # %bb.0:
98; GFNIAVX2-NEXT:    rolb $4, %dil
99; GFNIAVX2-NEXT:    movl %edi, %eax
100; GFNIAVX2-NEXT:    andb $51, %al
101; GFNIAVX2-NEXT:    shlb $2, %al
102; GFNIAVX2-NEXT:    shrb $2, %dil
103; GFNIAVX2-NEXT:    andb $51, %dil
104; GFNIAVX2-NEXT:    orb %dil, %al
105; GFNIAVX2-NEXT:    movl %eax, %ecx
106; GFNIAVX2-NEXT:    andb $85, %cl
107; GFNIAVX2-NEXT:    addb %cl, %cl
108; GFNIAVX2-NEXT:    shrb %al
109; GFNIAVX2-NEXT:    andb $85, %al
110; GFNIAVX2-NEXT:    orb %cl, %al
111; GFNIAVX2-NEXT:    retq
112;
113; GFNIAVX512F-LABEL: test_bitreverse_i8:
114; GFNIAVX512F:       # %bb.0:
115; GFNIAVX512F-NEXT:    rolb $4, %dil
116; GFNIAVX512F-NEXT:    movl %edi, %eax
117; GFNIAVX512F-NEXT:    andb $51, %al
118; GFNIAVX512F-NEXT:    shlb $2, %al
119; GFNIAVX512F-NEXT:    shrb $2, %dil
120; GFNIAVX512F-NEXT:    andb $51, %dil
121; GFNIAVX512F-NEXT:    orb %dil, %al
122; GFNIAVX512F-NEXT:    movl %eax, %ecx
123; GFNIAVX512F-NEXT:    andb $85, %cl
124; GFNIAVX512F-NEXT:    addb %cl, %cl
125; GFNIAVX512F-NEXT:    shrb %al
126; GFNIAVX512F-NEXT:    andb $85, %al
127; GFNIAVX512F-NEXT:    orb %cl, %al
128; GFNIAVX512F-NEXT:    retq
129;
130; GFNIAVX512BW-LABEL: test_bitreverse_i8:
131; GFNIAVX512BW:       # %bb.0:
132; GFNIAVX512BW-NEXT:    rolb $4, %dil
133; GFNIAVX512BW-NEXT:    movl %edi, %eax
134; GFNIAVX512BW-NEXT:    andb $51, %al
135; GFNIAVX512BW-NEXT:    shlb $2, %al
136; GFNIAVX512BW-NEXT:    shrb $2, %dil
137; GFNIAVX512BW-NEXT:    andb $51, %dil
138; GFNIAVX512BW-NEXT:    orb %dil, %al
139; GFNIAVX512BW-NEXT:    movl %eax, %ecx
140; GFNIAVX512BW-NEXT:    andb $85, %cl
141; GFNIAVX512BW-NEXT:    addb %cl, %cl
142; GFNIAVX512BW-NEXT:    shrb %al
143; GFNIAVX512BW-NEXT:    andb $85, %al
144; GFNIAVX512BW-NEXT:    orb %cl, %al
145; GFNIAVX512BW-NEXT:    retq
146  %b = call i8 @llvm.bitreverse.i8(i8 %a)
147  ret i8 %b
148}
149
150define i16 @test_bitreverse_i16(i16 %a) nounwind {
151; SSE-LABEL: test_bitreverse_i16:
152; SSE:       # %bb.0:
153; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
154; SSE-NEXT:    rolw $8, %di
155; SSE-NEXT:    movl %edi, %eax
156; SSE-NEXT:    andl $3855, %eax # imm = 0xF0F
157; SSE-NEXT:    shll $4, %eax
158; SSE-NEXT:    shrl $4, %edi
159; SSE-NEXT:    andl $3855, %edi # imm = 0xF0F
160; SSE-NEXT:    orl %eax, %edi
161; SSE-NEXT:    movl %edi, %eax
162; SSE-NEXT:    andl $13107, %eax # imm = 0x3333
163; SSE-NEXT:    shrl $2, %edi
164; SSE-NEXT:    andl $13107, %edi # imm = 0x3333
165; SSE-NEXT:    leal (%rdi,%rax,4), %eax
166; SSE-NEXT:    movl %eax, %ecx
167; SSE-NEXT:    andl $21845, %ecx # imm = 0x5555
168; SSE-NEXT:    shrl %eax
169; SSE-NEXT:    andl $21845, %eax # imm = 0x5555
170; SSE-NEXT:    leal (%rax,%rcx,2), %eax
171; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
172; SSE-NEXT:    retq
173;
174; AVX-LABEL: test_bitreverse_i16:
175; AVX:       # %bb.0:
176; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
177; AVX-NEXT:    rolw $8, %di
178; AVX-NEXT:    movl %edi, %eax
179; AVX-NEXT:    andl $3855, %eax # imm = 0xF0F
180; AVX-NEXT:    shll $4, %eax
181; AVX-NEXT:    shrl $4, %edi
182; AVX-NEXT:    andl $3855, %edi # imm = 0xF0F
183; AVX-NEXT:    orl %eax, %edi
184; AVX-NEXT:    movl %edi, %eax
185; AVX-NEXT:    andl $13107, %eax # imm = 0x3333
186; AVX-NEXT:    shrl $2, %edi
187; AVX-NEXT:    andl $13107, %edi # imm = 0x3333
188; AVX-NEXT:    leal (%rdi,%rax,4), %eax
189; AVX-NEXT:    movl %eax, %ecx
190; AVX-NEXT:    andl $21845, %ecx # imm = 0x5555
191; AVX-NEXT:    shrl %eax
192; AVX-NEXT:    andl $21845, %eax # imm = 0x5555
193; AVX-NEXT:    leal (%rax,%rcx,2), %eax
194; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
195; AVX-NEXT:    retq
196;
197; XOP-LABEL: test_bitreverse_i16:
198; XOP:       # %bb.0:
199; XOP-NEXT:    vmovd %edi, %xmm0
200; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
201; XOP-NEXT:    vmovd %xmm0, %eax
202; XOP-NEXT:    # kill: def $ax killed $ax killed $eax
203; XOP-NEXT:    retq
204;
205; GFNISSE-LABEL: test_bitreverse_i16:
206; GFNISSE:       # %bb.0:
207; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
208; GFNISSE-NEXT:    rolw $8, %di
209; GFNISSE-NEXT:    movl %edi, %eax
210; GFNISSE-NEXT:    andl $3855, %eax # imm = 0xF0F
211; GFNISSE-NEXT:    shll $4, %eax
212; GFNISSE-NEXT:    shrl $4, %edi
213; GFNISSE-NEXT:    andl $3855, %edi # imm = 0xF0F
214; GFNISSE-NEXT:    orl %eax, %edi
215; GFNISSE-NEXT:    movl %edi, %eax
216; GFNISSE-NEXT:    andl $13107, %eax # imm = 0x3333
217; GFNISSE-NEXT:    shrl $2, %edi
218; GFNISSE-NEXT:    andl $13107, %edi # imm = 0x3333
219; GFNISSE-NEXT:    leal (%rdi,%rax,4), %eax
220; GFNISSE-NEXT:    movl %eax, %ecx
221; GFNISSE-NEXT:    andl $21845, %ecx # imm = 0x5555
222; GFNISSE-NEXT:    shrl %eax
223; GFNISSE-NEXT:    andl $21845, %eax # imm = 0x5555
224; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
225; GFNISSE-NEXT:    # kill: def $ax killed $ax killed $eax
226; GFNISSE-NEXT:    retq
227;
228; GFNIAVX-LABEL: test_bitreverse_i16:
229; GFNIAVX:       # %bb.0:
230; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
231; GFNIAVX-NEXT:    rolw $8, %di
232; GFNIAVX-NEXT:    movl %edi, %eax
233; GFNIAVX-NEXT:    andl $3855, %eax # imm = 0xF0F
234; GFNIAVX-NEXT:    shll $4, %eax
235; GFNIAVX-NEXT:    shrl $4, %edi
236; GFNIAVX-NEXT:    andl $3855, %edi # imm = 0xF0F
237; GFNIAVX-NEXT:    orl %eax, %edi
238; GFNIAVX-NEXT:    movl %edi, %eax
239; GFNIAVX-NEXT:    andl $13107, %eax # imm = 0x3333
240; GFNIAVX-NEXT:    shrl $2, %edi
241; GFNIAVX-NEXT:    andl $13107, %edi # imm = 0x3333
242; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
243; GFNIAVX-NEXT:    movl %eax, %ecx
244; GFNIAVX-NEXT:    andl $21845, %ecx # imm = 0x5555
245; GFNIAVX-NEXT:    shrl %eax
246; GFNIAVX-NEXT:    andl $21845, %eax # imm = 0x5555
247; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
248; GFNIAVX-NEXT:    # kill: def $ax killed $ax killed $eax
249; GFNIAVX-NEXT:    retq
250;
251; GFNIAVX2-LABEL: test_bitreverse_i16:
252; GFNIAVX2:       # %bb.0:
253; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
254; GFNIAVX2-NEXT:    rolw $8, %di
255; GFNIAVX2-NEXT:    movl %edi, %eax
256; GFNIAVX2-NEXT:    andl $3855, %eax # imm = 0xF0F
257; GFNIAVX2-NEXT:    shll $4, %eax
258; GFNIAVX2-NEXT:    shrl $4, %edi
259; GFNIAVX2-NEXT:    andl $3855, %edi # imm = 0xF0F
260; GFNIAVX2-NEXT:    orl %eax, %edi
261; GFNIAVX2-NEXT:    movl %edi, %eax
262; GFNIAVX2-NEXT:    andl $13107, %eax # imm = 0x3333
263; GFNIAVX2-NEXT:    shrl $2, %edi
264; GFNIAVX2-NEXT:    andl $13107, %edi # imm = 0x3333
265; GFNIAVX2-NEXT:    leal (%rdi,%rax,4), %eax
266; GFNIAVX2-NEXT:    movl %eax, %ecx
267; GFNIAVX2-NEXT:    andl $21845, %ecx # imm = 0x5555
268; GFNIAVX2-NEXT:    shrl %eax
269; GFNIAVX2-NEXT:    andl $21845, %eax # imm = 0x5555
270; GFNIAVX2-NEXT:    leal (%rax,%rcx,2), %eax
271; GFNIAVX2-NEXT:    # kill: def $ax killed $ax killed $eax
272; GFNIAVX2-NEXT:    retq
273;
274; GFNIAVX512F-LABEL: test_bitreverse_i16:
275; GFNIAVX512F:       # %bb.0:
276; GFNIAVX512F-NEXT:    # kill: def $edi killed $edi def $rdi
277; GFNIAVX512F-NEXT:    rolw $8, %di
278; GFNIAVX512F-NEXT:    movl %edi, %eax
279; GFNIAVX512F-NEXT:    andl $3855, %eax # imm = 0xF0F
280; GFNIAVX512F-NEXT:    shll $4, %eax
281; GFNIAVX512F-NEXT:    shrl $4, %edi
282; GFNIAVX512F-NEXT:    andl $3855, %edi # imm = 0xF0F
283; GFNIAVX512F-NEXT:    orl %eax, %edi
284; GFNIAVX512F-NEXT:    movl %edi, %eax
285; GFNIAVX512F-NEXT:    andl $13107, %eax # imm = 0x3333
286; GFNIAVX512F-NEXT:    shrl $2, %edi
287; GFNIAVX512F-NEXT:    andl $13107, %edi # imm = 0x3333
288; GFNIAVX512F-NEXT:    leal (%rdi,%rax,4), %eax
289; GFNIAVX512F-NEXT:    movl %eax, %ecx
290; GFNIAVX512F-NEXT:    andl $21845, %ecx # imm = 0x5555
291; GFNIAVX512F-NEXT:    shrl %eax
292; GFNIAVX512F-NEXT:    andl $21845, %eax # imm = 0x5555
293; GFNIAVX512F-NEXT:    leal (%rax,%rcx,2), %eax
294; GFNIAVX512F-NEXT:    # kill: def $ax killed $ax killed $eax
295; GFNIAVX512F-NEXT:    retq
296;
297; GFNIAVX512BW-LABEL: test_bitreverse_i16:
298; GFNIAVX512BW:       # %bb.0:
299; GFNIAVX512BW-NEXT:    # kill: def $edi killed $edi def $rdi
300; GFNIAVX512BW-NEXT:    rolw $8, %di
301; GFNIAVX512BW-NEXT:    movl %edi, %eax
302; GFNIAVX512BW-NEXT:    andl $3855, %eax # imm = 0xF0F
303; GFNIAVX512BW-NEXT:    shll $4, %eax
304; GFNIAVX512BW-NEXT:    shrl $4, %edi
305; GFNIAVX512BW-NEXT:    andl $3855, %edi # imm = 0xF0F
306; GFNIAVX512BW-NEXT:    orl %eax, %edi
307; GFNIAVX512BW-NEXT:    movl %edi, %eax
308; GFNIAVX512BW-NEXT:    andl $13107, %eax # imm = 0x3333
309; GFNIAVX512BW-NEXT:    shrl $2, %edi
310; GFNIAVX512BW-NEXT:    andl $13107, %edi # imm = 0x3333
311; GFNIAVX512BW-NEXT:    leal (%rdi,%rax,4), %eax
312; GFNIAVX512BW-NEXT:    movl %eax, %ecx
313; GFNIAVX512BW-NEXT:    andl $21845, %ecx # imm = 0x5555
314; GFNIAVX512BW-NEXT:    shrl %eax
315; GFNIAVX512BW-NEXT:    andl $21845, %eax # imm = 0x5555
316; GFNIAVX512BW-NEXT:    leal (%rax,%rcx,2), %eax
317; GFNIAVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
318; GFNIAVX512BW-NEXT:    retq
319  %b = call i16 @llvm.bitreverse.i16(i16 %a)
320  ret i16 %b
321}
322
323define i32 @test_bitreverse_i32(i32 %a) nounwind {
324; SSE-LABEL: test_bitreverse_i32:
325; SSE:       # %bb.0:
326; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
327; SSE-NEXT:    bswapl %edi
328; SSE-NEXT:    movl %edi, %eax
329; SSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
330; SSE-NEXT:    shll $4, %eax
331; SSE-NEXT:    shrl $4, %edi
332; SSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
333; SSE-NEXT:    orl %eax, %edi
334; SSE-NEXT:    movl %edi, %eax
335; SSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
336; SSE-NEXT:    shrl $2, %edi
337; SSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
338; SSE-NEXT:    leal (%rdi,%rax,4), %eax
339; SSE-NEXT:    movl %eax, %ecx
340; SSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
341; SSE-NEXT:    shrl %eax
342; SSE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
343; SSE-NEXT:    leal (%rax,%rcx,2), %eax
344; SSE-NEXT:    retq
345;
346; AVX-LABEL: test_bitreverse_i32:
347; AVX:       # %bb.0:
348; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
349; AVX-NEXT:    bswapl %edi
350; AVX-NEXT:    movl %edi, %eax
351; AVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
352; AVX-NEXT:    shll $4, %eax
353; AVX-NEXT:    shrl $4, %edi
354; AVX-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
355; AVX-NEXT:    orl %eax, %edi
356; AVX-NEXT:    movl %edi, %eax
357; AVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
358; AVX-NEXT:    shrl $2, %edi
359; AVX-NEXT:    andl $858993459, %edi # imm = 0x33333333
360; AVX-NEXT:    leal (%rdi,%rax,4), %eax
361; AVX-NEXT:    movl %eax, %ecx
362; AVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
363; AVX-NEXT:    shrl %eax
364; AVX-NEXT:    andl $1431655765, %eax # imm = 0x55555555
365; AVX-NEXT:    leal (%rax,%rcx,2), %eax
366; AVX-NEXT:    retq
367;
368; XOP-LABEL: test_bitreverse_i32:
369; XOP:       # %bb.0:
370; XOP-NEXT:    vmovd %edi, %xmm0
371; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
372; XOP-NEXT:    vmovd %xmm0, %eax
373; XOP-NEXT:    retq
374;
375; GFNISSE-LABEL: test_bitreverse_i32:
376; GFNISSE:       # %bb.0:
377; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
378; GFNISSE-NEXT:    bswapl %edi
379; GFNISSE-NEXT:    movl %edi, %eax
380; GFNISSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
381; GFNISSE-NEXT:    shll $4, %eax
382; GFNISSE-NEXT:    shrl $4, %edi
383; GFNISSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
384; GFNISSE-NEXT:    orl %eax, %edi
385; GFNISSE-NEXT:    movl %edi, %eax
386; GFNISSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
387; GFNISSE-NEXT:    shrl $2, %edi
388; GFNISSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
389; GFNISSE-NEXT:    leal (%rdi,%rax,4), %eax
390; GFNISSE-NEXT:    movl %eax, %ecx
391; GFNISSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
392; GFNISSE-NEXT:    shrl %eax
393; GFNISSE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
394; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
395; GFNISSE-NEXT:    retq
396;
397; GFNIAVX-LABEL: test_bitreverse_i32:
398; GFNIAVX:       # %bb.0:
399; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
400; GFNIAVX-NEXT:    bswapl %edi
401; GFNIAVX-NEXT:    movl %edi, %eax
402; GFNIAVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
403; GFNIAVX-NEXT:    shll $4, %eax
404; GFNIAVX-NEXT:    shrl $4, %edi
405; GFNIAVX-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
406; GFNIAVX-NEXT:    orl %eax, %edi
407; GFNIAVX-NEXT:    movl %edi, %eax
408; GFNIAVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
409; GFNIAVX-NEXT:    shrl $2, %edi
410; GFNIAVX-NEXT:    andl $858993459, %edi # imm = 0x33333333
411; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
412; GFNIAVX-NEXT:    movl %eax, %ecx
413; GFNIAVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
414; GFNIAVX-NEXT:    shrl %eax
415; GFNIAVX-NEXT:    andl $1431655765, %eax # imm = 0x55555555
416; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
417; GFNIAVX-NEXT:    retq
418;
419; GFNIAVX2-LABEL: test_bitreverse_i32:
420; GFNIAVX2:       # %bb.0:
421; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
422; GFNIAVX2-NEXT:    bswapl %edi
423; GFNIAVX2-NEXT:    movl %edi, %eax
424; GFNIAVX2-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
425; GFNIAVX2-NEXT:    shll $4, %eax
426; GFNIAVX2-NEXT:    shrl $4, %edi
427; GFNIAVX2-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
428; GFNIAVX2-NEXT:    orl %eax, %edi
429; GFNIAVX2-NEXT:    movl %edi, %eax
430; GFNIAVX2-NEXT:    andl $858993459, %eax # imm = 0x33333333
431; GFNIAVX2-NEXT:    shrl $2, %edi
432; GFNIAVX2-NEXT:    andl $858993459, %edi # imm = 0x33333333
433; GFNIAVX2-NEXT:    leal (%rdi,%rax,4), %eax
434; GFNIAVX2-NEXT:    movl %eax, %ecx
435; GFNIAVX2-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
436; GFNIAVX2-NEXT:    shrl %eax
437; GFNIAVX2-NEXT:    andl $1431655765, %eax # imm = 0x55555555
438; GFNIAVX2-NEXT:    leal (%rax,%rcx,2), %eax
439; GFNIAVX2-NEXT:    retq
440;
441; GFNIAVX512F-LABEL: test_bitreverse_i32:
442; GFNIAVX512F:       # %bb.0:
443; GFNIAVX512F-NEXT:    # kill: def $edi killed $edi def $rdi
444; GFNIAVX512F-NEXT:    bswapl %edi
445; GFNIAVX512F-NEXT:    movl %edi, %eax
446; GFNIAVX512F-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
447; GFNIAVX512F-NEXT:    shll $4, %eax
448; GFNIAVX512F-NEXT:    shrl $4, %edi
449; GFNIAVX512F-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
450; GFNIAVX512F-NEXT:    orl %eax, %edi
451; GFNIAVX512F-NEXT:    movl %edi, %eax
452; GFNIAVX512F-NEXT:    andl $858993459, %eax # imm = 0x33333333
453; GFNIAVX512F-NEXT:    shrl $2, %edi
454; GFNIAVX512F-NEXT:    andl $858993459, %edi # imm = 0x33333333
455; GFNIAVX512F-NEXT:    leal (%rdi,%rax,4), %eax
456; GFNIAVX512F-NEXT:    movl %eax, %ecx
457; GFNIAVX512F-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
458; GFNIAVX512F-NEXT:    shrl %eax
459; GFNIAVX512F-NEXT:    andl $1431655765, %eax # imm = 0x55555555
460; GFNIAVX512F-NEXT:    leal (%rax,%rcx,2), %eax
461; GFNIAVX512F-NEXT:    retq
462;
463; GFNIAVX512BW-LABEL: test_bitreverse_i32:
464; GFNIAVX512BW:       # %bb.0:
465; GFNIAVX512BW-NEXT:    # kill: def $edi killed $edi def $rdi
466; GFNIAVX512BW-NEXT:    bswapl %edi
467; GFNIAVX512BW-NEXT:    movl %edi, %eax
468; GFNIAVX512BW-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
469; GFNIAVX512BW-NEXT:    shll $4, %eax
470; GFNIAVX512BW-NEXT:    shrl $4, %edi
471; GFNIAVX512BW-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
472; GFNIAVX512BW-NEXT:    orl %eax, %edi
473; GFNIAVX512BW-NEXT:    movl %edi, %eax
474; GFNIAVX512BW-NEXT:    andl $858993459, %eax # imm = 0x33333333
475; GFNIAVX512BW-NEXT:    shrl $2, %edi
476; GFNIAVX512BW-NEXT:    andl $858993459, %edi # imm = 0x33333333
477; GFNIAVX512BW-NEXT:    leal (%rdi,%rax,4), %eax
478; GFNIAVX512BW-NEXT:    movl %eax, %ecx
479; GFNIAVX512BW-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
480; GFNIAVX512BW-NEXT:    shrl %eax
481; GFNIAVX512BW-NEXT:    andl $1431655765, %eax # imm = 0x55555555
482; GFNIAVX512BW-NEXT:    leal (%rax,%rcx,2), %eax
483; GFNIAVX512BW-NEXT:    retq
484  %b = call i32 @llvm.bitreverse.i32(i32 %a)
485  ret i32 %b
486}
487
488define i64 @test_bitreverse_i64(i64 %a) nounwind {
489; SSE-LABEL: test_bitreverse_i64:
490; SSE:       # %bb.0:
491; SSE-NEXT:    bswapq %rdi
492; SSE-NEXT:    movq %rdi, %rax
493; SSE-NEXT:    shrq $4, %rax
494; SSE-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
495; SSE-NEXT:    andq %rcx, %rax
496; SSE-NEXT:    andq %rcx, %rdi
497; SSE-NEXT:    shlq $4, %rdi
498; SSE-NEXT:    orq %rax, %rdi
499; SSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
500; SSE-NEXT:    movq %rdi, %rcx
501; SSE-NEXT:    andq %rax, %rcx
502; SSE-NEXT:    shrq $2, %rdi
503; SSE-NEXT:    andq %rax, %rdi
504; SSE-NEXT:    leaq (%rdi,%rcx,4), %rax
505; SSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
506; SSE-NEXT:    movq %rax, %rdx
507; SSE-NEXT:    andq %rcx, %rdx
508; SSE-NEXT:    shrq %rax
509; SSE-NEXT:    andq %rcx, %rax
510; SSE-NEXT:    leaq (%rax,%rdx,2), %rax
511; SSE-NEXT:    retq
512;
513; AVX-LABEL: test_bitreverse_i64:
514; AVX:       # %bb.0:
515; AVX-NEXT:    bswapq %rdi
516; AVX-NEXT:    movq %rdi, %rax
517; AVX-NEXT:    shrq $4, %rax
518; AVX-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
519; AVX-NEXT:    andq %rcx, %rax
520; AVX-NEXT:    andq %rcx, %rdi
521; AVX-NEXT:    shlq $4, %rdi
522; AVX-NEXT:    orq %rax, %rdi
523; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
524; AVX-NEXT:    movq %rdi, %rcx
525; AVX-NEXT:    andq %rax, %rcx
526; AVX-NEXT:    shrq $2, %rdi
527; AVX-NEXT:    andq %rax, %rdi
528; AVX-NEXT:    leaq (%rdi,%rcx,4), %rax
529; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
530; AVX-NEXT:    movq %rax, %rdx
531; AVX-NEXT:    andq %rcx, %rdx
532; AVX-NEXT:    shrq %rax
533; AVX-NEXT:    andq %rcx, %rax
534; AVX-NEXT:    leaq (%rax,%rdx,2), %rax
535; AVX-NEXT:    retq
536;
537; XOP-LABEL: test_bitreverse_i64:
538; XOP:       # %bb.0:
539; XOP-NEXT:    vmovq %rdi, %xmm0
540; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
541; XOP-NEXT:    vmovq %xmm0, %rax
542; XOP-NEXT:    retq
543;
544; GFNISSE-LABEL: test_bitreverse_i64:
545; GFNISSE:       # %bb.0:
546; GFNISSE-NEXT:    bswapq %rdi
547; GFNISSE-NEXT:    movq %rdi, %rax
548; GFNISSE-NEXT:    shrq $4, %rax
549; GFNISSE-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
550; GFNISSE-NEXT:    andq %rcx, %rax
551; GFNISSE-NEXT:    andq %rcx, %rdi
552; GFNISSE-NEXT:    shlq $4, %rdi
553; GFNISSE-NEXT:    orq %rax, %rdi
554; GFNISSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
555; GFNISSE-NEXT:    movq %rdi, %rcx
556; GFNISSE-NEXT:    andq %rax, %rcx
557; GFNISSE-NEXT:    shrq $2, %rdi
558; GFNISSE-NEXT:    andq %rax, %rdi
559; GFNISSE-NEXT:    leaq (%rdi,%rcx,4), %rax
560; GFNISSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
561; GFNISSE-NEXT:    movq %rax, %rdx
562; GFNISSE-NEXT:    andq %rcx, %rdx
563; GFNISSE-NEXT:    shrq %rax
564; GFNISSE-NEXT:    andq %rcx, %rax
565; GFNISSE-NEXT:    leaq (%rax,%rdx,2), %rax
566; GFNISSE-NEXT:    retq
567;
568; GFNIAVX-LABEL: test_bitreverse_i64:
569; GFNIAVX:       # %bb.0:
570; GFNIAVX-NEXT:    bswapq %rdi
571; GFNIAVX-NEXT:    movq %rdi, %rax
572; GFNIAVX-NEXT:    shrq $4, %rax
573; GFNIAVX-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
574; GFNIAVX-NEXT:    andq %rcx, %rax
575; GFNIAVX-NEXT:    andq %rcx, %rdi
576; GFNIAVX-NEXT:    shlq $4, %rdi
577; GFNIAVX-NEXT:    orq %rax, %rdi
578; GFNIAVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
579; GFNIAVX-NEXT:    movq %rdi, %rcx
580; GFNIAVX-NEXT:    andq %rax, %rcx
581; GFNIAVX-NEXT:    shrq $2, %rdi
582; GFNIAVX-NEXT:    andq %rax, %rdi
583; GFNIAVX-NEXT:    leaq (%rdi,%rcx,4), %rax
584; GFNIAVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
585; GFNIAVX-NEXT:    movq %rax, %rdx
586; GFNIAVX-NEXT:    andq %rcx, %rdx
587; GFNIAVX-NEXT:    shrq %rax
588; GFNIAVX-NEXT:    andq %rcx, %rax
589; GFNIAVX-NEXT:    leaq (%rax,%rdx,2), %rax
590; GFNIAVX-NEXT:    retq
591;
592; GFNIAVX2-LABEL: test_bitreverse_i64:
593; GFNIAVX2:       # %bb.0:
594; GFNIAVX2-NEXT:    bswapq %rdi
595; GFNIAVX2-NEXT:    movq %rdi, %rax
596; GFNIAVX2-NEXT:    shrq $4, %rax
597; GFNIAVX2-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
598; GFNIAVX2-NEXT:    andq %rcx, %rax
599; GFNIAVX2-NEXT:    andq %rcx, %rdi
600; GFNIAVX2-NEXT:    shlq $4, %rdi
601; GFNIAVX2-NEXT:    orq %rax, %rdi
602; GFNIAVX2-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
603; GFNIAVX2-NEXT:    movq %rdi, %rcx
604; GFNIAVX2-NEXT:    andq %rax, %rcx
605; GFNIAVX2-NEXT:    shrq $2, %rdi
606; GFNIAVX2-NEXT:    andq %rax, %rdi
607; GFNIAVX2-NEXT:    leaq (%rdi,%rcx,4), %rax
608; GFNIAVX2-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
609; GFNIAVX2-NEXT:    movq %rax, %rdx
610; GFNIAVX2-NEXT:    andq %rcx, %rdx
611; GFNIAVX2-NEXT:    shrq %rax
612; GFNIAVX2-NEXT:    andq %rcx, %rax
613; GFNIAVX2-NEXT:    leaq (%rax,%rdx,2), %rax
614; GFNIAVX2-NEXT:    retq
615;
616; GFNIAVX512F-LABEL: test_bitreverse_i64:
617; GFNIAVX512F:       # %bb.0:
618; GFNIAVX512F-NEXT:    bswapq %rdi
619; GFNIAVX512F-NEXT:    movq %rdi, %rax
620; GFNIAVX512F-NEXT:    shrq $4, %rax
621; GFNIAVX512F-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
622; GFNIAVX512F-NEXT:    andq %rcx, %rax
623; GFNIAVX512F-NEXT:    andq %rcx, %rdi
624; GFNIAVX512F-NEXT:    shlq $4, %rdi
625; GFNIAVX512F-NEXT:    orq %rax, %rdi
626; GFNIAVX512F-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
627; GFNIAVX512F-NEXT:    movq %rdi, %rcx
628; GFNIAVX512F-NEXT:    andq %rax, %rcx
629; GFNIAVX512F-NEXT:    shrq $2, %rdi
630; GFNIAVX512F-NEXT:    andq %rax, %rdi
631; GFNIAVX512F-NEXT:    leaq (%rdi,%rcx,4), %rax
632; GFNIAVX512F-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
633; GFNIAVX512F-NEXT:    movq %rax, %rdx
634; GFNIAVX512F-NEXT:    andq %rcx, %rdx
635; GFNIAVX512F-NEXT:    shrq %rax
636; GFNIAVX512F-NEXT:    andq %rcx, %rax
637; GFNIAVX512F-NEXT:    leaq (%rax,%rdx,2), %rax
638; GFNIAVX512F-NEXT:    retq
639;
640; GFNIAVX512BW-LABEL: test_bitreverse_i64:
641; GFNIAVX512BW:       # %bb.0:
642; GFNIAVX512BW-NEXT:    bswapq %rdi
643; GFNIAVX512BW-NEXT:    movq %rdi, %rax
644; GFNIAVX512BW-NEXT:    shrq $4, %rax
645; GFNIAVX512BW-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
646; GFNIAVX512BW-NEXT:    andq %rcx, %rax
647; GFNIAVX512BW-NEXT:    andq %rcx, %rdi
648; GFNIAVX512BW-NEXT:    shlq $4, %rdi
649; GFNIAVX512BW-NEXT:    orq %rax, %rdi
650; GFNIAVX512BW-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
651; GFNIAVX512BW-NEXT:    movq %rdi, %rcx
652; GFNIAVX512BW-NEXT:    andq %rax, %rcx
653; GFNIAVX512BW-NEXT:    shrq $2, %rdi
654; GFNIAVX512BW-NEXT:    andq %rax, %rdi
655; GFNIAVX512BW-NEXT:    leaq (%rdi,%rcx,4), %rax
656; GFNIAVX512BW-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
657; GFNIAVX512BW-NEXT:    movq %rax, %rdx
658; GFNIAVX512BW-NEXT:    andq %rcx, %rdx
659; GFNIAVX512BW-NEXT:    shrq %rax
660; GFNIAVX512BW-NEXT:    andq %rcx, %rax
661; GFNIAVX512BW-NEXT:    leaq (%rax,%rdx,2), %rax
662; GFNIAVX512BW-NEXT:    retq
663  %b = call i64 @llvm.bitreverse.i64(i64 %a)
664  ret i64 %b
665}
666
667define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
668; SSE2-LABEL: test_bitreverse_v16i8:
669; SSE2:       # %bb.0:
670; SSE2-NEXT:    movdqa %xmm0, %xmm1
671; SSE2-NEXT:    psrlw $4, %xmm1
672; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
673; SSE2-NEXT:    pand %xmm2, %xmm1
674; SSE2-NEXT:    pand %xmm2, %xmm0
675; SSE2-NEXT:    psllw $4, %xmm0
676; SSE2-NEXT:    por %xmm1, %xmm0
677; SSE2-NEXT:    movdqa %xmm0, %xmm1
678; SSE2-NEXT:    psrlw $2, %xmm1
679; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
680; SSE2-NEXT:    pand %xmm2, %xmm1
681; SSE2-NEXT:    pand %xmm2, %xmm0
682; SSE2-NEXT:    psllw $2, %xmm0
683; SSE2-NEXT:    por %xmm1, %xmm0
684; SSE2-NEXT:    movdqa %xmm0, %xmm1
685; SSE2-NEXT:    psrlw $1, %xmm1
686; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
687; SSE2-NEXT:    pand %xmm2, %xmm1
688; SSE2-NEXT:    pand %xmm2, %xmm0
689; SSE2-NEXT:    paddb %xmm0, %xmm0
690; SSE2-NEXT:    por %xmm1, %xmm0
691; SSE2-NEXT:    retq
692;
693; SSSE3-LABEL: test_bitreverse_v16i8:
694; SSSE3:       # %bb.0:
695; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
696; SSSE3-NEXT:    movdqa %xmm0, %xmm2
697; SSSE3-NEXT:    pand %xmm1, %xmm2
698; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
699; SSSE3-NEXT:    pshufb %xmm2, %xmm3
700; SSSE3-NEXT:    psrlw $4, %xmm0
701; SSSE3-NEXT:    pand %xmm1, %xmm0
702; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
703; SSSE3-NEXT:    pshufb %xmm0, %xmm1
704; SSSE3-NEXT:    por %xmm3, %xmm1
705; SSSE3-NEXT:    movdqa %xmm1, %xmm0
706; SSSE3-NEXT:    retq
707;
708; AVX-LABEL: test_bitreverse_v16i8:
709; AVX:       # %bb.0:
710; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
711; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
712; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
713; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
714; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
715; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
716; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
717; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
718; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
719; AVX-NEXT:    retq
720;
721; XOP-LABEL: test_bitreverse_v16i8:
722; XOP:       # %bb.0:
723; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
724; XOP-NEXT:    retq
725;
726; GFNISSE-LABEL: test_bitreverse_v16i8:
727; GFNISSE:       # %bb.0:
728; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
729; GFNISSE-NEXT:    retq
730;
731; GFNIAVX-LABEL: test_bitreverse_v16i8:
732; GFNIAVX:       # %bb.0:
733; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
734; GFNIAVX-NEXT:    retq
735;
736; GFNIAVX2-LABEL: test_bitreverse_v16i8:
737; GFNIAVX2:       # %bb.0:
738; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
739; GFNIAVX2-NEXT:    retq
740;
741; GFNIAVX512F-LABEL: test_bitreverse_v16i8:
742; GFNIAVX512F:       # %bb.0:
743; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
744; GFNIAVX512F-NEXT:    retq
745;
746; GFNIAVX512BW-LABEL: test_bitreverse_v16i8:
747; GFNIAVX512BW:       # %bb.0:
748; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
749; GFNIAVX512BW-NEXT:    retq
750  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
751  ret <16 x i8> %b
752}
753
754define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
755; SSE2-LABEL: test_bitreverse_v8i16:
756; SSE2:       # %bb.0:
757; SSE2-NEXT:    movdqa %xmm0, %xmm1
758; SSE2-NEXT:    psrlw $8, %xmm1
759; SSE2-NEXT:    psllw $8, %xmm0
760; SSE2-NEXT:    por %xmm1, %xmm0
761; SSE2-NEXT:    movdqa %xmm0, %xmm1
762; SSE2-NEXT:    psrlw $4, %xmm1
763; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
764; SSE2-NEXT:    pand %xmm2, %xmm1
765; SSE2-NEXT:    pand %xmm2, %xmm0
766; SSE2-NEXT:    psllw $4, %xmm0
767; SSE2-NEXT:    por %xmm1, %xmm0
768; SSE2-NEXT:    movdqa %xmm0, %xmm1
769; SSE2-NEXT:    psrlw $2, %xmm1
770; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
771; SSE2-NEXT:    pand %xmm2, %xmm1
772; SSE2-NEXT:    pand %xmm2, %xmm0
773; SSE2-NEXT:    psllw $2, %xmm0
774; SSE2-NEXT:    por %xmm1, %xmm0
775; SSE2-NEXT:    movdqa %xmm0, %xmm1
776; SSE2-NEXT:    psrlw $1, %xmm1
777; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
778; SSE2-NEXT:    pand %xmm2, %xmm1
779; SSE2-NEXT:    pand %xmm2, %xmm0
780; SSE2-NEXT:    paddb %xmm0, %xmm0
781; SSE2-NEXT:    por %xmm1, %xmm0
782; SSE2-NEXT:    retq
783;
784; SSSE3-LABEL: test_bitreverse_v8i16:
785; SSSE3:       # %bb.0:
786; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
787; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
788; SSSE3-NEXT:    movdqa %xmm0, %xmm2
789; SSSE3-NEXT:    pand %xmm1, %xmm2
790; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
791; SSSE3-NEXT:    pshufb %xmm2, %xmm3
792; SSSE3-NEXT:    psrlw $4, %xmm0
793; SSSE3-NEXT:    pand %xmm1, %xmm0
794; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
795; SSSE3-NEXT:    pshufb %xmm0, %xmm1
796; SSSE3-NEXT:    por %xmm3, %xmm1
797; SSSE3-NEXT:    movdqa %xmm1, %xmm0
798; SSSE3-NEXT:    retq
799;
800; AVX-LABEL: test_bitreverse_v8i16:
801; AVX:       # %bb.0:
802; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
803; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
804; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
805; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
806; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
807; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
808; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
809; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
810; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
811; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
812; AVX-NEXT:    retq
813;
814; XOP-LABEL: test_bitreverse_v8i16:
815; XOP:       # %bb.0:
816; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
817; XOP-NEXT:    retq
818;
819; GFNISSE-LABEL: test_bitreverse_v8i16:
820; GFNISSE:       # %bb.0:
821; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
822; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
823; GFNISSE-NEXT:    retq
824;
825; GFNIAVX-LABEL: test_bitreverse_v8i16:
826; GFNIAVX:       # %bb.0:
827; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
828; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
829; GFNIAVX-NEXT:    retq
830;
831; GFNIAVX2-LABEL: test_bitreverse_v8i16:
832; GFNIAVX2:       # %bb.0:
833; GFNIAVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
834; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
835; GFNIAVX2-NEXT:    retq
836;
837; GFNIAVX512F-LABEL: test_bitreverse_v8i16:
838; GFNIAVX512F:       # %bb.0:
839; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
840; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
841; GFNIAVX512F-NEXT:    retq
842;
843; GFNIAVX512BW-LABEL: test_bitreverse_v8i16:
844; GFNIAVX512BW:       # %bb.0:
845; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
846; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
847; GFNIAVX512BW-NEXT:    retq
848  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
849  ret <8 x i16> %b
850}
851
852define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
853; SSE2-LABEL: test_bitreverse_v4i32:
854; SSE2:       # %bb.0:
855; SSE2-NEXT:    pxor %xmm1, %xmm1
856; SSE2-NEXT:    movdqa %xmm0, %xmm2
857; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
858; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
859; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
860; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
861; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
862; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
863; SSE2-NEXT:    packuswb %xmm2, %xmm0
864; SSE2-NEXT:    movdqa %xmm0, %xmm1
865; SSE2-NEXT:    psrlw $4, %xmm1
866; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
867; SSE2-NEXT:    pand %xmm2, %xmm1
868; SSE2-NEXT:    pand %xmm2, %xmm0
869; SSE2-NEXT:    psllw $4, %xmm0
870; SSE2-NEXT:    por %xmm1, %xmm0
871; SSE2-NEXT:    movdqa %xmm0, %xmm1
872; SSE2-NEXT:    psrlw $2, %xmm1
873; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
874; SSE2-NEXT:    pand %xmm2, %xmm1
875; SSE2-NEXT:    pand %xmm2, %xmm0
876; SSE2-NEXT:    psllw $2, %xmm0
877; SSE2-NEXT:    por %xmm1, %xmm0
878; SSE2-NEXT:    movdqa %xmm0, %xmm1
879; SSE2-NEXT:    psrlw $1, %xmm1
880; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
881; SSE2-NEXT:    pand %xmm2, %xmm1
882; SSE2-NEXT:    pand %xmm2, %xmm0
883; SSE2-NEXT:    paddb %xmm0, %xmm0
884; SSE2-NEXT:    por %xmm1, %xmm0
885; SSE2-NEXT:    retq
886;
887; SSSE3-LABEL: test_bitreverse_v4i32:
888; SSSE3:       # %bb.0:
889; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
890; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
891; SSSE3-NEXT:    movdqa %xmm0, %xmm2
892; SSSE3-NEXT:    pand %xmm1, %xmm2
893; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
894; SSSE3-NEXT:    pshufb %xmm2, %xmm3
895; SSSE3-NEXT:    psrlw $4, %xmm0
896; SSSE3-NEXT:    pand %xmm1, %xmm0
897; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
898; SSSE3-NEXT:    pshufb %xmm0, %xmm1
899; SSSE3-NEXT:    por %xmm3, %xmm1
900; SSSE3-NEXT:    movdqa %xmm1, %xmm0
901; SSSE3-NEXT:    retq
902;
903; AVX-LABEL: test_bitreverse_v4i32:
904; AVX:       # %bb.0:
905; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
906; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
907; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
908; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
909; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
910; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
911; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
912; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
913; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
914; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
915; AVX-NEXT:    retq
916;
917; XOP-LABEL: test_bitreverse_v4i32:
918; XOP:       # %bb.0:
919; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
920; XOP-NEXT:    retq
921;
922; GFNISSE-LABEL: test_bitreverse_v4i32:
923; GFNISSE:       # %bb.0:
924; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
925; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
926; GFNISSE-NEXT:    retq
927;
928; GFNIAVX-LABEL: test_bitreverse_v4i32:
929; GFNIAVX:       # %bb.0:
930; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
931; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
932; GFNIAVX-NEXT:    retq
933;
934; GFNIAVX2-LABEL: test_bitreverse_v4i32:
935; GFNIAVX2:       # %bb.0:
936; GFNIAVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
937; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
938; GFNIAVX2-NEXT:    retq
939;
940; GFNIAVX512F-LABEL: test_bitreverse_v4i32:
941; GFNIAVX512F:       # %bb.0:
942; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
943; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
944; GFNIAVX512F-NEXT:    retq
945;
946; GFNIAVX512BW-LABEL: test_bitreverse_v4i32:
947; GFNIAVX512BW:       # %bb.0:
948; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
949; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
950; GFNIAVX512BW-NEXT:    retq
951  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
952  ret <4 x i32> %b
953}
954
955define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
956; SSE2-LABEL: test_bitreverse_v2i64:
957; SSE2:       # %bb.0:
958; SSE2-NEXT:    pxor %xmm1, %xmm1
959; SSE2-NEXT:    movdqa %xmm0, %xmm2
960; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
961; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
962; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
963; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
964; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
965; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
966; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
967; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
968; SSE2-NEXT:    packuswb %xmm2, %xmm0
969; SSE2-NEXT:    movdqa %xmm0, %xmm1
970; SSE2-NEXT:    psrlw $4, %xmm1
971; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
972; SSE2-NEXT:    pand %xmm2, %xmm1
973; SSE2-NEXT:    pand %xmm2, %xmm0
974; SSE2-NEXT:    psllw $4, %xmm0
975; SSE2-NEXT:    por %xmm1, %xmm0
976; SSE2-NEXT:    movdqa %xmm0, %xmm1
977; SSE2-NEXT:    psrlw $2, %xmm1
978; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
979; SSE2-NEXT:    pand %xmm2, %xmm1
980; SSE2-NEXT:    pand %xmm2, %xmm0
981; SSE2-NEXT:    psllw $2, %xmm0
982; SSE2-NEXT:    por %xmm1, %xmm0
983; SSE2-NEXT:    movdqa %xmm0, %xmm1
984; SSE2-NEXT:    psrlw $1, %xmm1
985; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
986; SSE2-NEXT:    pand %xmm2, %xmm1
987; SSE2-NEXT:    pand %xmm2, %xmm0
988; SSE2-NEXT:    paddb %xmm0, %xmm0
989; SSE2-NEXT:    por %xmm1, %xmm0
990; SSE2-NEXT:    retq
991;
992; SSSE3-LABEL: test_bitreverse_v2i64:
993; SSSE3:       # %bb.0:
994; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
995; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
996; SSSE3-NEXT:    movdqa %xmm0, %xmm2
997; SSSE3-NEXT:    pand %xmm1, %xmm2
998; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
999; SSSE3-NEXT:    pshufb %xmm2, %xmm3
1000; SSSE3-NEXT:    psrlw $4, %xmm0
1001; SSSE3-NEXT:    pand %xmm1, %xmm0
1002; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1003; SSSE3-NEXT:    pshufb %xmm0, %xmm1
1004; SSSE3-NEXT:    por %xmm3, %xmm1
1005; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1006; SSSE3-NEXT:    retq
1007;
1008; AVX-LABEL: test_bitreverse_v2i64:
1009; AVX:       # %bb.0:
1010; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1011; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1012; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
1013; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1014; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1015; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
1016; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
1017; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1018; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
1019; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
1020; AVX-NEXT:    retq
1021;
1022; XOP-LABEL: test_bitreverse_v2i64:
1023; XOP:       # %bb.0:
1024; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
1025; XOP-NEXT:    retq
1026;
1027; GFNISSE-LABEL: test_bitreverse_v2i64:
1028; GFNISSE:       # %bb.0:
1029; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1030; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1031; GFNISSE-NEXT:    retq
1032;
1033; GFNIAVX-LABEL: test_bitreverse_v2i64:
1034; GFNIAVX:       # %bb.0:
1035; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1036; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1037; GFNIAVX-NEXT:    retq
1038;
1039; GFNIAVX2-LABEL: test_bitreverse_v2i64:
1040; GFNIAVX2:       # %bb.0:
1041; GFNIAVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1042; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1043; GFNIAVX2-NEXT:    retq
1044;
1045; GFNIAVX512F-LABEL: test_bitreverse_v2i64:
1046; GFNIAVX512F:       # %bb.0:
1047; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1048; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1049; GFNIAVX512F-NEXT:    retq
1050;
1051; GFNIAVX512BW-LABEL: test_bitreverse_v2i64:
1052; GFNIAVX512BW:       # %bb.0:
1053; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1054; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1055; GFNIAVX512BW-NEXT:    retq
1056  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
1057  ret <2 x i64> %b
1058}
1059
1060define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
1061; SSE2-LABEL: test_bitreverse_v32i8:
1062; SSE2:       # %bb.0:
1063; SSE2-NEXT:    movdqa %xmm0, %xmm3
1064; SSE2-NEXT:    psrlw $4, %xmm3
1065; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1066; SSE2-NEXT:    pand %xmm2, %xmm3
1067; SSE2-NEXT:    pand %xmm2, %xmm0
1068; SSE2-NEXT:    psllw $4, %xmm0
1069; SSE2-NEXT:    por %xmm3, %xmm0
1070; SSE2-NEXT:    movdqa %xmm0, %xmm4
1071; SSE2-NEXT:    psrlw $2, %xmm4
1072; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1073; SSE2-NEXT:    pand %xmm3, %xmm4
1074; SSE2-NEXT:    pand %xmm3, %xmm0
1075; SSE2-NEXT:    psllw $2, %xmm0
1076; SSE2-NEXT:    por %xmm4, %xmm0
1077; SSE2-NEXT:    movdqa %xmm0, %xmm5
1078; SSE2-NEXT:    psrlw $1, %xmm5
1079; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1080; SSE2-NEXT:    pand %xmm4, %xmm5
1081; SSE2-NEXT:    pand %xmm4, %xmm0
1082; SSE2-NEXT:    paddb %xmm0, %xmm0
1083; SSE2-NEXT:    por %xmm5, %xmm0
1084; SSE2-NEXT:    movdqa %xmm1, %xmm5
1085; SSE2-NEXT:    psrlw $4, %xmm5
1086; SSE2-NEXT:    pand %xmm2, %xmm5
1087; SSE2-NEXT:    pand %xmm2, %xmm1
1088; SSE2-NEXT:    psllw $4, %xmm1
1089; SSE2-NEXT:    por %xmm5, %xmm1
1090; SSE2-NEXT:    movdqa %xmm1, %xmm2
1091; SSE2-NEXT:    psrlw $2, %xmm2
1092; SSE2-NEXT:    pand %xmm3, %xmm2
1093; SSE2-NEXT:    pand %xmm3, %xmm1
1094; SSE2-NEXT:    psllw $2, %xmm1
1095; SSE2-NEXT:    por %xmm2, %xmm1
1096; SSE2-NEXT:    movdqa %xmm1, %xmm2
1097; SSE2-NEXT:    psrlw $1, %xmm2
1098; SSE2-NEXT:    pand %xmm4, %xmm2
1099; SSE2-NEXT:    pand %xmm4, %xmm1
1100; SSE2-NEXT:    paddb %xmm1, %xmm1
1101; SSE2-NEXT:    por %xmm2, %xmm1
1102; SSE2-NEXT:    retq
1103;
1104; SSSE3-LABEL: test_bitreverse_v32i8:
1105; SSSE3:       # %bb.0:
1106; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1107; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1108; SSSE3-NEXT:    pand %xmm4, %xmm2
1109; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1110; SSSE3-NEXT:    movdqa %xmm5, %xmm6
1111; SSSE3-NEXT:    pshufb %xmm2, %xmm6
1112; SSSE3-NEXT:    psrlw $4, %xmm0
1113; SSSE3-NEXT:    pand %xmm4, %xmm0
1114; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1115; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1116; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1117; SSSE3-NEXT:    por %xmm6, %xmm3
1118; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1119; SSSE3-NEXT:    pand %xmm4, %xmm0
1120; SSSE3-NEXT:    pshufb %xmm0, %xmm5
1121; SSSE3-NEXT:    psrlw $4, %xmm1
1122; SSSE3-NEXT:    pand %xmm4, %xmm1
1123; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1124; SSSE3-NEXT:    por %xmm5, %xmm2
1125; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1126; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1127; SSSE3-NEXT:    retq
1128;
1129; AVX1-LABEL: test_bitreverse_v32i8:
1130; AVX1:       # %bb.0:
1131; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1132; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1133; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
1134; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1135; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
1136; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1137; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
1138; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1139; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
1140; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
1141; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
1142; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
1143; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1144; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
1145; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
1146; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
1147; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1148; AVX1-NEXT:    retq
1149;
1150; AVX2-LABEL: test_bitreverse_v32i8:
1151; AVX2:       # %bb.0:
1152; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1153; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1154; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1155; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1156; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1157; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1158; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1159; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1160; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1161; AVX2-NEXT:    retq
1162;
1163; AVX512-LABEL: test_bitreverse_v32i8:
1164; AVX512:       # %bb.0:
1165; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1166; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1167; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1168; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1169; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1170; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1171; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1172; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1173; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1174; AVX512-NEXT:    retq
1175;
1176; XOPAVX1-LABEL: test_bitreverse_v32i8:
1177; XOPAVX1:       # %bb.0:
1178; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1179; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1180; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1181; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1182; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1183; XOPAVX1-NEXT:    retq
1184;
1185; XOPAVX2-LABEL: test_bitreverse_v32i8:
1186; XOPAVX2:       # %bb.0:
1187; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1188; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1189; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1190; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1191; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1192; XOPAVX2-NEXT:    retq
1193;
1194; GFNISSE-LABEL: test_bitreverse_v32i8:
1195; GFNISSE:       # %bb.0:
1196; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
1197; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
1198; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
1199; GFNISSE-NEXT:    retq
1200;
1201; GFNIAVX-LABEL: test_bitreverse_v32i8:
1202; GFNIAVX:       # %bb.0:
1203; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
1204; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
1205; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm2, %xmm1, %xmm1
1206; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm2, %xmm0, %xmm0
1207; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1208; GFNIAVX-NEXT:    retq
1209;
1210; GFNIAVX2-LABEL: test_bitreverse_v32i8:
1211; GFNIAVX2:       # %bb.0:
1212; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1213; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1214; GFNIAVX2-NEXT:    retq
1215;
1216; GFNIAVX512F-LABEL: test_bitreverse_v32i8:
1217; GFNIAVX512F:       # %bb.0:
1218; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1219; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1220; GFNIAVX512F-NEXT:    retq
1221;
1222; GFNIAVX512BW-LABEL: test_bitreverse_v32i8:
1223; GFNIAVX512BW:       # %bb.0:
1224; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1225; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1226; GFNIAVX512BW-NEXT:    retq
1227  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
1228  ret <32 x i8> %b
1229}
1230
1231define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
1232; SSE2-LABEL: test_bitreverse_v16i16:
1233; SSE2:       # %bb.0:
1234; SSE2-NEXT:    movdqa %xmm0, %xmm2
1235; SSE2-NEXT:    psrlw $8, %xmm2
1236; SSE2-NEXT:    psllw $8, %xmm0
1237; SSE2-NEXT:    por %xmm2, %xmm0
1238; SSE2-NEXT:    movdqa %xmm0, %xmm3
1239; SSE2-NEXT:    psrlw $4, %xmm3
1240; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1241; SSE2-NEXT:    pand %xmm2, %xmm3
1242; SSE2-NEXT:    pand %xmm2, %xmm0
1243; SSE2-NEXT:    psllw $4, %xmm0
1244; SSE2-NEXT:    por %xmm3, %xmm0
1245; SSE2-NEXT:    movdqa %xmm0, %xmm4
1246; SSE2-NEXT:    psrlw $2, %xmm4
1247; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1248; SSE2-NEXT:    pand %xmm3, %xmm4
1249; SSE2-NEXT:    pand %xmm3, %xmm0
1250; SSE2-NEXT:    psllw $2, %xmm0
1251; SSE2-NEXT:    por %xmm4, %xmm0
1252; SSE2-NEXT:    movdqa %xmm0, %xmm5
1253; SSE2-NEXT:    psrlw $1, %xmm5
1254; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1255; SSE2-NEXT:    pand %xmm4, %xmm5
1256; SSE2-NEXT:    pand %xmm4, %xmm0
1257; SSE2-NEXT:    paddb %xmm0, %xmm0
1258; SSE2-NEXT:    por %xmm5, %xmm0
1259; SSE2-NEXT:    movdqa %xmm1, %xmm5
1260; SSE2-NEXT:    psrlw $8, %xmm5
1261; SSE2-NEXT:    psllw $8, %xmm1
1262; SSE2-NEXT:    por %xmm5, %xmm1
1263; SSE2-NEXT:    movdqa %xmm1, %xmm5
1264; SSE2-NEXT:    psrlw $4, %xmm5
1265; SSE2-NEXT:    pand %xmm2, %xmm5
1266; SSE2-NEXT:    pand %xmm2, %xmm1
1267; SSE2-NEXT:    psllw $4, %xmm1
1268; SSE2-NEXT:    por %xmm5, %xmm1
1269; SSE2-NEXT:    movdqa %xmm1, %xmm2
1270; SSE2-NEXT:    psrlw $2, %xmm2
1271; SSE2-NEXT:    pand %xmm3, %xmm2
1272; SSE2-NEXT:    pand %xmm3, %xmm1
1273; SSE2-NEXT:    psllw $2, %xmm1
1274; SSE2-NEXT:    por %xmm2, %xmm1
1275; SSE2-NEXT:    movdqa %xmm1, %xmm2
1276; SSE2-NEXT:    psrlw $1, %xmm2
1277; SSE2-NEXT:    pand %xmm4, %xmm2
1278; SSE2-NEXT:    pand %xmm4, %xmm1
1279; SSE2-NEXT:    paddb %xmm1, %xmm1
1280; SSE2-NEXT:    por %xmm2, %xmm1
1281; SSE2-NEXT:    retq
1282;
1283; SSSE3-LABEL: test_bitreverse_v16i16:
1284; SSSE3:       # %bb.0:
1285; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1286; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1287; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1288; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1289; SSSE3-NEXT:    pand %xmm5, %xmm2
1290; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1291; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1292; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1293; SSSE3-NEXT:    psrlw $4, %xmm0
1294; SSSE3-NEXT:    pand %xmm5, %xmm0
1295; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1296; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1297; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1298; SSSE3-NEXT:    por %xmm7, %xmm3
1299; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1300; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1301; SSSE3-NEXT:    pand %xmm5, %xmm0
1302; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1303; SSSE3-NEXT:    psrlw $4, %xmm1
1304; SSSE3-NEXT:    pand %xmm5, %xmm1
1305; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1306; SSSE3-NEXT:    por %xmm6, %xmm2
1307; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1308; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1309; SSSE3-NEXT:    retq
1310;
1311; AVX1-LABEL: test_bitreverse_v16i16:
1312; AVX1:       # %bb.0:
1313; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1314; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1315; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1316; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1317; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1318; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1319; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1320; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1321; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1322; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1323; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1324; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1325; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1326; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1327; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1328; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1329; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1330; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1331; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1332; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1333; AVX1-NEXT:    retq
1334;
1335; AVX2-LABEL: test_bitreverse_v16i16:
1336; AVX2:       # %bb.0:
1337; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1338; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1339; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1340; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1341; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1342; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1343; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1344; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1345; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1346; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1347; AVX2-NEXT:    retq
1348;
1349; AVX512-LABEL: test_bitreverse_v16i16:
1350; AVX512:       # %bb.0:
1351; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1352; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1353; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1354; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1355; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1356; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1357; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1358; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1359; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1360; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1361; AVX512-NEXT:    retq
1362;
1363; XOPAVX1-LABEL: test_bitreverse_v16i16:
1364; XOPAVX1:       # %bb.0:
1365; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1366; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1367; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1368; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1369; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1370; XOPAVX1-NEXT:    retq
1371;
1372; XOPAVX2-LABEL: test_bitreverse_v16i16:
1373; XOPAVX2:       # %bb.0:
1374; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1375; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1376; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1377; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1378; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1379; XOPAVX2-NEXT:    retq
1380;
1381; GFNISSE-LABEL: test_bitreverse_v16i16:
1382; GFNISSE:       # %bb.0:
1383; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1384; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
1385; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1386; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
1387; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
1388; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
1389; GFNISSE-NEXT:    retq
1390;
1391; GFNIAVX-LABEL: test_bitreverse_v16i16:
1392; GFNIAVX:       # %bb.0:
1393; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
1394; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1395; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1396; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1397; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
1398; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1399; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
1400; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1401; GFNIAVX-NEXT:    retq
1402;
1403; GFNIAVX2-LABEL: test_bitreverse_v16i16:
1404; GFNIAVX2:       # %bb.0:
1405; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1406; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1407; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1408; GFNIAVX2-NEXT:    retq
1409;
1410; GFNIAVX512F-LABEL: test_bitreverse_v16i16:
1411; GFNIAVX512F:       # %bb.0:
1412; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1413; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1414; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1415; GFNIAVX512F-NEXT:    retq
1416;
1417; GFNIAVX512BW-LABEL: test_bitreverse_v16i16:
1418; GFNIAVX512BW:       # %bb.0:
1419; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1420; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1421; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1422; GFNIAVX512BW-NEXT:    retq
1423  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
1424  ret <16 x i16> %b
1425}
1426
1427define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
1428; SSE2-LABEL: test_bitreverse_v8i32:
1429; SSE2:       # %bb.0:
1430; SSE2-NEXT:    pxor %xmm2, %xmm2
1431; SSE2-NEXT:    movdqa %xmm0, %xmm3
1432; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1433; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1434; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
1435; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1436; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1437; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1438; SSE2-NEXT:    packuswb %xmm3, %xmm0
1439; SSE2-NEXT:    movdqa %xmm0, %xmm4
1440; SSE2-NEXT:    psrlw $4, %xmm4
1441; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1442; SSE2-NEXT:    pand %xmm3, %xmm4
1443; SSE2-NEXT:    pand %xmm3, %xmm0
1444; SSE2-NEXT:    psllw $4, %xmm0
1445; SSE2-NEXT:    por %xmm4, %xmm0
1446; SSE2-NEXT:    movdqa %xmm0, %xmm5
1447; SSE2-NEXT:    psrlw $2, %xmm5
1448; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1449; SSE2-NEXT:    pand %xmm4, %xmm5
1450; SSE2-NEXT:    pand %xmm4, %xmm0
1451; SSE2-NEXT:    psllw $2, %xmm0
1452; SSE2-NEXT:    por %xmm5, %xmm0
1453; SSE2-NEXT:    movdqa %xmm0, %xmm6
1454; SSE2-NEXT:    psrlw $1, %xmm6
1455; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1456; SSE2-NEXT:    pand %xmm5, %xmm6
1457; SSE2-NEXT:    pand %xmm5, %xmm0
1458; SSE2-NEXT:    paddb %xmm0, %xmm0
1459; SSE2-NEXT:    por %xmm6, %xmm0
1460; SSE2-NEXT:    movdqa %xmm1, %xmm6
1461; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
1462; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1463; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1464; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1465; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1466; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1467; SSE2-NEXT:    packuswb %xmm6, %xmm1
1468; SSE2-NEXT:    movdqa %xmm1, %xmm2
1469; SSE2-NEXT:    psrlw $4, %xmm2
1470; SSE2-NEXT:    pand %xmm3, %xmm2
1471; SSE2-NEXT:    pand %xmm3, %xmm1
1472; SSE2-NEXT:    psllw $4, %xmm1
1473; SSE2-NEXT:    por %xmm2, %xmm1
1474; SSE2-NEXT:    movdqa %xmm1, %xmm2
1475; SSE2-NEXT:    psrlw $2, %xmm2
1476; SSE2-NEXT:    pand %xmm4, %xmm2
1477; SSE2-NEXT:    pand %xmm4, %xmm1
1478; SSE2-NEXT:    psllw $2, %xmm1
1479; SSE2-NEXT:    por %xmm2, %xmm1
1480; SSE2-NEXT:    movdqa %xmm1, %xmm2
1481; SSE2-NEXT:    psrlw $1, %xmm2
1482; SSE2-NEXT:    pand %xmm5, %xmm2
1483; SSE2-NEXT:    pand %xmm5, %xmm1
1484; SSE2-NEXT:    paddb %xmm1, %xmm1
1485; SSE2-NEXT:    por %xmm2, %xmm1
1486; SSE2-NEXT:    retq
1487;
1488; SSSE3-LABEL: test_bitreverse_v8i32:
1489; SSSE3:       # %bb.0:
1490; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1491; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1492; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1493; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1494; SSSE3-NEXT:    pand %xmm5, %xmm2
1495; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1496; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1497; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1498; SSSE3-NEXT:    psrlw $4, %xmm0
1499; SSSE3-NEXT:    pand %xmm5, %xmm0
1500; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1501; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1502; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1503; SSSE3-NEXT:    por %xmm7, %xmm3
1504; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1505; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1506; SSSE3-NEXT:    pand %xmm5, %xmm0
1507; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1508; SSSE3-NEXT:    psrlw $4, %xmm1
1509; SSSE3-NEXT:    pand %xmm5, %xmm1
1510; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1511; SSSE3-NEXT:    por %xmm6, %xmm2
1512; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1513; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1514; SSSE3-NEXT:    retq
1515;
1516; AVX1-LABEL: test_bitreverse_v8i32:
1517; AVX1:       # %bb.0:
1518; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1519; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1520; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1521; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1522; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1523; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1524; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1525; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1526; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1527; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1528; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1529; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1530; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1531; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1532; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1533; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1534; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1535; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1536; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1537; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1538; AVX1-NEXT:    retq
1539;
1540; AVX2-LABEL: test_bitreverse_v8i32:
1541; AVX2:       # %bb.0:
1542; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1543; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1544; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1545; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1546; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1547; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1548; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1549; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1550; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1551; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1552; AVX2-NEXT:    retq
1553;
1554; AVX512-LABEL: test_bitreverse_v8i32:
1555; AVX512:       # %bb.0:
1556; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1557; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1558; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1559; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1560; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1561; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1562; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1563; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1564; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1565; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1566; AVX512-NEXT:    retq
1567;
1568; XOPAVX1-LABEL: test_bitreverse_v8i32:
1569; XOPAVX1:       # %bb.0:
1570; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1571; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1572; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1573; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1574; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1575; XOPAVX1-NEXT:    retq
1576;
1577; XOPAVX2-LABEL: test_bitreverse_v8i32:
1578; XOPAVX2:       # %bb.0:
1579; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1580; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1581; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1582; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1583; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1584; XOPAVX2-NEXT:    retq
1585;
1586; GFNISSE-LABEL: test_bitreverse_v8i32:
1587; GFNISSE:       # %bb.0:
1588; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1589; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
1590; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1591; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
1592; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
1593; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
1594; GFNISSE-NEXT:    retq
1595;
1596; GFNIAVX-LABEL: test_bitreverse_v8i32:
1597; GFNIAVX:       # %bb.0:
1598; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
1599; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1600; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1601; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1602; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
1603; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1604; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
1605; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1606; GFNIAVX-NEXT:    retq
1607;
1608; GFNIAVX2-LABEL: test_bitreverse_v8i32:
1609; GFNIAVX2:       # %bb.0:
1610; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1611; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1612; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1613; GFNIAVX2-NEXT:    retq
1614;
1615; GFNIAVX512F-LABEL: test_bitreverse_v8i32:
1616; GFNIAVX512F:       # %bb.0:
1617; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1618; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1619; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1620; GFNIAVX512F-NEXT:    retq
1621;
1622; GFNIAVX512BW-LABEL: test_bitreverse_v8i32:
1623; GFNIAVX512BW:       # %bb.0:
1624; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1625; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1626; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1627; GFNIAVX512BW-NEXT:    retq
1628  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
1629  ret <8 x i32> %b
1630}
1631
1632define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
1633; SSE2-LABEL: test_bitreverse_v4i64:
1634; SSE2:       # %bb.0:
1635; SSE2-NEXT:    pxor %xmm2, %xmm2
1636; SSE2-NEXT:    movdqa %xmm0, %xmm3
1637; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1638; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
1639; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1640; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
1641; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1642; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1643; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1644; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1645; SSE2-NEXT:    packuswb %xmm3, %xmm0
1646; SSE2-NEXT:    movdqa %xmm0, %xmm4
1647; SSE2-NEXT:    psrlw $4, %xmm4
1648; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1649; SSE2-NEXT:    pand %xmm3, %xmm4
1650; SSE2-NEXT:    pand %xmm3, %xmm0
1651; SSE2-NEXT:    psllw $4, %xmm0
1652; SSE2-NEXT:    por %xmm4, %xmm0
1653; SSE2-NEXT:    movdqa %xmm0, %xmm5
1654; SSE2-NEXT:    psrlw $2, %xmm5
1655; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1656; SSE2-NEXT:    pand %xmm4, %xmm5
1657; SSE2-NEXT:    pand %xmm4, %xmm0
1658; SSE2-NEXT:    psllw $2, %xmm0
1659; SSE2-NEXT:    por %xmm5, %xmm0
1660; SSE2-NEXT:    movdqa %xmm0, %xmm6
1661; SSE2-NEXT:    psrlw $1, %xmm6
1662; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1663; SSE2-NEXT:    pand %xmm5, %xmm6
1664; SSE2-NEXT:    pand %xmm5, %xmm0
1665; SSE2-NEXT:    paddb %xmm0, %xmm0
1666; SSE2-NEXT:    por %xmm6, %xmm0
1667; SSE2-NEXT:    movdqa %xmm1, %xmm6
1668; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
1669; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
1670; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1671; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1672; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1673; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1674; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1675; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1676; SSE2-NEXT:    packuswb %xmm6, %xmm1
1677; SSE2-NEXT:    movdqa %xmm1, %xmm2
1678; SSE2-NEXT:    psrlw $4, %xmm2
1679; SSE2-NEXT:    pand %xmm3, %xmm2
1680; SSE2-NEXT:    pand %xmm3, %xmm1
1681; SSE2-NEXT:    psllw $4, %xmm1
1682; SSE2-NEXT:    por %xmm2, %xmm1
1683; SSE2-NEXT:    movdqa %xmm1, %xmm2
1684; SSE2-NEXT:    psrlw $2, %xmm2
1685; SSE2-NEXT:    pand %xmm4, %xmm2
1686; SSE2-NEXT:    pand %xmm4, %xmm1
1687; SSE2-NEXT:    psllw $2, %xmm1
1688; SSE2-NEXT:    por %xmm2, %xmm1
1689; SSE2-NEXT:    movdqa %xmm1, %xmm2
1690; SSE2-NEXT:    psrlw $1, %xmm2
1691; SSE2-NEXT:    pand %xmm5, %xmm2
1692; SSE2-NEXT:    pand %xmm5, %xmm1
1693; SSE2-NEXT:    paddb %xmm1, %xmm1
1694; SSE2-NEXT:    por %xmm2, %xmm1
1695; SSE2-NEXT:    retq
1696;
1697; SSSE3-LABEL: test_bitreverse_v4i64:
1698; SSSE3:       # %bb.0:
1699; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1700; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1701; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1702; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1703; SSSE3-NEXT:    pand %xmm5, %xmm2
1704; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1705; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1706; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1707; SSSE3-NEXT:    psrlw $4, %xmm0
1708; SSSE3-NEXT:    pand %xmm5, %xmm0
1709; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1710; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1711; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1712; SSSE3-NEXT:    por %xmm7, %xmm3
1713; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1714; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1715; SSSE3-NEXT:    pand %xmm5, %xmm0
1716; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1717; SSSE3-NEXT:    psrlw $4, %xmm1
1718; SSSE3-NEXT:    pand %xmm5, %xmm1
1719; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1720; SSSE3-NEXT:    por %xmm6, %xmm2
1721; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1722; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1723; SSSE3-NEXT:    retq
1724;
1725; AVX1-LABEL: test_bitreverse_v4i64:
1726; AVX1:       # %bb.0:
1727; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1728; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1729; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1730; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1731; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1732; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1733; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1734; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1735; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1736; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1737; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1738; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1739; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1740; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1741; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1742; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1743; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1744; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1745; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1746; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1747; AVX1-NEXT:    retq
1748;
1749; AVX2-LABEL: test_bitreverse_v4i64:
1750; AVX2:       # %bb.0:
1751; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1752; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1753; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1754; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1755; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1756; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1757; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1758; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1759; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1760; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1761; AVX2-NEXT:    retq
1762;
1763; AVX512-LABEL: test_bitreverse_v4i64:
1764; AVX512:       # %bb.0:
1765; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1766; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1767; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1768; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1769; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1770; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1771; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1772; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1773; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1774; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1775; AVX512-NEXT:    retq
1776;
1777; XOPAVX1-LABEL: test_bitreverse_v4i64:
1778; XOPAVX1:       # %bb.0:
1779; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1780; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1781; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1782; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1783; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1784; XOPAVX1-NEXT:    retq
1785;
1786; XOPAVX2-LABEL: test_bitreverse_v4i64:
1787; XOPAVX2:       # %bb.0:
1788; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1789; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1790; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1791; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1792; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1793; XOPAVX2-NEXT:    retq
1794;
1795; GFNISSE-LABEL: test_bitreverse_v4i64:
1796; GFNISSE:       # %bb.0:
1797; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1798; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
1799; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1800; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
1801; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
1802; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
1803; GFNISSE-NEXT:    retq
1804;
1805; GFNIAVX-LABEL: test_bitreverse_v4i64:
1806; GFNIAVX:       # %bb.0:
1807; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
1808; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1809; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1810; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1811; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
1812; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1813; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
1814; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1815; GFNIAVX-NEXT:    retq
1816;
1817; GFNIAVX2-LABEL: test_bitreverse_v4i64:
1818; GFNIAVX2:       # %bb.0:
1819; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1820; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1821; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1822; GFNIAVX2-NEXT:    retq
1823;
1824; GFNIAVX512F-LABEL: test_bitreverse_v4i64:
1825; GFNIAVX512F:       # %bb.0:
1826; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1827; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1828; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1829; GFNIAVX512F-NEXT:    retq
1830;
1831; GFNIAVX512BW-LABEL: test_bitreverse_v4i64:
1832; GFNIAVX512BW:       # %bb.0:
1833; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1834; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1835; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1836; GFNIAVX512BW-NEXT:    retq
1837  %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
1838  ret <4 x i64> %b
1839}
1840
1841define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
1842; SSE2-LABEL: test_bitreverse_v64i8:
1843; SSE2:       # %bb.0:
1844; SSE2-NEXT:    movdqa %xmm0, %xmm5
1845; SSE2-NEXT:    psrlw $4, %xmm5
1846; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1847; SSE2-NEXT:    pand %xmm4, %xmm5
1848; SSE2-NEXT:    pand %xmm4, %xmm0
1849; SSE2-NEXT:    psllw $4, %xmm0
1850; SSE2-NEXT:    por %xmm5, %xmm0
1851; SSE2-NEXT:    movdqa %xmm0, %xmm6
1852; SSE2-NEXT:    psrlw $2, %xmm6
1853; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1854; SSE2-NEXT:    pand %xmm5, %xmm6
1855; SSE2-NEXT:    pand %xmm5, %xmm0
1856; SSE2-NEXT:    psllw $2, %xmm0
1857; SSE2-NEXT:    por %xmm6, %xmm0
1858; SSE2-NEXT:    movdqa %xmm0, %xmm7
1859; SSE2-NEXT:    psrlw $1, %xmm7
1860; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1861; SSE2-NEXT:    pand %xmm6, %xmm7
1862; SSE2-NEXT:    pand %xmm6, %xmm0
1863; SSE2-NEXT:    paddb %xmm0, %xmm0
1864; SSE2-NEXT:    por %xmm7, %xmm0
1865; SSE2-NEXT:    movdqa %xmm1, %xmm7
1866; SSE2-NEXT:    psrlw $4, %xmm7
1867; SSE2-NEXT:    pand %xmm4, %xmm7
1868; SSE2-NEXT:    pand %xmm4, %xmm1
1869; SSE2-NEXT:    psllw $4, %xmm1
1870; SSE2-NEXT:    por %xmm7, %xmm1
1871; SSE2-NEXT:    movdqa %xmm1, %xmm7
1872; SSE2-NEXT:    psrlw $2, %xmm7
1873; SSE2-NEXT:    pand %xmm5, %xmm7
1874; SSE2-NEXT:    pand %xmm5, %xmm1
1875; SSE2-NEXT:    psllw $2, %xmm1
1876; SSE2-NEXT:    por %xmm7, %xmm1
1877; SSE2-NEXT:    movdqa %xmm1, %xmm7
1878; SSE2-NEXT:    psrlw $1, %xmm7
1879; SSE2-NEXT:    pand %xmm6, %xmm7
1880; SSE2-NEXT:    pand %xmm6, %xmm1
1881; SSE2-NEXT:    paddb %xmm1, %xmm1
1882; SSE2-NEXT:    por %xmm7, %xmm1
1883; SSE2-NEXT:    movdqa %xmm2, %xmm7
1884; SSE2-NEXT:    psrlw $4, %xmm7
1885; SSE2-NEXT:    pand %xmm4, %xmm7
1886; SSE2-NEXT:    pand %xmm4, %xmm2
1887; SSE2-NEXT:    psllw $4, %xmm2
1888; SSE2-NEXT:    por %xmm7, %xmm2
1889; SSE2-NEXT:    movdqa %xmm2, %xmm7
1890; SSE2-NEXT:    psrlw $2, %xmm7
1891; SSE2-NEXT:    pand %xmm5, %xmm7
1892; SSE2-NEXT:    pand %xmm5, %xmm2
1893; SSE2-NEXT:    psllw $2, %xmm2
1894; SSE2-NEXT:    por %xmm7, %xmm2
1895; SSE2-NEXT:    movdqa %xmm2, %xmm7
1896; SSE2-NEXT:    psrlw $1, %xmm7
1897; SSE2-NEXT:    pand %xmm6, %xmm7
1898; SSE2-NEXT:    pand %xmm6, %xmm2
1899; SSE2-NEXT:    paddb %xmm2, %xmm2
1900; SSE2-NEXT:    por %xmm7, %xmm2
1901; SSE2-NEXT:    movdqa %xmm3, %xmm7
1902; SSE2-NEXT:    psrlw $4, %xmm7
1903; SSE2-NEXT:    pand %xmm4, %xmm7
1904; SSE2-NEXT:    pand %xmm4, %xmm3
1905; SSE2-NEXT:    psllw $4, %xmm3
1906; SSE2-NEXT:    por %xmm7, %xmm3
1907; SSE2-NEXT:    movdqa %xmm3, %xmm4
1908; SSE2-NEXT:    psrlw $2, %xmm4
1909; SSE2-NEXT:    pand %xmm5, %xmm4
1910; SSE2-NEXT:    pand %xmm5, %xmm3
1911; SSE2-NEXT:    psllw $2, %xmm3
1912; SSE2-NEXT:    por %xmm4, %xmm3
1913; SSE2-NEXT:    movdqa %xmm3, %xmm4
1914; SSE2-NEXT:    psrlw $1, %xmm4
1915; SSE2-NEXT:    pand %xmm6, %xmm4
1916; SSE2-NEXT:    pand %xmm6, %xmm3
1917; SSE2-NEXT:    paddb %xmm3, %xmm3
1918; SSE2-NEXT:    por %xmm4, %xmm3
1919; SSE2-NEXT:    retq
1920;
1921; SSSE3-LABEL: test_bitreverse_v64i8:
1922; SSSE3:       # %bb.0:
1923; SSSE3-NEXT:    movdqa %xmm0, %xmm5
1924; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1925; SSSE3-NEXT:    pand %xmm8, %xmm0
1926; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1927; SSSE3-NEXT:    movdqa %xmm9, %xmm6
1928; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1929; SSSE3-NEXT:    psrlw $4, %xmm5
1930; SSSE3-NEXT:    pand %xmm8, %xmm5
1931; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1932; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1933; SSSE3-NEXT:    pshufb %xmm5, %xmm0
1934; SSSE3-NEXT:    por %xmm6, %xmm0
1935; SSSE3-NEXT:    movdqa %xmm1, %xmm5
1936; SSSE3-NEXT:    pand %xmm8, %xmm5
1937; SSSE3-NEXT:    movdqa %xmm9, %xmm6
1938; SSSE3-NEXT:    pshufb %xmm5, %xmm6
1939; SSSE3-NEXT:    psrlw $4, %xmm1
1940; SSSE3-NEXT:    pand %xmm8, %xmm1
1941; SSSE3-NEXT:    movdqa %xmm4, %xmm5
1942; SSSE3-NEXT:    pshufb %xmm1, %xmm5
1943; SSSE3-NEXT:    por %xmm6, %xmm5
1944; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1945; SSSE3-NEXT:    pand %xmm8, %xmm1
1946; SSSE3-NEXT:    movdqa %xmm9, %xmm7
1947; SSSE3-NEXT:    pshufb %xmm1, %xmm7
1948; SSSE3-NEXT:    psrlw $4, %xmm2
1949; SSSE3-NEXT:    pand %xmm8, %xmm2
1950; SSSE3-NEXT:    movdqa %xmm4, %xmm6
1951; SSSE3-NEXT:    pshufb %xmm2, %xmm6
1952; SSSE3-NEXT:    por %xmm7, %xmm6
1953; SSSE3-NEXT:    movdqa %xmm3, %xmm1
1954; SSSE3-NEXT:    pand %xmm8, %xmm1
1955; SSSE3-NEXT:    pshufb %xmm1, %xmm9
1956; SSSE3-NEXT:    psrlw $4, %xmm3
1957; SSSE3-NEXT:    pand %xmm8, %xmm3
1958; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1959; SSSE3-NEXT:    por %xmm9, %xmm4
1960; SSSE3-NEXT:    movdqa %xmm5, %xmm1
1961; SSSE3-NEXT:    movdqa %xmm6, %xmm2
1962; SSSE3-NEXT:    movdqa %xmm4, %xmm3
1963; SSSE3-NEXT:    retq
1964;
1965; AVX1-LABEL: test_bitreverse_v64i8:
1966; AVX1:       # %bb.0:
1967; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1968; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1969; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
1970; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1971; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1972; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1973; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1974; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1975; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
1976; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
1977; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm4
1978; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1979; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1980; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1981; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1982; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
1983; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1984; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1985; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
1986; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1987; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1988; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1989; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
1990; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
1991; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1992; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1993; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1994; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1995; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1996; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1997; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1998; AVX1-NEXT:    retq
1999;
2000; AVX2-LABEL: test_bitreverse_v64i8:
2001; AVX2:       # %bb.0:
2002; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2003; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
2004; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2005; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
2006; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2007; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2008; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2009; AVX2-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
2010; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
2011; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
2012; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
2013; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
2014; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
2015; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
2016; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
2017; AVX2-NEXT:    retq
2018;
2019; AVX512F-LABEL: test_bitreverse_v64i8:
2020; AVX512F:       # %bb.0:
2021; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2022; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2023; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
2024; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2025; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
2026; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm5
2027; AVX512F-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
2028; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
2029; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
2030; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
2031; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2032; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
2033; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
2034; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
2035; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
2036; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2037; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm0
2038; AVX512F-NEXT:    retq
2039;
2040; AVX512BW-LABEL: test_bitreverse_v64i8:
2041; AVX512BW:       # %bb.0:
2042; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2043; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2044; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2045; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2046; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2047; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2048; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2049; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2050; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2051; AVX512BW-NEXT:    retq
2052;
2053; XOPAVX1-LABEL: test_bitreverse_v64i8:
2054; XOPAVX1:       # %bb.0:
2055; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2056; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
2057; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2058; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2059; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2060; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2061; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2062; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2063; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2064; XOPAVX1-NEXT:    retq
2065;
2066; XOPAVX2-LABEL: test_bitreverse_v64i8:
2067; XOPAVX2:       # %bb.0:
2068; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2069; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
2070; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2071; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2072; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2073; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2074; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2075; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2076; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2077; XOPAVX2-NEXT:    retq
2078;
2079; GFNISSE-LABEL: test_bitreverse_v64i8:
2080; GFNISSE:       # %bb.0:
2081; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
2082; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
2083; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
2084; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
2085; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm3
2086; GFNISSE-NEXT:    retq
2087;
2088; GFNIAVX-LABEL: test_bitreverse_v64i8:
2089; GFNIAVX:       # %bb.0:
2090; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
2091; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
2092; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2
2093; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
2094; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2095; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
2096; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2
2097; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
2098; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2099; GFNIAVX-NEXT:    retq
2100;
2101; GFNIAVX2-LABEL: test_bitreverse_v64i8:
2102; GFNIAVX2:       # %bb.0:
2103; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2104; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
2105; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
2106; GFNIAVX2-NEXT:    retq
2107;
2108; GFNIAVX512F-LABEL: test_bitreverse_v64i8:
2109; GFNIAVX512F:       # %bb.0:
2110; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2111; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2112; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
2113; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
2114; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2115; GFNIAVX512F-NEXT:    retq
2116;
2117; GFNIAVX512BW-LABEL: test_bitreverse_v64i8:
2118; GFNIAVX512BW:       # %bb.0:
2119; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2120; GFNIAVX512BW-NEXT:    retq
2121  %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
2122  ret <64 x i8> %b
2123}
2124
2125define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
2126; SSE2-LABEL: test_bitreverse_v32i16:
2127; SSE2:       # %bb.0:
2128; SSE2-NEXT:    movdqa %xmm0, %xmm4
2129; SSE2-NEXT:    psrlw $8, %xmm4
2130; SSE2-NEXT:    psllw $8, %xmm0
2131; SSE2-NEXT:    por %xmm4, %xmm0
2132; SSE2-NEXT:    movdqa %xmm0, %xmm5
2133; SSE2-NEXT:    psrlw $4, %xmm5
2134; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2135; SSE2-NEXT:    pand %xmm4, %xmm5
2136; SSE2-NEXT:    pand %xmm4, %xmm0
2137; SSE2-NEXT:    psllw $4, %xmm0
2138; SSE2-NEXT:    por %xmm5, %xmm0
2139; SSE2-NEXT:    movdqa %xmm0, %xmm6
2140; SSE2-NEXT:    psrlw $2, %xmm6
2141; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2142; SSE2-NEXT:    pand %xmm5, %xmm6
2143; SSE2-NEXT:    pand %xmm5, %xmm0
2144; SSE2-NEXT:    psllw $2, %xmm0
2145; SSE2-NEXT:    por %xmm6, %xmm0
2146; SSE2-NEXT:    movdqa %xmm0, %xmm7
2147; SSE2-NEXT:    psrlw $1, %xmm7
2148; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2149; SSE2-NEXT:    pand %xmm6, %xmm7
2150; SSE2-NEXT:    pand %xmm6, %xmm0
2151; SSE2-NEXT:    paddb %xmm0, %xmm0
2152; SSE2-NEXT:    por %xmm7, %xmm0
2153; SSE2-NEXT:    movdqa %xmm1, %xmm7
2154; SSE2-NEXT:    psrlw $8, %xmm7
2155; SSE2-NEXT:    psllw $8, %xmm1
2156; SSE2-NEXT:    por %xmm7, %xmm1
2157; SSE2-NEXT:    movdqa %xmm1, %xmm7
2158; SSE2-NEXT:    psrlw $4, %xmm7
2159; SSE2-NEXT:    pand %xmm4, %xmm7
2160; SSE2-NEXT:    pand %xmm4, %xmm1
2161; SSE2-NEXT:    psllw $4, %xmm1
2162; SSE2-NEXT:    por %xmm7, %xmm1
2163; SSE2-NEXT:    movdqa %xmm1, %xmm7
2164; SSE2-NEXT:    psrlw $2, %xmm7
2165; SSE2-NEXT:    pand %xmm5, %xmm7
2166; SSE2-NEXT:    pand %xmm5, %xmm1
2167; SSE2-NEXT:    psllw $2, %xmm1
2168; SSE2-NEXT:    por %xmm7, %xmm1
2169; SSE2-NEXT:    movdqa %xmm1, %xmm7
2170; SSE2-NEXT:    psrlw $1, %xmm7
2171; SSE2-NEXT:    pand %xmm6, %xmm7
2172; SSE2-NEXT:    pand %xmm6, %xmm1
2173; SSE2-NEXT:    paddb %xmm1, %xmm1
2174; SSE2-NEXT:    por %xmm7, %xmm1
2175; SSE2-NEXT:    movdqa %xmm2, %xmm7
2176; SSE2-NEXT:    psrlw $8, %xmm7
2177; SSE2-NEXT:    psllw $8, %xmm2
2178; SSE2-NEXT:    por %xmm7, %xmm2
2179; SSE2-NEXT:    movdqa %xmm2, %xmm7
2180; SSE2-NEXT:    psrlw $4, %xmm7
2181; SSE2-NEXT:    pand %xmm4, %xmm7
2182; SSE2-NEXT:    pand %xmm4, %xmm2
2183; SSE2-NEXT:    psllw $4, %xmm2
2184; SSE2-NEXT:    por %xmm7, %xmm2
2185; SSE2-NEXT:    movdqa %xmm2, %xmm7
2186; SSE2-NEXT:    psrlw $2, %xmm7
2187; SSE2-NEXT:    pand %xmm5, %xmm7
2188; SSE2-NEXT:    pand %xmm5, %xmm2
2189; SSE2-NEXT:    psllw $2, %xmm2
2190; SSE2-NEXT:    por %xmm7, %xmm2
2191; SSE2-NEXT:    movdqa %xmm2, %xmm7
2192; SSE2-NEXT:    psrlw $1, %xmm7
2193; SSE2-NEXT:    pand %xmm6, %xmm7
2194; SSE2-NEXT:    pand %xmm6, %xmm2
2195; SSE2-NEXT:    paddb %xmm2, %xmm2
2196; SSE2-NEXT:    por %xmm7, %xmm2
2197; SSE2-NEXT:    movdqa %xmm3, %xmm7
2198; SSE2-NEXT:    psrlw $8, %xmm7
2199; SSE2-NEXT:    psllw $8, %xmm3
2200; SSE2-NEXT:    por %xmm7, %xmm3
2201; SSE2-NEXT:    movdqa %xmm3, %xmm7
2202; SSE2-NEXT:    psrlw $4, %xmm7
2203; SSE2-NEXT:    pand %xmm4, %xmm7
2204; SSE2-NEXT:    pand %xmm4, %xmm3
2205; SSE2-NEXT:    psllw $4, %xmm3
2206; SSE2-NEXT:    por %xmm7, %xmm3
2207; SSE2-NEXT:    movdqa %xmm3, %xmm4
2208; SSE2-NEXT:    psrlw $2, %xmm4
2209; SSE2-NEXT:    pand %xmm5, %xmm4
2210; SSE2-NEXT:    pand %xmm5, %xmm3
2211; SSE2-NEXT:    psllw $2, %xmm3
2212; SSE2-NEXT:    por %xmm4, %xmm3
2213; SSE2-NEXT:    movdqa %xmm3, %xmm4
2214; SSE2-NEXT:    psrlw $1, %xmm4
2215; SSE2-NEXT:    pand %xmm6, %xmm4
2216; SSE2-NEXT:    pand %xmm6, %xmm3
2217; SSE2-NEXT:    paddb %xmm3, %xmm3
2218; SSE2-NEXT:    por %xmm4, %xmm3
2219; SSE2-NEXT:    retq
2220;
2221; SSSE3-LABEL: test_bitreverse_v32i16:
2222; SSSE3:       # %bb.0:
2223; SSSE3-NEXT:    movdqa %xmm1, %xmm5
2224; SSSE3-NEXT:    movdqa %xmm0, %xmm1
2225; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2226; SSSE3-NEXT:    pshufb %xmm8, %xmm1
2227; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2228; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2229; SSSE3-NEXT:    pand %xmm9, %xmm0
2230; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2231; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2232; SSSE3-NEXT:    pshufb %xmm0, %xmm6
2233; SSSE3-NEXT:    psrlw $4, %xmm1
2234; SSSE3-NEXT:    pand %xmm9, %xmm1
2235; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2236; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2237; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2238; SSSE3-NEXT:    por %xmm6, %xmm0
2239; SSSE3-NEXT:    pshufb %xmm8, %xmm5
2240; SSSE3-NEXT:    movdqa %xmm5, %xmm1
2241; SSSE3-NEXT:    pand %xmm9, %xmm1
2242; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2243; SSSE3-NEXT:    pshufb %xmm1, %xmm6
2244; SSSE3-NEXT:    psrlw $4, %xmm5
2245; SSSE3-NEXT:    pand %xmm9, %xmm5
2246; SSSE3-NEXT:    movdqa %xmm4, %xmm1
2247; SSSE3-NEXT:    pshufb %xmm5, %xmm1
2248; SSSE3-NEXT:    por %xmm6, %xmm1
2249; SSSE3-NEXT:    pshufb %xmm8, %xmm2
2250; SSSE3-NEXT:    movdqa %xmm2, %xmm5
2251; SSSE3-NEXT:    pand %xmm9, %xmm5
2252; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2253; SSSE3-NEXT:    pshufb %xmm5, %xmm6
2254; SSSE3-NEXT:    psrlw $4, %xmm2
2255; SSSE3-NEXT:    pand %xmm9, %xmm2
2256; SSSE3-NEXT:    movdqa %xmm4, %xmm5
2257; SSSE3-NEXT:    pshufb %xmm2, %xmm5
2258; SSSE3-NEXT:    por %xmm6, %xmm5
2259; SSSE3-NEXT:    pshufb %xmm8, %xmm3
2260; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2261; SSSE3-NEXT:    pand %xmm9, %xmm2
2262; SSSE3-NEXT:    pshufb %xmm2, %xmm7
2263; SSSE3-NEXT:    psrlw $4, %xmm3
2264; SSSE3-NEXT:    pand %xmm9, %xmm3
2265; SSSE3-NEXT:    pshufb %xmm3, %xmm4
2266; SSSE3-NEXT:    por %xmm7, %xmm4
2267; SSSE3-NEXT:    movdqa %xmm5, %xmm2
2268; SSSE3-NEXT:    movdqa %xmm4, %xmm3
2269; SSSE3-NEXT:    retq
2270;
2271; AVX1-LABEL: test_bitreverse_v32i16:
2272; AVX1:       # %bb.0:
2273; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2274; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2275; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2276; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2277; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2278; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2279; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2280; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2281; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2282; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2283; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2284; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2285; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2286; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
2287; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2288; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
2289; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2290; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
2291; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
2292; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2293; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2294; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2295; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2296; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2297; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2298; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2299; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2300; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2301; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2302; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
2303; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
2304; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
2305; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2306; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
2307; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
2308; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2309; AVX1-NEXT:    retq
2310;
2311; AVX2-LABEL: test_bitreverse_v32i16:
2312; AVX2:       # %bb.0:
2313; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2314; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2315; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2316; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
2317; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2318; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2319; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2320; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
2321; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2322; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
2323; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
2324; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2325; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
2326; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2327; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
2328; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2329; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
2330; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
2331; AVX2-NEXT:    retq
2332;
2333; AVX512F-LABEL: test_bitreverse_v32i16:
2334; AVX512F:       # %bb.0:
2335; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2336; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2337; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2338; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2339; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
2340; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2341; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2342; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2343; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
2344; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2345; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
2346; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
2347; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
2348; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2349; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
2350; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
2351; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
2352; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
2353; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2354; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
2355; AVX512F-NEXT:    retq
2356;
2357; AVX512BW-LABEL: test_bitreverse_v32i16:
2358; AVX512BW:       # %bb.0:
2359; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
2360; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2361; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2362; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2363; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2364; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2365; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2366; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2367; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2368; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2369; AVX512BW-NEXT:    retq
2370;
2371; XOPAVX1-LABEL: test_bitreverse_v32i16:
2372; XOPAVX1:       # %bb.0:
2373; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2374; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
2375; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2376; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2377; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2378; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2379; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2380; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2381; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2382; XOPAVX1-NEXT:    retq
2383;
2384; XOPAVX2-LABEL: test_bitreverse_v32i16:
2385; XOPAVX2:       # %bb.0:
2386; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2387; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
2388; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2389; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2390; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2391; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2392; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2393; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2394; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2395; XOPAVX2-NEXT:    retq
2396;
2397; GFNISSE-LABEL: test_bitreverse_v32i16:
2398; GFNISSE:       # %bb.0:
2399; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2400; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
2401; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
2402; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
2403; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
2404; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
2405; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
2406; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
2407; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
2408; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
2409; GFNISSE-NEXT:    retq
2410;
2411; GFNIAVX-LABEL: test_bitreverse_v32i16:
2412; GFNIAVX:       # %bb.0:
2413; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
2414; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2415; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2416; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
2417; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2418; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2419; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
2420; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2421; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
2422; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2423; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2424; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2425; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
2426; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2427; GFNIAVX-NEXT:    retq
2428;
2429; GFNIAVX2-LABEL: test_bitreverse_v32i16:
2430; GFNIAVX2:       # %bb.0:
2431; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2432; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2433; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2434; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2435; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2436; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2437; GFNIAVX2-NEXT:    retq
2438;
2439; GFNIAVX512F-LABEL: test_bitreverse_v32i16:
2440; GFNIAVX512F:       # %bb.0:
2441; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2442; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2443; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2444; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2445; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2446; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2447; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2448; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2449; GFNIAVX512F-NEXT:    retq
2450;
2451; GFNIAVX512BW-LABEL: test_bitreverse_v32i16:
2452; GFNIAVX512BW:       # %bb.0:
2453; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
2454; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2455; GFNIAVX512BW-NEXT:    retq
2456  %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
2457  ret <32 x i16> %b
2458}
2459
2460define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
2461; SSE2-LABEL: test_bitreverse_v16i32:
2462; SSE2:       # %bb.0:
2463; SSE2-NEXT:    pxor %xmm8, %xmm8
2464; SSE2-NEXT:    movdqa %xmm0, %xmm5
2465; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
2466; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
2467; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
2468; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
2469; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2470; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2471; SSE2-NEXT:    packuswb %xmm5, %xmm0
2472; SSE2-NEXT:    movdqa %xmm0, %xmm6
2473; SSE2-NEXT:    psrlw $4, %xmm6
2474; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2475; SSE2-NEXT:    pand %xmm5, %xmm6
2476; SSE2-NEXT:    pand %xmm5, %xmm0
2477; SSE2-NEXT:    psllw $4, %xmm0
2478; SSE2-NEXT:    por %xmm6, %xmm0
2479; SSE2-NEXT:    movdqa %xmm0, %xmm7
2480; SSE2-NEXT:    psrlw $2, %xmm7
2481; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2482; SSE2-NEXT:    pand %xmm6, %xmm7
2483; SSE2-NEXT:    pand %xmm6, %xmm0
2484; SSE2-NEXT:    psllw $2, %xmm0
2485; SSE2-NEXT:    por %xmm7, %xmm0
2486; SSE2-NEXT:    movdqa %xmm0, %xmm4
2487; SSE2-NEXT:    psrlw $1, %xmm4
2488; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2489; SSE2-NEXT:    pand %xmm7, %xmm4
2490; SSE2-NEXT:    pand %xmm7, %xmm0
2491; SSE2-NEXT:    paddb %xmm0, %xmm0
2492; SSE2-NEXT:    por %xmm4, %xmm0
2493; SSE2-NEXT:    movdqa %xmm1, %xmm4
2494; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
2495; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2496; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2497; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
2498; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2499; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2500; SSE2-NEXT:    packuswb %xmm4, %xmm1
2501; SSE2-NEXT:    movdqa %xmm1, %xmm4
2502; SSE2-NEXT:    psrlw $4, %xmm4
2503; SSE2-NEXT:    pand %xmm5, %xmm4
2504; SSE2-NEXT:    pand %xmm5, %xmm1
2505; SSE2-NEXT:    psllw $4, %xmm1
2506; SSE2-NEXT:    por %xmm4, %xmm1
2507; SSE2-NEXT:    movdqa %xmm1, %xmm4
2508; SSE2-NEXT:    psrlw $2, %xmm4
2509; SSE2-NEXT:    pand %xmm6, %xmm4
2510; SSE2-NEXT:    pand %xmm6, %xmm1
2511; SSE2-NEXT:    psllw $2, %xmm1
2512; SSE2-NEXT:    por %xmm4, %xmm1
2513; SSE2-NEXT:    movdqa %xmm1, %xmm4
2514; SSE2-NEXT:    psrlw $1, %xmm4
2515; SSE2-NEXT:    pand %xmm7, %xmm4
2516; SSE2-NEXT:    pand %xmm7, %xmm1
2517; SSE2-NEXT:    paddb %xmm1, %xmm1
2518; SSE2-NEXT:    por %xmm4, %xmm1
2519; SSE2-NEXT:    movdqa %xmm2, %xmm4
2520; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
2521; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2522; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2523; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
2524; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
2525; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
2526; SSE2-NEXT:    packuswb %xmm4, %xmm2
2527; SSE2-NEXT:    movdqa %xmm2, %xmm4
2528; SSE2-NEXT:    psrlw $4, %xmm4
2529; SSE2-NEXT:    pand %xmm5, %xmm4
2530; SSE2-NEXT:    pand %xmm5, %xmm2
2531; SSE2-NEXT:    psllw $4, %xmm2
2532; SSE2-NEXT:    por %xmm4, %xmm2
2533; SSE2-NEXT:    movdqa %xmm2, %xmm4
2534; SSE2-NEXT:    psrlw $2, %xmm4
2535; SSE2-NEXT:    pand %xmm6, %xmm4
2536; SSE2-NEXT:    pand %xmm6, %xmm2
2537; SSE2-NEXT:    psllw $2, %xmm2
2538; SSE2-NEXT:    por %xmm4, %xmm2
2539; SSE2-NEXT:    movdqa %xmm2, %xmm4
2540; SSE2-NEXT:    psrlw $1, %xmm4
2541; SSE2-NEXT:    pand %xmm7, %xmm4
2542; SSE2-NEXT:    pand %xmm7, %xmm2
2543; SSE2-NEXT:    paddb %xmm2, %xmm2
2544; SSE2-NEXT:    por %xmm4, %xmm2
2545; SSE2-NEXT:    movdqa %xmm3, %xmm4
2546; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
2547; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2548; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2549; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
2550; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
2551; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
2552; SSE2-NEXT:    packuswb %xmm4, %xmm3
2553; SSE2-NEXT:    movdqa %xmm3, %xmm4
2554; SSE2-NEXT:    psrlw $4, %xmm4
2555; SSE2-NEXT:    pand %xmm5, %xmm4
2556; SSE2-NEXT:    pand %xmm5, %xmm3
2557; SSE2-NEXT:    psllw $4, %xmm3
2558; SSE2-NEXT:    por %xmm4, %xmm3
2559; SSE2-NEXT:    movdqa %xmm3, %xmm4
2560; SSE2-NEXT:    psrlw $2, %xmm4
2561; SSE2-NEXT:    pand %xmm6, %xmm4
2562; SSE2-NEXT:    pand %xmm6, %xmm3
2563; SSE2-NEXT:    psllw $2, %xmm3
2564; SSE2-NEXT:    por %xmm4, %xmm3
2565; SSE2-NEXT:    movdqa %xmm3, %xmm4
2566; SSE2-NEXT:    psrlw $1, %xmm4
2567; SSE2-NEXT:    pand %xmm7, %xmm4
2568; SSE2-NEXT:    pand %xmm7, %xmm3
2569; SSE2-NEXT:    paddb %xmm3, %xmm3
2570; SSE2-NEXT:    por %xmm4, %xmm3
2571; SSE2-NEXT:    retq
2572;
2573; SSSE3-LABEL: test_bitreverse_v16i32:
2574; SSSE3:       # %bb.0:
2575; SSSE3-NEXT:    movdqa %xmm1, %xmm5
2576; SSSE3-NEXT:    movdqa %xmm0, %xmm1
2577; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2578; SSSE3-NEXT:    pshufb %xmm8, %xmm1
2579; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2580; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2581; SSSE3-NEXT:    pand %xmm9, %xmm0
2582; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2583; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2584; SSSE3-NEXT:    pshufb %xmm0, %xmm6
2585; SSSE3-NEXT:    psrlw $4, %xmm1
2586; SSSE3-NEXT:    pand %xmm9, %xmm1
2587; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2588; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2589; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2590; SSSE3-NEXT:    por %xmm6, %xmm0
2591; SSSE3-NEXT:    pshufb %xmm8, %xmm5
2592; SSSE3-NEXT:    movdqa %xmm5, %xmm1
2593; SSSE3-NEXT:    pand %xmm9, %xmm1
2594; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2595; SSSE3-NEXT:    pshufb %xmm1, %xmm6
2596; SSSE3-NEXT:    psrlw $4, %xmm5
2597; SSSE3-NEXT:    pand %xmm9, %xmm5
2598; SSSE3-NEXT:    movdqa %xmm4, %xmm1
2599; SSSE3-NEXT:    pshufb %xmm5, %xmm1
2600; SSSE3-NEXT:    por %xmm6, %xmm1
2601; SSSE3-NEXT:    pshufb %xmm8, %xmm2
2602; SSSE3-NEXT:    movdqa %xmm2, %xmm5
2603; SSSE3-NEXT:    pand %xmm9, %xmm5
2604; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2605; SSSE3-NEXT:    pshufb %xmm5, %xmm6
2606; SSSE3-NEXT:    psrlw $4, %xmm2
2607; SSSE3-NEXT:    pand %xmm9, %xmm2
2608; SSSE3-NEXT:    movdqa %xmm4, %xmm5
2609; SSSE3-NEXT:    pshufb %xmm2, %xmm5
2610; SSSE3-NEXT:    por %xmm6, %xmm5
2611; SSSE3-NEXT:    pshufb %xmm8, %xmm3
2612; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2613; SSSE3-NEXT:    pand %xmm9, %xmm2
2614; SSSE3-NEXT:    pshufb %xmm2, %xmm7
2615; SSSE3-NEXT:    psrlw $4, %xmm3
2616; SSSE3-NEXT:    pand %xmm9, %xmm3
2617; SSSE3-NEXT:    pshufb %xmm3, %xmm4
2618; SSSE3-NEXT:    por %xmm7, %xmm4
2619; SSSE3-NEXT:    movdqa %xmm5, %xmm2
2620; SSSE3-NEXT:    movdqa %xmm4, %xmm3
2621; SSSE3-NEXT:    retq
2622;
2623; AVX1-LABEL: test_bitreverse_v16i32:
2624; AVX1:       # %bb.0:
2625; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2626; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2627; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2628; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2629; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2630; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2631; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2632; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2633; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2634; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2635; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2636; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2637; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2638; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
2639; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2640; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
2641; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2642; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
2643; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
2644; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2645; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2646; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2647; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2648; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2649; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2650; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2651; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2652; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2653; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2654; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
2655; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
2656; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
2657; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2658; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
2659; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
2660; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2661; AVX1-NEXT:    retq
2662;
2663; AVX2-LABEL: test_bitreverse_v16i32:
2664; AVX2:       # %bb.0:
2665; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2666; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2667; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2668; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
2669; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2670; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2671; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2672; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
2673; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2674; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
2675; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
2676; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2677; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
2678; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2679; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
2680; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2681; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
2682; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
2683; AVX2-NEXT:    retq
2684;
2685; AVX512F-LABEL: test_bitreverse_v16i32:
2686; AVX512F:       # %bb.0:
2687; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2688; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2689; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2690; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2691; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
2692; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2693; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2694; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2695; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
2696; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2697; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
2698; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
2699; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
2700; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2701; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
2702; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
2703; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
2704; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
2705; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2706; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
2707; AVX512F-NEXT:    retq
2708;
2709; AVX512BW-LABEL: test_bitreverse_v16i32:
2710; AVX512BW:       # %bb.0:
2711; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
2712; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2713; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2714; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2715; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2716; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2717; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2718; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2719; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2720; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2721; AVX512BW-NEXT:    retq
2722;
2723; XOPAVX1-LABEL: test_bitreverse_v16i32:
2724; XOPAVX1:       # %bb.0:
2725; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2726; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
2727; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2728; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2729; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2730; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2731; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2732; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2733; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2734; XOPAVX1-NEXT:    retq
2735;
2736; XOPAVX2-LABEL: test_bitreverse_v16i32:
2737; XOPAVX2:       # %bb.0:
2738; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2739; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
2740; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2741; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2742; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2743; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2744; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2745; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2746; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2747; XOPAVX2-NEXT:    retq
2748;
2749; GFNISSE-LABEL: test_bitreverse_v16i32:
2750; GFNISSE:       # %bb.0:
2751; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2752; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
2753; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
2754; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
2755; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
2756; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
2757; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
2758; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
2759; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
2760; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
2761; GFNISSE-NEXT:    retq
2762;
2763; GFNIAVX-LABEL: test_bitreverse_v16i32:
2764; GFNIAVX:       # %bb.0:
2765; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
2766; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2767; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2768; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
2769; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2770; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2771; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
2772; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2773; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
2774; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2775; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2776; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2777; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
2778; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2779; GFNIAVX-NEXT:    retq
2780;
2781; GFNIAVX2-LABEL: test_bitreverse_v16i32:
2782; GFNIAVX2:       # %bb.0:
2783; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2784; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2785; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2786; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2787; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2788; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2789; GFNIAVX2-NEXT:    retq
2790;
2791; GFNIAVX512F-LABEL: test_bitreverse_v16i32:
2792; GFNIAVX512F:       # %bb.0:
2793; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2794; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2795; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2796; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2797; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2798; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2799; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2800; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2801; GFNIAVX512F-NEXT:    retq
2802;
2803; GFNIAVX512BW-LABEL: test_bitreverse_v16i32:
2804; GFNIAVX512BW:       # %bb.0:
2805; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
2806; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2807; GFNIAVX512BW-NEXT:    retq
2808  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
2809  ret <16 x i32> %b
2810}
2811
2812define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
2813; SSE2-LABEL: test_bitreverse_v8i64:
2814; SSE2:       # %bb.0:
2815; SSE2-NEXT:    pxor %xmm8, %xmm8
2816; SSE2-NEXT:    movdqa %xmm0, %xmm5
2817; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
2818; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
2819; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
2820; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
2821; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
2822; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2823; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2824; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2825; SSE2-NEXT:    packuswb %xmm5, %xmm0
2826; SSE2-NEXT:    movdqa %xmm0, %xmm6
2827; SSE2-NEXT:    psrlw $4, %xmm6
2828; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2829; SSE2-NEXT:    pand %xmm5, %xmm6
2830; SSE2-NEXT:    pand %xmm5, %xmm0
2831; SSE2-NEXT:    psllw $4, %xmm0
2832; SSE2-NEXT:    por %xmm6, %xmm0
2833; SSE2-NEXT:    movdqa %xmm0, %xmm7
2834; SSE2-NEXT:    psrlw $2, %xmm7
2835; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2836; SSE2-NEXT:    pand %xmm6, %xmm7
2837; SSE2-NEXT:    pand %xmm6, %xmm0
2838; SSE2-NEXT:    psllw $2, %xmm0
2839; SSE2-NEXT:    por %xmm7, %xmm0
2840; SSE2-NEXT:    movdqa %xmm0, %xmm4
2841; SSE2-NEXT:    psrlw $1, %xmm4
2842; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2843; SSE2-NEXT:    pand %xmm7, %xmm4
2844; SSE2-NEXT:    pand %xmm7, %xmm0
2845; SSE2-NEXT:    paddb %xmm0, %xmm0
2846; SSE2-NEXT:    por %xmm4, %xmm0
2847; SSE2-NEXT:    movdqa %xmm1, %xmm4
2848; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
2849; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2850; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2851; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2852; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
2853; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2854; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2855; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2856; SSE2-NEXT:    packuswb %xmm4, %xmm1
2857; SSE2-NEXT:    movdqa %xmm1, %xmm4
2858; SSE2-NEXT:    psrlw $4, %xmm4
2859; SSE2-NEXT:    pand %xmm5, %xmm4
2860; SSE2-NEXT:    pand %xmm5, %xmm1
2861; SSE2-NEXT:    psllw $4, %xmm1
2862; SSE2-NEXT:    por %xmm4, %xmm1
2863; SSE2-NEXT:    movdqa %xmm1, %xmm4
2864; SSE2-NEXT:    psrlw $2, %xmm4
2865; SSE2-NEXT:    pand %xmm6, %xmm4
2866; SSE2-NEXT:    pand %xmm6, %xmm1
2867; SSE2-NEXT:    psllw $2, %xmm1
2868; SSE2-NEXT:    por %xmm4, %xmm1
2869; SSE2-NEXT:    movdqa %xmm1, %xmm4
2870; SSE2-NEXT:    psrlw $1, %xmm4
2871; SSE2-NEXT:    pand %xmm7, %xmm4
2872; SSE2-NEXT:    pand %xmm7, %xmm1
2873; SSE2-NEXT:    paddb %xmm1, %xmm1
2874; SSE2-NEXT:    por %xmm4, %xmm1
2875; SSE2-NEXT:    movdqa %xmm2, %xmm4
2876; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
2877; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2878; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2879; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2880; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
2881; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
2882; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
2883; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
2884; SSE2-NEXT:    packuswb %xmm4, %xmm2
2885; SSE2-NEXT:    movdqa %xmm2, %xmm4
2886; SSE2-NEXT:    psrlw $4, %xmm4
2887; SSE2-NEXT:    pand %xmm5, %xmm4
2888; SSE2-NEXT:    pand %xmm5, %xmm2
2889; SSE2-NEXT:    psllw $4, %xmm2
2890; SSE2-NEXT:    por %xmm4, %xmm2
2891; SSE2-NEXT:    movdqa %xmm2, %xmm4
2892; SSE2-NEXT:    psrlw $2, %xmm4
2893; SSE2-NEXT:    pand %xmm6, %xmm4
2894; SSE2-NEXT:    pand %xmm6, %xmm2
2895; SSE2-NEXT:    psllw $2, %xmm2
2896; SSE2-NEXT:    por %xmm4, %xmm2
2897; SSE2-NEXT:    movdqa %xmm2, %xmm4
2898; SSE2-NEXT:    psrlw $1, %xmm4
2899; SSE2-NEXT:    pand %xmm7, %xmm4
2900; SSE2-NEXT:    pand %xmm7, %xmm2
2901; SSE2-NEXT:    paddb %xmm2, %xmm2
2902; SSE2-NEXT:    por %xmm4, %xmm2
2903; SSE2-NEXT:    movdqa %xmm3, %xmm4
2904; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
2905; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2906; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2907; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2908; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
2909; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
2910; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
2911; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
2912; SSE2-NEXT:    packuswb %xmm4, %xmm3
2913; SSE2-NEXT:    movdqa %xmm3, %xmm4
2914; SSE2-NEXT:    psrlw $4, %xmm4
2915; SSE2-NEXT:    pand %xmm5, %xmm4
2916; SSE2-NEXT:    pand %xmm5, %xmm3
2917; SSE2-NEXT:    psllw $4, %xmm3
2918; SSE2-NEXT:    por %xmm4, %xmm3
2919; SSE2-NEXT:    movdqa %xmm3, %xmm4
2920; SSE2-NEXT:    psrlw $2, %xmm4
2921; SSE2-NEXT:    pand %xmm6, %xmm4
2922; SSE2-NEXT:    pand %xmm6, %xmm3
2923; SSE2-NEXT:    psllw $2, %xmm3
2924; SSE2-NEXT:    por %xmm4, %xmm3
2925; SSE2-NEXT:    movdqa %xmm3, %xmm4
2926; SSE2-NEXT:    psrlw $1, %xmm4
2927; SSE2-NEXT:    pand %xmm7, %xmm4
2928; SSE2-NEXT:    pand %xmm7, %xmm3
2929; SSE2-NEXT:    paddb %xmm3, %xmm3
2930; SSE2-NEXT:    por %xmm4, %xmm3
2931; SSE2-NEXT:    retq
2932;
2933; SSSE3-LABEL: test_bitreverse_v8i64:
2934; SSSE3:       # %bb.0:
2935; SSSE3-NEXT:    movdqa %xmm1, %xmm5
2936; SSSE3-NEXT:    movdqa %xmm0, %xmm1
2937; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2938; SSSE3-NEXT:    pshufb %xmm8, %xmm1
2939; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2940; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2941; SSSE3-NEXT:    pand %xmm9, %xmm0
2942; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2943; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2944; SSSE3-NEXT:    pshufb %xmm0, %xmm6
2945; SSSE3-NEXT:    psrlw $4, %xmm1
2946; SSSE3-NEXT:    pand %xmm9, %xmm1
2947; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2948; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2949; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2950; SSSE3-NEXT:    por %xmm6, %xmm0
2951; SSSE3-NEXT:    pshufb %xmm8, %xmm5
2952; SSSE3-NEXT:    movdqa %xmm5, %xmm1
2953; SSSE3-NEXT:    pand %xmm9, %xmm1
2954; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2955; SSSE3-NEXT:    pshufb %xmm1, %xmm6
2956; SSSE3-NEXT:    psrlw $4, %xmm5
2957; SSSE3-NEXT:    pand %xmm9, %xmm5
2958; SSSE3-NEXT:    movdqa %xmm4, %xmm1
2959; SSSE3-NEXT:    pshufb %xmm5, %xmm1
2960; SSSE3-NEXT:    por %xmm6, %xmm1
2961; SSSE3-NEXT:    pshufb %xmm8, %xmm2
2962; SSSE3-NEXT:    movdqa %xmm2, %xmm5
2963; SSSE3-NEXT:    pand %xmm9, %xmm5
2964; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2965; SSSE3-NEXT:    pshufb %xmm5, %xmm6
2966; SSSE3-NEXT:    psrlw $4, %xmm2
2967; SSSE3-NEXT:    pand %xmm9, %xmm2
2968; SSSE3-NEXT:    movdqa %xmm4, %xmm5
2969; SSSE3-NEXT:    pshufb %xmm2, %xmm5
2970; SSSE3-NEXT:    por %xmm6, %xmm5
2971; SSSE3-NEXT:    pshufb %xmm8, %xmm3
2972; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2973; SSSE3-NEXT:    pand %xmm9, %xmm2
2974; SSSE3-NEXT:    pshufb %xmm2, %xmm7
2975; SSSE3-NEXT:    psrlw $4, %xmm3
2976; SSSE3-NEXT:    pand %xmm9, %xmm3
2977; SSSE3-NEXT:    pshufb %xmm3, %xmm4
2978; SSSE3-NEXT:    por %xmm7, %xmm4
2979; SSSE3-NEXT:    movdqa %xmm5, %xmm2
2980; SSSE3-NEXT:    movdqa %xmm4, %xmm3
2981; SSSE3-NEXT:    retq
2982;
2983; AVX1-LABEL: test_bitreverse_v8i64:
2984; AVX1:       # %bb.0:
2985; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2986; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2987; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2988; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2989; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2990; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2991; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2992; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2993; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2994; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2995; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2996; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2997; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2998; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
2999; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
3000; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
3001; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
3002; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
3003; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
3004; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3005; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3006; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
3007; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
3008; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
3009; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
3010; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
3011; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
3012; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
3013; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
3014; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
3015; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
3016; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
3017; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
3018; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
3019; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
3020; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
3021; AVX1-NEXT:    retq
3022;
3023; AVX2-LABEL: test_bitreverse_v8i64:
3024; AVX2:       # %bb.0:
3025; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3026; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3027; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3028; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
3029; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3030; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
3031; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
3032; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
3033; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3034; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
3035; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
3036; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3037; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
3038; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
3039; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
3040; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
3041; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
3042; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
3043; AVX2-NEXT:    retq
3044;
3045; AVX512F-LABEL: test_bitreverse_v8i64:
3046; AVX512F:       # %bb.0:
3047; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
3048; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3049; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3050; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3051; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
3052; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3053; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
3054; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3055; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
3056; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
3057; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
3058; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
3059; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
3060; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3061; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
3062; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
3063; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
3064; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
3065; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3066; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
3067; AVX512F-NEXT:    retq
3068;
3069; AVX512BW-LABEL: test_bitreverse_v8i64:
3070; AVX512BW:       # %bb.0:
3071; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
3072; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3073; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
3074; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3075; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
3076; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
3077; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
3078; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3079; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
3080; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
3081; AVX512BW-NEXT:    retq
3082;
3083; XOPAVX1-LABEL: test_bitreverse_v8i64:
3084; XOPAVX1:       # %bb.0:
3085; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3086; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
3087; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
3088; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
3089; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3090; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3091; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
3092; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
3093; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
3094; XOPAVX1-NEXT:    retq
3095;
3096; XOPAVX2-LABEL: test_bitreverse_v8i64:
3097; XOPAVX2:       # %bb.0:
3098; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
3099; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
3100; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
3101; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
3102; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
3103; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
3104; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
3105; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
3106; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
3107; XOPAVX2-NEXT:    retq
3108;
3109; GFNISSE-LABEL: test_bitreverse_v8i64:
3110; GFNISSE:       # %bb.0:
3111; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3112; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
3113; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
3114; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
3115; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
3116; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
3117; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
3118; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
3119; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
3120; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
3121; GFNISSE-NEXT:    retq
3122;
3123; GFNIAVX-LABEL: test_bitreverse_v8i64:
3124; GFNIAVX:       # %bb.0:
3125; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
3126; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3127; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
3128; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
3129; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
3130; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
3131; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
3132; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3133; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
3134; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
3135; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
3136; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
3137; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
3138; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
3139; GFNIAVX-NEXT:    retq
3140;
3141; GFNIAVX2-LABEL: test_bitreverse_v8i64:
3142; GFNIAVX2:       # %bb.0:
3143; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3144; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3145; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
3146; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
3147; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3148; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
3149; GFNIAVX2-NEXT:    retq
3150;
3151; GFNIAVX512F-LABEL: test_bitreverse_v8i64:
3152; GFNIAVX512F:       # %bb.0:
3153; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
3154; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3155; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3156; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
3157; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
3158; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3159; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
3160; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3161; GFNIAVX512F-NEXT:    retq
3162;
3163; GFNIAVX512BW-LABEL: test_bitreverse_v8i64:
3164; GFNIAVX512BW:       # %bb.0:
3165; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
3166; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
3167; GFNIAVX512BW-NEXT:    retq
3168  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
3169  ret <8 x i64> %b
3170}
3171
3172;
3173; Constant Folding
3174;
3175
3176define i32 @fold_bitreverse_i32() nounwind {
3177; ALL-LABEL: fold_bitreverse_i32:
3178; ALL:       # %bb.0:
3179; ALL-NEXT:    movl $16711935, %eax # imm = 0xFF00FF
3180; ALL-NEXT:    retq
3181  %b = call i32 @llvm.bitreverse.i32(i32 4278255360)
3182  ret i32 %b
3183}
3184
3185define <16 x i8> @fold_bitreverse_v16i8() nounwind {
3186; SSE-LABEL: fold_bitreverse_v16i8:
3187; SSE:       # %bb.0:
3188; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3189; SSE-NEXT:    retq
3190;
3191; AVX-LABEL: fold_bitreverse_v16i8:
3192; AVX:       # %bb.0:
3193; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3194; AVX-NEXT:    retq
3195;
3196; XOP-LABEL: fold_bitreverse_v16i8:
3197; XOP:       # %bb.0:
3198; XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3199; XOP-NEXT:    retq
3200;
3201; GFNISSE-LABEL: fold_bitreverse_v16i8:
3202; GFNISSE:       # %bb.0:
3203; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3204; GFNISSE-NEXT:    retq
3205;
3206; GFNIAVX-LABEL: fold_bitreverse_v16i8:
3207; GFNIAVX:       # %bb.0:
3208; GFNIAVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3209; GFNIAVX-NEXT:    retq
3210;
3211; GFNIAVX2-LABEL: fold_bitreverse_v16i8:
3212; GFNIAVX2:       # %bb.0:
3213; GFNIAVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3214; GFNIAVX2-NEXT:    retq
3215;
3216; GFNIAVX512F-LABEL: fold_bitreverse_v16i8:
3217; GFNIAVX512F:       # %bb.0:
3218; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3219; GFNIAVX512F-NEXT:    retq
3220;
3221; GFNIAVX512BW-LABEL: fold_bitreverse_v16i8:
3222; GFNIAVX512BW:       # %bb.0:
3223; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3224; GFNIAVX512BW-NEXT:    retq
3225  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
3226  ret <16 x i8> %b
3227}
3228
3229define <16 x i16> @fold_bitreverse_v16i16() nounwind {
3230; SSE-LABEL: fold_bitreverse_v16i16:
3231; SSE:       # %bb.0:
3232; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
3233; SSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
3234; SSE-NEXT:    retq
3235;
3236; AVX-LABEL: fold_bitreverse_v16i16:
3237; AVX:       # %bb.0:
3238; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
3239; AVX-NEXT:    retq
3240;
3241; XOP-LABEL: fold_bitreverse_v16i16:
3242; XOP:       # %bb.0:
3243; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
3244; XOP-NEXT:    retq
3245;
3246; GFNISSE-LABEL: fold_bitreverse_v16i16:
3247; GFNISSE:       # %bb.0:
3248; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
3249; GFNISSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
3250; GFNISSE-NEXT:    retq
3251;
3252; GFNIAVX-LABEL: fold_bitreverse_v16i16:
3253; GFNIAVX:       # %bb.0:
3254; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
3255; GFNIAVX-NEXT:    retq
3256;
3257; GFNIAVX2-LABEL: fold_bitreverse_v16i16:
3258; GFNIAVX2:       # %bb.0:
3259; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
3260; GFNIAVX2-NEXT:    retq
3261;
3262; GFNIAVX512F-LABEL: fold_bitreverse_v16i16:
3263; GFNIAVX512F:       # %bb.0:
3264; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
3265; GFNIAVX512F-NEXT:    retq
3266;
3267; GFNIAVX512BW-LABEL: fold_bitreverse_v16i16:
3268; GFNIAVX512BW:       # %bb.0:
3269; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
3270; GFNIAVX512BW-NEXT:    retq
3271  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
3272  ret <16 x i16> %b
3273}
3274
3275define <16 x i32> @fold_bitreverse_v16i32() nounwind {
3276; SSE-LABEL: fold_bitreverse_v16i32:
3277; SSE:       # %bb.0:
3278; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
3279; SSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
3280; SSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
3281; SSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
3282; SSE-NEXT:    retq
3283;
3284; AVX1-LABEL: fold_bitreverse_v16i32:
3285; AVX1:       # %bb.0:
3286; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3287; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3288; AVX1-NEXT:    retq
3289;
3290; AVX2-LABEL: fold_bitreverse_v16i32:
3291; AVX2:       # %bb.0:
3292; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3293; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3294; AVX2-NEXT:    retq
3295;
3296; AVX512-LABEL: fold_bitreverse_v16i32:
3297; AVX512:       # %bb.0:
3298; AVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3299; AVX512-NEXT:    retq
3300;
3301; XOP-LABEL: fold_bitreverse_v16i32:
3302; XOP:       # %bb.0:
3303; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3304; XOP-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3305; XOP-NEXT:    retq
3306;
3307; GFNISSE-LABEL: fold_bitreverse_v16i32:
3308; GFNISSE:       # %bb.0:
3309; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
3310; GFNISSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
3311; GFNISSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
3312; GFNISSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
3313; GFNISSE-NEXT:    retq
3314;
3315; GFNIAVX-LABEL: fold_bitreverse_v16i32:
3316; GFNIAVX:       # %bb.0:
3317; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3318; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3319; GFNIAVX-NEXT:    retq
3320;
3321; GFNIAVX2-LABEL: fold_bitreverse_v16i32:
3322; GFNIAVX2:       # %bb.0:
3323; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3324; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3325; GFNIAVX2-NEXT:    retq
3326;
3327; GFNIAVX512F-LABEL: fold_bitreverse_v16i32:
3328; GFNIAVX512F:       # %bb.0:
3329; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3330; GFNIAVX512F-NEXT:    retq
3331;
3332; GFNIAVX512BW-LABEL: fold_bitreverse_v16i32:
3333; GFNIAVX512BW:       # %bb.0:
3334; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3335; GFNIAVX512BW-NEXT:    retq
3336  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
3337  ret <16 x i32> %b
3338}
3339
3340declare i8 @llvm.bitreverse.i8(i8) readnone
3341declare i16 @llvm.bitreverse.i16(i16) readnone
3342declare i32 @llvm.bitreverse.i32(i32) readnone
3343declare i64 @llvm.bitreverse.i64(i64) readnone
3344
3345declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
3346declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
3347declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
3348declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone
3349
3350declare <32 x i8>  @llvm.bitreverse.v32i8(<32 x i8>) readnone
3351declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
3352declare <8 x i32>  @llvm.bitreverse.v8i32(<8 x i32>) readnone
3353declare <4 x i64>  @llvm.bitreverse.v4i64(<4 x i64>) readnone
3354
3355declare <64 x i8>  @llvm.bitreverse.v64i8(<64 x i8>) readnone
3356declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
3357declare <16 x i32>  @llvm.bitreverse.v16i32(<16 x i32>) readnone
3358declare <8 x i64>  @llvm.bitreverse.v8i64(<8 x i64>) readnone
3359