1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512F
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512BW
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNISSE
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX
12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX2
13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512F
14; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512BW
15
16; Make sure we don't crash with avx512bw and xop
17; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw
18
; Scalar i8: non-XOP targets (including GFNI, which has no scalar form) lower
; through a rolb $4 nibble rotate followed by 2-bit and 1-bit mask/shift swaps;
; XOP lowers through a vpperm byte-permute on an XMM register.
define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    rolb $4, %dil
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andb $51, %al
; SSE-NEXT:    shlb $2, %al
; SSE-NEXT:    andb $-52, %dil
; SSE-NEXT:    shrb $2, %dil
; SSE-NEXT:    orb %al, %dil
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andb $85, %al
; SSE-NEXT:    addb %al, %al
; SSE-NEXT:    andb $-86, %dil
; SSE-NEXT:    shrb %dil
; SSE-NEXT:    addl %edi, %eax
; SSE-NEXT:    # kill: def $al killed $al killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    rolb $4, %dil
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andb $51, %al
; AVX-NEXT:    shlb $2, %al
; AVX-NEXT:    andb $-52, %dil
; AVX-NEXT:    shrb $2, %dil
; AVX-NEXT:    orb %al, %dil
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andb $85, %al
; AVX-NEXT:    addb %al, %al
; AVX-NEXT:    andb $-86, %dil
; AVX-NEXT:    shrb %dil
; AVX-NEXT:    addl %edi, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    # kill: def $al killed $al killed $eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNISSE-NEXT:    rolb $4, %dil
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andb $51, %al
; GFNISSE-NEXT:    shlb $2, %al
; GFNISSE-NEXT:    andb $-52, %dil
; GFNISSE-NEXT:    shrb $2, %dil
; GFNISSE-NEXT:    orb %al, %dil
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andb $85, %al
; GFNISSE-NEXT:    addb %al, %al
; GFNISSE-NEXT:    andb $-86, %dil
; GFNISSE-NEXT:    shrb %dil
; GFNISSE-NEXT:    addl %edi, %eax
; GFNISSE-NEXT:    # kill: def $al killed $al killed $eax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX-NEXT:    rolb $4, %dil
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andb $51, %al
; GFNIAVX-NEXT:    shlb $2, %al
; GFNIAVX-NEXT:    andb $-52, %dil
; GFNIAVX-NEXT:    shrb $2, %dil
; GFNIAVX-NEXT:    orb %al, %dil
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andb $85, %al
; GFNIAVX-NEXT:    addb %al, %al
; GFNIAVX-NEXT:    andb $-86, %dil
; GFNIAVX-NEXT:    shrb %dil
; GFNIAVX-NEXT:    addl %edi, %eax
; GFNIAVX-NEXT:    # kill: def $al killed $al killed $eax
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_i8:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX2-NEXT:    rolb $4, %dil
; GFNIAVX2-NEXT:    movl %edi, %eax
; GFNIAVX2-NEXT:    andb $51, %al
; GFNIAVX2-NEXT:    shlb $2, %al
; GFNIAVX2-NEXT:    andb $-52, %dil
; GFNIAVX2-NEXT:    shrb $2, %dil
; GFNIAVX2-NEXT:    orb %al, %dil
; GFNIAVX2-NEXT:    movl %edi, %eax
; GFNIAVX2-NEXT:    andb $85, %al
; GFNIAVX2-NEXT:    addb %al, %al
; GFNIAVX2-NEXT:    andb $-86, %dil
; GFNIAVX2-NEXT:    shrb %dil
; GFNIAVX2-NEXT:    addl %edi, %eax
; GFNIAVX2-NEXT:    # kill: def $al killed $al killed $eax
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i8:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX512F-NEXT:    rolb $4, %dil
; GFNIAVX512F-NEXT:    movl %edi, %eax
; GFNIAVX512F-NEXT:    andb $51, %al
; GFNIAVX512F-NEXT:    shlb $2, %al
; GFNIAVX512F-NEXT:    andb $-52, %dil
; GFNIAVX512F-NEXT:    shrb $2, %dil
; GFNIAVX512F-NEXT:    orb %al, %dil
; GFNIAVX512F-NEXT:    movl %edi, %eax
; GFNIAVX512F-NEXT:    andb $85, %al
; GFNIAVX512F-NEXT:    addb %al, %al
; GFNIAVX512F-NEXT:    andb $-86, %dil
; GFNIAVX512F-NEXT:    shrb %dil
; GFNIAVX512F-NEXT:    addl %edi, %eax
; GFNIAVX512F-NEXT:    # kill: def $al killed $al killed $eax
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i8:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX512BW-NEXT:    rolb $4, %dil
; GFNIAVX512BW-NEXT:    movl %edi, %eax
; GFNIAVX512BW-NEXT:    andb $51, %al
; GFNIAVX512BW-NEXT:    shlb $2, %al
; GFNIAVX512BW-NEXT:    andb $-52, %dil
; GFNIAVX512BW-NEXT:    shrb $2, %dil
; GFNIAVX512BW-NEXT:    orb %al, %dil
; GFNIAVX512BW-NEXT:    movl %edi, %eax
; GFNIAVX512BW-NEXT:    andb $85, %al
; GFNIAVX512BW-NEXT:    addb %al, %al
; GFNIAVX512BW-NEXT:    andb $-86, %dil
; GFNIAVX512BW-NEXT:    shrb %dil
; GFNIAVX512BW-NEXT:    addl %edi, %eax
; GFNIAVX512BW-NEXT:    # kill: def $al killed $al killed $eax
; GFNIAVX512BW-NEXT:    retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}
163
; Scalar i16: non-XOP targets byte-swap with rolw $8, then perform 4/2/1-bit
; mask/shift swaps (the 2-bit and 1-bit shift+add steps fold into LEAs);
; XOP again lowers through vpperm.
define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    rolw $8, %di
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $3855, %eax # imm = 0xF0F
; SSE-NEXT:    shll $4, %eax
; SSE-NEXT:    andl $61680, %edi # imm = 0xF0F0
; SSE-NEXT:    shrl $4, %edi
; SSE-NEXT:    orl %eax, %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $13107, %eax # imm = 0x3333
; SSE-NEXT:    andl $52428, %edi # imm = 0xCCCC
; SSE-NEXT:    shrl $2, %edi
; SSE-NEXT:    leal (%rdi,%rax,4), %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $21845, %ecx # imm = 0x5555
; SSE-NEXT:    andl $43690, %eax # imm = 0xAAAA
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    leal (%rax,%rcx,2), %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    rolw $8, %di
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $3855, %eax # imm = 0xF0F
; AVX-NEXT:    shll $4, %eax
; AVX-NEXT:    andl $61680, %edi # imm = 0xF0F0
; AVX-NEXT:    shrl $4, %edi
; AVX-NEXT:    orl %eax, %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $13107, %eax # imm = 0x3333
; AVX-NEXT:    andl $52428, %edi # imm = 0xCCCC
; AVX-NEXT:    shrl $2, %edi
; AVX-NEXT:    leal (%rdi,%rax,4), %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $21845, %ecx # imm = 0x5555
; AVX-NEXT:    andl $43690, %eax # imm = 0xAAAA
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    leal (%rax,%rcx,2), %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    # kill: def $ax killed $ax killed $eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNISSE-NEXT:    rolw $8, %di
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andl $3855, %eax # imm = 0xF0F
; GFNISSE-NEXT:    shll $4, %eax
; GFNISSE-NEXT:    andl $61680, %edi # imm = 0xF0F0
; GFNISSE-NEXT:    shrl $4, %edi
; GFNISSE-NEXT:    orl %eax, %edi
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andl $13107, %eax # imm = 0x3333
; GFNISSE-NEXT:    andl $52428, %edi # imm = 0xCCCC
; GFNISSE-NEXT:    shrl $2, %edi
; GFNISSE-NEXT:    leal (%rdi,%rax,4), %eax
; GFNISSE-NEXT:    movl %eax, %ecx
; GFNISSE-NEXT:    andl $21845, %ecx # imm = 0x5555
; GFNISSE-NEXT:    andl $43690, %eax # imm = 0xAAAA
; GFNISSE-NEXT:    shrl %eax
; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
; GFNISSE-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX-NEXT:    rolw $8, %di
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andl $3855, %eax # imm = 0xF0F
; GFNIAVX-NEXT:    shll $4, %eax
; GFNIAVX-NEXT:    andl $61680, %edi # imm = 0xF0F0
; GFNIAVX-NEXT:    shrl $4, %edi
; GFNIAVX-NEXT:    orl %eax, %edi
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andl $13107, %eax # imm = 0x3333
; GFNIAVX-NEXT:    andl $52428, %edi # imm = 0xCCCC
; GFNIAVX-NEXT:    shrl $2, %edi
; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX-NEXT:    movl %eax, %ecx
; GFNIAVX-NEXT:    andl $21845, %ecx # imm = 0x5555
; GFNIAVX-NEXT:    andl $43690, %eax # imm = 0xAAAA
; GFNIAVX-NEXT:    shrl %eax
; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_i16:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX2-NEXT:    rolw $8, %di
; GFNIAVX2-NEXT:    movl %edi, %eax
; GFNIAVX2-NEXT:    andl $3855, %eax # imm = 0xF0F
; GFNIAVX2-NEXT:    shll $4, %eax
; GFNIAVX2-NEXT:    andl $61680, %edi # imm = 0xF0F0
; GFNIAVX2-NEXT:    shrl $4, %edi
; GFNIAVX2-NEXT:    orl %eax, %edi
; GFNIAVX2-NEXT:    movl %edi, %eax
; GFNIAVX2-NEXT:    andl $13107, %eax # imm = 0x3333
; GFNIAVX2-NEXT:    andl $52428, %edi # imm = 0xCCCC
; GFNIAVX2-NEXT:    shrl $2, %edi
; GFNIAVX2-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX2-NEXT:    movl %eax, %ecx
; GFNIAVX2-NEXT:    andl $21845, %ecx # imm = 0x5555
; GFNIAVX2-NEXT:    andl $43690, %eax # imm = 0xAAAA
; GFNIAVX2-NEXT:    shrl %eax
; GFNIAVX2-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i16:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX512F-NEXT:    rolw $8, %di
; GFNIAVX512F-NEXT:    movl %edi, %eax
; GFNIAVX512F-NEXT:    andl $3855, %eax # imm = 0xF0F
; GFNIAVX512F-NEXT:    shll $4, %eax
; GFNIAVX512F-NEXT:    andl $61680, %edi # imm = 0xF0F0
; GFNIAVX512F-NEXT:    shrl $4, %edi
; GFNIAVX512F-NEXT:    orl %eax, %edi
; GFNIAVX512F-NEXT:    movl %edi, %eax
; GFNIAVX512F-NEXT:    andl $13107, %eax # imm = 0x3333
; GFNIAVX512F-NEXT:    andl $52428, %edi # imm = 0xCCCC
; GFNIAVX512F-NEXT:    shrl $2, %edi
; GFNIAVX512F-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX512F-NEXT:    movl %eax, %ecx
; GFNIAVX512F-NEXT:    andl $21845, %ecx # imm = 0x5555
; GFNIAVX512F-NEXT:    andl $43690, %eax # imm = 0xAAAA
; GFNIAVX512F-NEXT:    shrl %eax
; GFNIAVX512F-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX512F-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i16:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX512BW-NEXT:    rolw $8, %di
; GFNIAVX512BW-NEXT:    movl %edi, %eax
; GFNIAVX512BW-NEXT:    andl $3855, %eax # imm = 0xF0F
; GFNIAVX512BW-NEXT:    shll $4, %eax
; GFNIAVX512BW-NEXT:    andl $61680, %edi # imm = 0xF0F0
; GFNIAVX512BW-NEXT:    shrl $4, %edi
; GFNIAVX512BW-NEXT:    orl %eax, %edi
; GFNIAVX512BW-NEXT:    movl %edi, %eax
; GFNIAVX512BW-NEXT:    andl $13107, %eax # imm = 0x3333
; GFNIAVX512BW-NEXT:    andl $52428, %edi # imm = 0xCCCC
; GFNIAVX512BW-NEXT:    shrl $2, %edi
; GFNIAVX512BW-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX512BW-NEXT:    movl %eax, %ecx
; GFNIAVX512BW-NEXT:    andl $21845, %ecx # imm = 0x5555
; GFNIAVX512BW-NEXT:    andl $43690, %eax # imm = 0xAAAA
; GFNIAVX512BW-NEXT:    shrl %eax
; GFNIAVX512BW-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNIAVX512BW-NEXT:    retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}
336
; Scalar i32: non-XOP targets byte-swap with bswapl, then perform 4/2/1-bit
; mask/shift swaps using 32-bit immediates (0x0F0F0F0F / 0x33333333 /
; 0x55555555 and complements); XOP lowers through vpperm.
define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    bswapl %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; SSE-NEXT:    shll $4, %eax
; SSE-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
; SSE-NEXT:    shrl $4, %edi
; SSE-NEXT:    orl %eax, %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
; SSE-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
; SSE-NEXT:    shrl $2, %edi
; SSE-NEXT:    leal (%rdi,%rax,4), %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; SSE-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    leal (%rax,%rcx,2), %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    bswapl %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; AVX-NEXT:    shll $4, %eax
; AVX-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
; AVX-NEXT:    shrl $4, %edi
; AVX-NEXT:    orl %eax, %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
; AVX-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
; AVX-NEXT:    shrl $2, %edi
; AVX-NEXT:    leal (%rdi,%rax,4), %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; AVX-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    leal (%rax,%rcx,2), %eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNISSE-NEXT:    bswapl %edi
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; GFNISSE-NEXT:    shll $4, %eax
; GFNISSE-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNISSE-NEXT:    shrl $4, %edi
; GFNISSE-NEXT:    orl %eax, %edi
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
; GFNISSE-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNISSE-NEXT:    shrl $2, %edi
; GFNISSE-NEXT:    leal (%rdi,%rax,4), %eax
; GFNISSE-NEXT:    movl %eax, %ecx
; GFNISSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; GFNISSE-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNISSE-NEXT:    shrl %eax
; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i32:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX-NEXT:    bswapl %edi
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX-NEXT:    shll $4, %eax
; GFNIAVX-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNIAVX-NEXT:    shrl $4, %edi
; GFNIAVX-NEXT:    orl %eax, %edi
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
; GFNIAVX-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNIAVX-NEXT:    shrl $2, %edi
; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX-NEXT:    movl %eax, %ecx
; GFNIAVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; GFNIAVX-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNIAVX-NEXT:    shrl %eax
; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_i32:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX2-NEXT:    bswapl %edi
; GFNIAVX2-NEXT:    movl %edi, %eax
; GFNIAVX2-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX2-NEXT:    shll $4, %eax
; GFNIAVX2-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNIAVX2-NEXT:    shrl $4, %edi
; GFNIAVX2-NEXT:    orl %eax, %edi
; GFNIAVX2-NEXT:    movl %edi, %eax
; GFNIAVX2-NEXT:    andl $858993459, %eax # imm = 0x33333333
; GFNIAVX2-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNIAVX2-NEXT:    shrl $2, %edi
; GFNIAVX2-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX2-NEXT:    movl %eax, %ecx
; GFNIAVX2-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; GFNIAVX2-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNIAVX2-NEXT:    shrl %eax
; GFNIAVX2-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i32:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX512F-NEXT:    bswapl %edi
; GFNIAVX512F-NEXT:    movl %edi, %eax
; GFNIAVX512F-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX512F-NEXT:    shll $4, %eax
; GFNIAVX512F-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNIAVX512F-NEXT:    shrl $4, %edi
; GFNIAVX512F-NEXT:    orl %eax, %edi
; GFNIAVX512F-NEXT:    movl %edi, %eax
; GFNIAVX512F-NEXT:    andl $858993459, %eax # imm = 0x33333333
; GFNIAVX512F-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNIAVX512F-NEXT:    shrl $2, %edi
; GFNIAVX512F-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX512F-NEXT:    movl %eax, %ecx
; GFNIAVX512F-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; GFNIAVX512F-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNIAVX512F-NEXT:    shrl %eax
; GFNIAVX512F-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i32:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX512BW-NEXT:    bswapl %edi
; GFNIAVX512BW-NEXT:    movl %edi, %eax
; GFNIAVX512BW-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX512BW-NEXT:    shll $4, %eax
; GFNIAVX512BW-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNIAVX512BW-NEXT:    shrl $4, %edi
; GFNIAVX512BW-NEXT:    orl %eax, %edi
; GFNIAVX512BW-NEXT:    movl %edi, %eax
; GFNIAVX512BW-NEXT:    andl $858993459, %eax # imm = 0x33333333
; GFNIAVX512BW-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNIAVX512BW-NEXT:    shrl $2, %edi
; GFNIAVX512BW-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX512BW-NEXT:    movl %eax, %ecx
; GFNIAVX512BW-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; GFNIAVX512BW-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNIAVX512BW-NEXT:    shrl %eax
; GFNIAVX512BW-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX512BW-NEXT:    retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}
501
; Scalar i64: non-XOP targets byte-swap with bswapq, then perform 4/2/1-bit
; mask/shift swaps; the 64-bit mask constants do not fit in immediates, so
; each is materialized with movabsq. XOP lowers through vpperm via vmovq.
define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    bswapq %rdi
; SSE-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; SSE-NEXT:    andq %rdi, %rax
; SSE-NEXT:    shlq $4, %rax
; SSE-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; SSE-NEXT:    andq %rdi, %rcx
; SSE-NEXT:    shrq $4, %rcx
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE-NEXT:    andq %rcx, %rax
; SSE-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; SSE-NEXT:    andq %rcx, %rdx
; SSE-NEXT:    shrq $2, %rdx
; SSE-NEXT:    leaq (%rdx,%rax,4), %rax
; SSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE-NEXT:    andq %rax, %rcx
; SSE-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; SSE-NEXT:    andq %rax, %rdx
; SSE-NEXT:    shrq %rdx
; SSE-NEXT:    leaq (%rdx,%rcx,2), %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    bswapq %rdi
; AVX-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; AVX-NEXT:    andq %rdi, %rax
; AVX-NEXT:    shlq $4, %rax
; AVX-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; AVX-NEXT:    andq %rdi, %rcx
; AVX-NEXT:    shrq $4, %rcx
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; AVX-NEXT:    andq %rcx, %rax
; AVX-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; AVX-NEXT:    andq %rcx, %rdx
; AVX-NEXT:    shrq $2, %rdx
; AVX-NEXT:    leaq (%rdx,%rax,4), %rax
; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; AVX-NEXT:    andq %rax, %rcx
; AVX-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; AVX-NEXT:    andq %rax, %rdx
; AVX-NEXT:    shrq %rdx
; AVX-NEXT:    leaq (%rdx,%rcx,2), %rax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovq %xmm0, %rax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i64:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    bswapq %rdi
; GFNISSE-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; GFNISSE-NEXT:    andq %rdi, %rax
; GFNISSE-NEXT:    shlq $4, %rax
; GFNISSE-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; GFNISSE-NEXT:    andq %rdi, %rcx
; GFNISSE-NEXT:    shrq $4, %rcx
; GFNISSE-NEXT:    orq %rax, %rcx
; GFNISSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNISSE-NEXT:    andq %rcx, %rax
; GFNISSE-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; GFNISSE-NEXT:    andq %rcx, %rdx
; GFNISSE-NEXT:    shrq $2, %rdx
; GFNISSE-NEXT:    leaq (%rdx,%rax,4), %rax
; GFNISSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNISSE-NEXT:    andq %rax, %rcx
; GFNISSE-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; GFNISSE-NEXT:    andq %rax, %rdx
; GFNISSE-NEXT:    shrq %rdx
; GFNISSE-NEXT:    leaq (%rdx,%rcx,2), %rax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i64:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    bswapq %rdi
; GFNIAVX-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX-NEXT:    andq %rdi, %rax
; GFNIAVX-NEXT:    shlq $4, %rax
; GFNIAVX-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; GFNIAVX-NEXT:    andq %rdi, %rcx
; GFNIAVX-NEXT:    shrq $4, %rcx
; GFNIAVX-NEXT:    orq %rax, %rcx
; GFNIAVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNIAVX-NEXT:    andq %rcx, %rax
; GFNIAVX-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; GFNIAVX-NEXT:    andq %rcx, %rdx
; GFNIAVX-NEXT:    shrq $2, %rdx
; GFNIAVX-NEXT:    leaq (%rdx,%rax,4), %rax
; GFNIAVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNIAVX-NEXT:    andq %rax, %rcx
; GFNIAVX-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; GFNIAVX-NEXT:    andq %rax, %rdx
; GFNIAVX-NEXT:    shrq %rdx
; GFNIAVX-NEXT:    leaq (%rdx,%rcx,2), %rax
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_i64:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    bswapq %rdi
; GFNIAVX2-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX2-NEXT:    andq %rdi, %rax
; GFNIAVX2-NEXT:    shlq $4, %rax
; GFNIAVX2-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; GFNIAVX2-NEXT:    andq %rdi, %rcx
; GFNIAVX2-NEXT:    shrq $4, %rcx
; GFNIAVX2-NEXT:    orq %rax, %rcx
; GFNIAVX2-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNIAVX2-NEXT:    andq %rcx, %rax
; GFNIAVX2-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; GFNIAVX2-NEXT:    andq %rcx, %rdx
; GFNIAVX2-NEXT:    shrq $2, %rdx
; GFNIAVX2-NEXT:    leaq (%rdx,%rax,4), %rax
; GFNIAVX2-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNIAVX2-NEXT:    andq %rax, %rcx
; GFNIAVX2-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; GFNIAVX2-NEXT:    andq %rax, %rdx
; GFNIAVX2-NEXT:    shrq %rdx
; GFNIAVX2-NEXT:    leaq (%rdx,%rcx,2), %rax
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i64:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    bswapq %rdi
; GFNIAVX512F-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX512F-NEXT:    andq %rdi, %rax
; GFNIAVX512F-NEXT:    shlq $4, %rax
; GFNIAVX512F-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; GFNIAVX512F-NEXT:    andq %rdi, %rcx
; GFNIAVX512F-NEXT:    shrq $4, %rcx
; GFNIAVX512F-NEXT:    orq %rax, %rcx
; GFNIAVX512F-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNIAVX512F-NEXT:    andq %rcx, %rax
; GFNIAVX512F-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; GFNIAVX512F-NEXT:    andq %rcx, %rdx
; GFNIAVX512F-NEXT:    shrq $2, %rdx
; GFNIAVX512F-NEXT:    leaq (%rdx,%rax,4), %rax
; GFNIAVX512F-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNIAVX512F-NEXT:    andq %rax, %rcx
; GFNIAVX512F-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; GFNIAVX512F-NEXT:    andq %rax, %rdx
; GFNIAVX512F-NEXT:    shrq %rdx
; GFNIAVX512F-NEXT:    leaq (%rdx,%rcx,2), %rax
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i64:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    bswapq %rdi
; GFNIAVX512BW-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX512BW-NEXT:    andq %rdi, %rax
; GFNIAVX512BW-NEXT:    shlq $4, %rax
; GFNIAVX512BW-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; GFNIAVX512BW-NEXT:    andq %rdi, %rcx
; GFNIAVX512BW-NEXT:    shrq $4, %rcx
; GFNIAVX512BW-NEXT:    orq %rax, %rcx
; GFNIAVX512BW-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNIAVX512BW-NEXT:    andq %rcx, %rax
; GFNIAVX512BW-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; GFNIAVX512BW-NEXT:    andq %rcx, %rdx
; GFNIAVX512BW-NEXT:    shrq $2, %rdx
; GFNIAVX512BW-NEXT:    leaq (%rdx,%rax,4), %rax
; GFNIAVX512BW-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNIAVX512BW-NEXT:    andq %rax, %rcx
; GFNIAVX512BW-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; GFNIAVX512BW-NEXT:    andq %rax, %rdx
; GFNIAVX512BW-NEXT:    shrq %rdx
; GFNIAVX512BW-NEXT:    leaq (%rdx,%rcx,2), %rax
; GFNIAVX512BW-NEXT:    retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}
680
; Vector v16i8: SSE2 falls back to 4/2/1-bit mask/shift swaps; SSSE3 and AVX
; split each byte into nibbles and use two pshufb table lookups; XOP uses a
; single vpperm; all GFNI targets use one gf2p8affineqb with a constant matrix.
define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v16i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    gf2p8affineqb $0, {{.*}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v16i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v16i8:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v16i8:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v16i8:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX512BW-NEXT:    retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
  ret <16 x i8> %b
}
764
; Per-lane bitreverse of <8 x i16>. Lowering first swaps the two bytes of
; each 16-bit lane (SSE2: psrlw/psllw $8 + por; others: a pshufb with the
; [1,0,3,2,...] byte shuffle), then applies the same per-byte bit-reversal
; pattern as the v16i8 case (shift/mask stages, nibble pshufb LUTs, vpperm,
; or (v)gf2p8affineqb depending on target features).
define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v8i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNISSE-NEXT:    gf2p8affineqb $0, {{.*}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v8i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v8i16:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v8i16:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v8i16:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX512BW-NEXT:    retq
  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  ret <8 x i16> %b
}
859
; Per-lane bitreverse of <4 x i32>. The 32-bit lanes are byte-reversed
; first: SSE2 (no pshufb) widens to words with punpck{l,h}bw, reverses with
; pshuflw/pshufhw, and re-narrows with packuswb; SSSE3 and later use a
; single pshufb with the [3,2,1,0,...] byte shuffle. The per-byte bit
; reversal then follows the same per-target pattern as the v16i8 case.
define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v4i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNISSE-NEXT:    gf2p8affineqb $0, {{.*}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v4i32:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v4i32:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v4i32:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v4i32:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX512BW-NEXT:    retq
  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
  ret <4 x i32> %b
}
959
; Per-lane bitreverse of <2 x i64>. Like the v4i32 case but the byte
; reversal spans 8 bytes per lane: the SSE2 sequence adds a pshufd lane
; swap on top of the unpack + pshuflw/pshufhw + packuswb dance, while
; SSSE3 and later use one pshufb with the [7,6,5,4,3,2,1,0,...] shuffle.
; Per-byte bit reversal then matches the per-target pattern of v16i8.
define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v2i64:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNISSE-NEXT:    gf2p8affineqb $0, {{.*}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v2i64:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v2i64:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v2i64:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v2i64:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX512BW-NEXT:    retq
  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
  ret <2 x i64> %b
}
1061
; Per-lane bitreverse of <32 x i8> (first 256-bit case). SSE targets see
; the vector split across xmm0/xmm1 and repeat the 128-bit pattern per
; half (SSE2 additionally uses pandn to reuse the nibble-mask register);
; AVX1, XOPAVX* and GFNIAVX extract/insert 128-bit halves around the xmm
; sequence; AVX2/AVX512 and GFNIAVX2/512* handle the full ymm at once,
; the GFNI forms broadcasting the bit-reversal matrix constant.
define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psllw $4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pandn %xmm3, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    psllw $2, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm5, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    paddb %xmm6, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    pand %xmm7, %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm6, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    psllw $2, %xmm3
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm4
; SSE2-NEXT:    paddb %xmm4, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v32i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm5, %xmm6
; SSSE3-NEXT:    pshufb %xmm2, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm6, %xmm3
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm5
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v32i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v32i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm2, %xmm1, %xmm1
; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm2, %xmm0, %xmm0
; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v32i8:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v32i8:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v32i8:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX512BW-NEXT:    retq
  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
  ret <32 x i8> %b
}
1234
; Per-lane bitreverse of <16 x i16> (256-bit). Combines the v8i16 shape
; (byte-swap each 16-bit lane, then per-byte bit reverse) with the v32i8
; splitting strategy: SSE targets process xmm0/xmm1 halves in sequence,
; AVX1/XOPAVX*/GFNIAVX extract/insert 128-bit halves, and AVX2/AVX512
; plus GFNIAVX2/512* operate on the whole ymm, the GFNI forms using a
; broadcast bit-reversal matrix after a ymm-wide vpshufb byte swap.
define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psllw $4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pandn %xmm3, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    psllw $2, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm5, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    pand %xmm4, %xmm7
; SSE2-NEXT:    paddb %xmm7, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    pand %xmm6, %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa %xmm2, %xmm7
; SSE2-NEXT:    psrlw $8, %xmm7
; SSE2-NEXT:    psllw $8, %xmm2
; SSE2-NEXT:    por %xmm7, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm7
; SSE2-NEXT:    psllw $4, %xmm7
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm7, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    psllw $2, %xmm3
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm4
; SSE2-NEXT:    paddb %xmm4, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm5, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm6, %xmm7
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm7, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm5, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v16i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v16i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v16i16:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v16i16:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v16i16:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX512BW-NEXT:    retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
  ret <16 x i16> %b
}
1432
1433define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
1434; SSE2-LABEL: test_bitreverse_v8i32:
1435; SSE2:       # %bb.0:
1436; SSE2-NEXT:    movdqa %xmm1, %xmm2
1437; SSE2-NEXT:    pxor %xmm4, %xmm4
1438; SSE2-NEXT:    movdqa %xmm0, %xmm1
1439; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
1440; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1441; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1442; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1443; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1444; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1445; SSE2-NEXT:    packuswb %xmm1, %xmm0
1446; SSE2-NEXT:    movdqa %xmm0, %xmm3
1447; SSE2-NEXT:    psllw $4, %xmm3
1448; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1449; SSE2-NEXT:    movdqa %xmm1, %xmm5
1450; SSE2-NEXT:    pandn %xmm3, %xmm5
1451; SSE2-NEXT:    psrlw $4, %xmm0
1452; SSE2-NEXT:    pand %xmm1, %xmm0
1453; SSE2-NEXT:    por %xmm5, %xmm0
1454; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1455; SSE2-NEXT:    movdqa %xmm0, %xmm5
1456; SSE2-NEXT:    pand %xmm3, %xmm5
1457; SSE2-NEXT:    psllw $2, %xmm5
1458; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1459; SSE2-NEXT:    pand %xmm8, %xmm0
1460; SSE2-NEXT:    psrlw $2, %xmm0
1461; SSE2-NEXT:    por %xmm5, %xmm0
1462; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1463; SSE2-NEXT:    movdqa %xmm0, %xmm6
1464; SSE2-NEXT:    pand %xmm5, %xmm6
1465; SSE2-NEXT:    paddb %xmm6, %xmm6
1466; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1467; SSE2-NEXT:    pand %xmm7, %xmm0
1468; SSE2-NEXT:    psrlw $1, %xmm0
1469; SSE2-NEXT:    por %xmm6, %xmm0
1470; SSE2-NEXT:    movdqa %xmm2, %xmm6
1471; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
1472; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1473; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1474; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1475; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1476; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1477; SSE2-NEXT:    packuswb %xmm6, %xmm2
1478; SSE2-NEXT:    movdqa %xmm2, %xmm4
1479; SSE2-NEXT:    psllw $4, %xmm4
1480; SSE2-NEXT:    psrlw $4, %xmm2
1481; SSE2-NEXT:    pand %xmm1, %xmm2
1482; SSE2-NEXT:    pandn %xmm4, %xmm1
1483; SSE2-NEXT:    por %xmm2, %xmm1
1484; SSE2-NEXT:    pand %xmm1, %xmm3
1485; SSE2-NEXT:    psllw $2, %xmm3
1486; SSE2-NEXT:    pand %xmm8, %xmm1
1487; SSE2-NEXT:    psrlw $2, %xmm1
1488; SSE2-NEXT:    por %xmm3, %xmm1
1489; SSE2-NEXT:    pand %xmm1, %xmm5
1490; SSE2-NEXT:    paddb %xmm5, %xmm5
1491; SSE2-NEXT:    pand %xmm7, %xmm1
1492; SSE2-NEXT:    psrlw $1, %xmm1
1493; SSE2-NEXT:    por %xmm5, %xmm1
1494; SSE2-NEXT:    retq
1495;
1496; SSSE3-LABEL: test_bitreverse_v8i32:
1497; SSSE3:       # %bb.0:
1498; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1499; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1500; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1501; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1502; SSSE3-NEXT:    pand %xmm5, %xmm2
1503; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1504; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1505; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1506; SSSE3-NEXT:    psrlw $4, %xmm0
1507; SSSE3-NEXT:    pand %xmm5, %xmm0
1508; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1509; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1510; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1511; SSSE3-NEXT:    por %xmm7, %xmm3
1512; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1513; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1514; SSSE3-NEXT:    pand %xmm5, %xmm0
1515; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1516; SSSE3-NEXT:    psrlw $4, %xmm1
1517; SSSE3-NEXT:    pand %xmm5, %xmm1
1518; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1519; SSSE3-NEXT:    por %xmm6, %xmm2
1520; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1521; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1522; SSSE3-NEXT:    retq
1523;
1524; AVX1-LABEL: test_bitreverse_v8i32:
1525; AVX1:       # %bb.0:
1526; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1527; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1528; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1529; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1530; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1531; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1532; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1533; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1534; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1535; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1536; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1537; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1538; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1539; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1540; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1541; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1542; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1543; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1544; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1545; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1546; AVX1-NEXT:    retq
1547;
1548; AVX2-LABEL: test_bitreverse_v8i32:
1549; AVX2:       # %bb.0:
1550; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1551; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1552; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1553; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1554; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1555; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1556; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1557; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1558; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1559; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1560; AVX2-NEXT:    retq
1561;
1562; AVX512-LABEL: test_bitreverse_v8i32:
1563; AVX512:       # %bb.0:
1564; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1565; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1566; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1567; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1568; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1569; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1570; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1571; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1572; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1573; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1574; AVX512-NEXT:    retq
1575;
1576; XOPAVX1-LABEL: test_bitreverse_v8i32:
1577; XOPAVX1:       # %bb.0:
1578; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1579; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1580; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1581; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1582; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1583; XOPAVX1-NEXT:    retq
1584;
1585; XOPAVX2-LABEL: test_bitreverse_v8i32:
1586; XOPAVX2:       # %bb.0:
1587; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1588; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1589; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1590; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1591; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1592; XOPAVX2-NEXT:    retq
1593;
1594; GFNISSE-LABEL: test_bitreverse_v8i32:
1595; GFNISSE:       # %bb.0:
1596; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1597; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
1598; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1599; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
1600; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
1601; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
1602; GFNISSE-NEXT:    retq
1603;
1604; GFNIAVX-LABEL: test_bitreverse_v8i32:
1605; GFNIAVX:       # %bb.0:
1606; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
1607; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1608; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1609; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1610; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
1611; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1612; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
1613; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1614; GFNIAVX-NEXT:    retq
1615;
1616; GFNIAVX2-LABEL: test_bitreverse_v8i32:
1617; GFNIAVX2:       # %bb.0:
1618; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1619; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1620; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1621; GFNIAVX2-NEXT:    retq
1622;
1623; GFNIAVX512F-LABEL: test_bitreverse_v8i32:
1624; GFNIAVX512F:       # %bb.0:
1625; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1626; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1627; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1628; GFNIAVX512F-NEXT:    retq
1629;
1630; GFNIAVX512BW-LABEL: test_bitreverse_v8i32:
1631; GFNIAVX512BW:       # %bb.0:
1632; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1633; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1634; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1635; GFNIAVX512BW-NEXT:    retq
1636  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
1637  ret <8 x i32> %b
1638}
1639
1640define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; Same structure as the v8i32 case above, but the initial byte swap covers a
; full 64-bit lane: the pshufb mask is 7,6,5,4,3,2,1,0,... and the SSE2 path
; needs an extra pshufd to exchange the two 32-bit halves before the
; pshuflw/pshufhw word swaps. Per-byte bit reversal is then done with nibble
; lookup tables, SSE2 shift/mask swaps (mask pairs 15, 51/204, 85/170), a
; single XOP vpperm, or one gf2p8affineqb on GFNI targets.
1641; SSE2-LABEL: test_bitreverse_v4i64:
1642; SSE2:       # %bb.0:
1643; SSE2-NEXT:    movdqa %xmm1, %xmm2
1644; SSE2-NEXT:    pxor %xmm4, %xmm4
1645; SSE2-NEXT:    movdqa %xmm0, %xmm1
1646; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
1647; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1648; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1649; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1650; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1651; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1652; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1653; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1654; SSE2-NEXT:    packuswb %xmm1, %xmm0
1655; SSE2-NEXT:    movdqa %xmm0, %xmm3
1656; SSE2-NEXT:    psllw $4, %xmm3
1657; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1658; SSE2-NEXT:    movdqa %xmm1, %xmm5
1659; SSE2-NEXT:    pandn %xmm3, %xmm5
1660; SSE2-NEXT:    psrlw $4, %xmm0
1661; SSE2-NEXT:    pand %xmm1, %xmm0
1662; SSE2-NEXT:    por %xmm5, %xmm0
1663; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1664; SSE2-NEXT:    movdqa %xmm0, %xmm5
1665; SSE2-NEXT:    pand %xmm3, %xmm5
1666; SSE2-NEXT:    psllw $2, %xmm5
1667; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1668; SSE2-NEXT:    pand %xmm8, %xmm0
1669; SSE2-NEXT:    psrlw $2, %xmm0
1670; SSE2-NEXT:    por %xmm5, %xmm0
1671; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1672; SSE2-NEXT:    movdqa %xmm0, %xmm6
1673; SSE2-NEXT:    pand %xmm5, %xmm6
1674; SSE2-NEXT:    paddb %xmm6, %xmm6
1675; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1676; SSE2-NEXT:    pand %xmm7, %xmm0
1677; SSE2-NEXT:    psrlw $1, %xmm0
1678; SSE2-NEXT:    por %xmm6, %xmm0
1679; SSE2-NEXT:    movdqa %xmm2, %xmm6
1680; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
1681; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
1682; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1683; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1684; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1685; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
1686; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1687; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1688; SSE2-NEXT:    packuswb %xmm6, %xmm2
1689; SSE2-NEXT:    movdqa %xmm2, %xmm4
1690; SSE2-NEXT:    psllw $4, %xmm4
1691; SSE2-NEXT:    psrlw $4, %xmm2
1692; SSE2-NEXT:    pand %xmm1, %xmm2
1693; SSE2-NEXT:    pandn %xmm4, %xmm1
1694; SSE2-NEXT:    por %xmm2, %xmm1
1695; SSE2-NEXT:    pand %xmm1, %xmm3
1696; SSE2-NEXT:    psllw $2, %xmm3
1697; SSE2-NEXT:    pand %xmm8, %xmm1
1698; SSE2-NEXT:    psrlw $2, %xmm1
1699; SSE2-NEXT:    por %xmm3, %xmm1
1700; SSE2-NEXT:    pand %xmm1, %xmm5
1701; SSE2-NEXT:    paddb %xmm5, %xmm5
1702; SSE2-NEXT:    pand %xmm7, %xmm1
1703; SSE2-NEXT:    psrlw $1, %xmm1
1704; SSE2-NEXT:    por %xmm5, %xmm1
1705; SSE2-NEXT:    retq
1706;
1707; SSSE3-LABEL: test_bitreverse_v4i64:
1708; SSSE3:       # %bb.0:
1709; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1710; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1711; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1712; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1713; SSSE3-NEXT:    pand %xmm5, %xmm2
1714; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1715; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1716; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1717; SSSE3-NEXT:    psrlw $4, %xmm0
1718; SSSE3-NEXT:    pand %xmm5, %xmm0
1719; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1720; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1721; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1722; SSSE3-NEXT:    por %xmm7, %xmm3
1723; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1724; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1725; SSSE3-NEXT:    pand %xmm5, %xmm0
1726; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1727; SSSE3-NEXT:    psrlw $4, %xmm1
1728; SSSE3-NEXT:    pand %xmm5, %xmm1
1729; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1730; SSSE3-NEXT:    por %xmm6, %xmm2
1731; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1732; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1733; SSSE3-NEXT:    retq
1734;
1735; AVX1-LABEL: test_bitreverse_v4i64:
1736; AVX1:       # %bb.0:
1737; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1738; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1739; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1740; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1741; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1742; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1743; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1744; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1745; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1746; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1747; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1748; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1749; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1750; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1751; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1752; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1753; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1754; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1755; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1756; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1757; AVX1-NEXT:    retq
1758;
1759; AVX2-LABEL: test_bitreverse_v4i64:
1760; AVX2:       # %bb.0:
1761; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1762; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1763; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1764; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1765; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1766; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1767; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1768; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1769; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1770; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1771; AVX2-NEXT:    retq
1772;
1773; AVX512-LABEL: test_bitreverse_v4i64:
1774; AVX512:       # %bb.0:
1775; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1776; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1777; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1778; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1779; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1780; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1781; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1782; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1783; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1784; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1785; AVX512-NEXT:    retq
1786;
1787; XOPAVX1-LABEL: test_bitreverse_v4i64:
1788; XOPAVX1:       # %bb.0:
1789; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1790; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1791; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1792; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1793; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1794; XOPAVX1-NEXT:    retq
1795;
1796; XOPAVX2-LABEL: test_bitreverse_v4i64:
1797; XOPAVX2:       # %bb.0:
1798; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1799; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1800; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1801; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1802; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1803; XOPAVX2-NEXT:    retq
1804;
1805; GFNISSE-LABEL: test_bitreverse_v4i64:
1806; GFNISSE:       # %bb.0:
1807; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1808; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
1809; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1810; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
1811; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
1812; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
1813; GFNISSE-NEXT:    retq
1814;
1815; GFNIAVX-LABEL: test_bitreverse_v4i64:
1816; GFNIAVX:       # %bb.0:
1817; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
1818; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1819; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1820; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1821; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
1822; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1823; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
1824; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1825; GFNIAVX-NEXT:    retq
1826;
1827; GFNIAVX2-LABEL: test_bitreverse_v4i64:
1828; GFNIAVX2:       # %bb.0:
1829; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1830; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1831; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1832; GFNIAVX2-NEXT:    retq
1833;
1834; GFNIAVX512F-LABEL: test_bitreverse_v4i64:
1835; GFNIAVX512F:       # %bb.0:
1836; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1837; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1838; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1839; GFNIAVX512F-NEXT:    retq
1840;
1841; GFNIAVX512BW-LABEL: test_bitreverse_v4i64:
1842; GFNIAVX512BW:       # %bb.0:
1843; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1844; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1845; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1846; GFNIAVX512BW-NEXT:    retq
; The IR under test; everything above is autogenerated expected codegen.
1847  %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
1848  ret <4 x i64> %b
1849}
1850
1851define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
1852; SSE2-LABEL: test_bitreverse_v64i8:
1853; SSE2:       # %bb.0:
1854; SSE2-NEXT:    movdqa %xmm3, %xmm10
1855; SSE2-NEXT:    movdqa %xmm0, %xmm5
1856; SSE2-NEXT:    psllw $4, %xmm5
1857; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1858; SSE2-NEXT:    movdqa %xmm3, %xmm6
1859; SSE2-NEXT:    pandn %xmm5, %xmm6
1860; SSE2-NEXT:    psrlw $4, %xmm0
1861; SSE2-NEXT:    pand %xmm3, %xmm0
1862; SSE2-NEXT:    por %xmm6, %xmm0
1863; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1864; SSE2-NEXT:    movdqa %xmm0, %xmm6
1865; SSE2-NEXT:    pand %xmm5, %xmm6
1866; SSE2-NEXT:    psllw $2, %xmm6
1867; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1868; SSE2-NEXT:    pand %xmm8, %xmm0
1869; SSE2-NEXT:    psrlw $2, %xmm0
1870; SSE2-NEXT:    por %xmm6, %xmm0
1871; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1872; SSE2-NEXT:    movdqa %xmm0, %xmm7
1873; SSE2-NEXT:    pand %xmm6, %xmm7
1874; SSE2-NEXT:    paddb %xmm7, %xmm7
1875; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1876; SSE2-NEXT:    pand %xmm9, %xmm0
1877; SSE2-NEXT:    psrlw $1, %xmm0
1878; SSE2-NEXT:    por %xmm7, %xmm0
1879; SSE2-NEXT:    movdqa %xmm1, %xmm7
1880; SSE2-NEXT:    psllw $4, %xmm7
1881; SSE2-NEXT:    movdqa %xmm3, %xmm4
1882; SSE2-NEXT:    pandn %xmm7, %xmm4
1883; SSE2-NEXT:    psrlw $4, %xmm1
1884; SSE2-NEXT:    pand %xmm3, %xmm1
1885; SSE2-NEXT:    por %xmm4, %xmm1
1886; SSE2-NEXT:    movdqa %xmm1, %xmm4
1887; SSE2-NEXT:    pand %xmm5, %xmm4
1888; SSE2-NEXT:    psllw $2, %xmm4
1889; SSE2-NEXT:    pand %xmm8, %xmm1
1890; SSE2-NEXT:    psrlw $2, %xmm1
1891; SSE2-NEXT:    por %xmm4, %xmm1
1892; SSE2-NEXT:    movdqa %xmm1, %xmm4
1893; SSE2-NEXT:    pand %xmm6, %xmm4
1894; SSE2-NEXT:    paddb %xmm4, %xmm4
1895; SSE2-NEXT:    pand %xmm9, %xmm1
1896; SSE2-NEXT:    psrlw $1, %xmm1
1897; SSE2-NEXT:    por %xmm4, %xmm1
1898; SSE2-NEXT:    movdqa %xmm2, %xmm4
1899; SSE2-NEXT:    psllw $4, %xmm4
1900; SSE2-NEXT:    movdqa %xmm3, %xmm7
1901; SSE2-NEXT:    pandn %xmm4, %xmm7
1902; SSE2-NEXT:    psrlw $4, %xmm2
1903; SSE2-NEXT:    pand %xmm3, %xmm2
1904; SSE2-NEXT:    por %xmm7, %xmm2
1905; SSE2-NEXT:    movdqa %xmm2, %xmm4
1906; SSE2-NEXT:    pand %xmm5, %xmm4
1907; SSE2-NEXT:    psllw $2, %xmm4
1908; SSE2-NEXT:    pand %xmm8, %xmm2
1909; SSE2-NEXT:    psrlw $2, %xmm2
1910; SSE2-NEXT:    por %xmm4, %xmm2
1911; SSE2-NEXT:    movdqa %xmm2, %xmm4
1912; SSE2-NEXT:    pand %xmm6, %xmm4
1913; SSE2-NEXT:    paddb %xmm4, %xmm4
1914; SSE2-NEXT:    pand %xmm9, %xmm2
1915; SSE2-NEXT:    psrlw $1, %xmm2
1916; SSE2-NEXT:    por %xmm4, %xmm2
1917; SSE2-NEXT:    movdqa %xmm10, %xmm4
1918; SSE2-NEXT:    psllw $4, %xmm4
1919; SSE2-NEXT:    psrlw $4, %xmm10
1920; SSE2-NEXT:    pand %xmm3, %xmm10
1921; SSE2-NEXT:    pandn %xmm4, %xmm3
1922; SSE2-NEXT:    por %xmm10, %xmm3
1923; SSE2-NEXT:    pand %xmm3, %xmm5
1924; SSE2-NEXT:    psllw $2, %xmm5
1925; SSE2-NEXT:    pand %xmm8, %xmm3
1926; SSE2-NEXT:    psrlw $2, %xmm3
1927; SSE2-NEXT:    por %xmm5, %xmm3
1928; SSE2-NEXT:    pand %xmm3, %xmm6
1929; SSE2-NEXT:    paddb %xmm6, %xmm6
1930; SSE2-NEXT:    pand %xmm9, %xmm3
1931; SSE2-NEXT:    psrlw $1, %xmm3
1932; SSE2-NEXT:    por %xmm6, %xmm3
1933; SSE2-NEXT:    retq
1934;
1935; SSSE3-LABEL: test_bitreverse_v64i8:
1936; SSSE3:       # %bb.0:
1937; SSSE3-NEXT:    movdqa %xmm0, %xmm5
1938; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1939; SSSE3-NEXT:    pand %xmm8, %xmm0
1940; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1941; SSSE3-NEXT:    movdqa %xmm9, %xmm6
1942; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1943; SSSE3-NEXT:    psrlw $4, %xmm5
1944; SSSE3-NEXT:    pand %xmm8, %xmm5
1945; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1946; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1947; SSSE3-NEXT:    pshufb %xmm5, %xmm0
1948; SSSE3-NEXT:    por %xmm6, %xmm0
1949; SSSE3-NEXT:    movdqa %xmm1, %xmm5
1950; SSSE3-NEXT:    pand %xmm8, %xmm5
1951; SSSE3-NEXT:    movdqa %xmm9, %xmm6
1952; SSSE3-NEXT:    pshufb %xmm5, %xmm6
1953; SSSE3-NEXT:    psrlw $4, %xmm1
1954; SSSE3-NEXT:    pand %xmm8, %xmm1
1955; SSSE3-NEXT:    movdqa %xmm4, %xmm5
1956; SSSE3-NEXT:    pshufb %xmm1, %xmm5
1957; SSSE3-NEXT:    por %xmm6, %xmm5
1958; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1959; SSSE3-NEXT:    pand %xmm8, %xmm1
1960; SSSE3-NEXT:    movdqa %xmm9, %xmm7
1961; SSSE3-NEXT:    pshufb %xmm1, %xmm7
1962; SSSE3-NEXT:    psrlw $4, %xmm2
1963; SSSE3-NEXT:    pand %xmm8, %xmm2
1964; SSSE3-NEXT:    movdqa %xmm4, %xmm6
1965; SSSE3-NEXT:    pshufb %xmm2, %xmm6
1966; SSSE3-NEXT:    por %xmm7, %xmm6
1967; SSSE3-NEXT:    movdqa %xmm3, %xmm1
1968; SSSE3-NEXT:    pand %xmm8, %xmm1
1969; SSSE3-NEXT:    pshufb %xmm1, %xmm9
1970; SSSE3-NEXT:    psrlw $4, %xmm3
1971; SSSE3-NEXT:    pand %xmm8, %xmm3
1972; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1973; SSSE3-NEXT:    por %xmm9, %xmm4
1974; SSSE3-NEXT:    movdqa %xmm5, %xmm1
1975; SSSE3-NEXT:    movdqa %xmm6, %xmm2
1976; SSSE3-NEXT:    movdqa %xmm4, %xmm3
1977; SSSE3-NEXT:    retq
1978;
1979; AVX1-LABEL: test_bitreverse_v64i8:
1980; AVX1:       # %bb.0:
1981; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1982; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1983; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
1984; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1985; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1986; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1987; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1988; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1989; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
1990; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
1991; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm4
1992; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1993; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1994; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1995; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1996; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
1997; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1998; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1999; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
2000; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
2001; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2002; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
2003; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
2004; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
2005; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
2006; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
2007; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
2008; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
2009; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
2010; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
2011; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2012; AVX1-NEXT:    retq
2013;
2014; AVX2-LABEL: test_bitreverse_v64i8:
2015; AVX2:       # %bb.0:
2016; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2017; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
2018; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2019; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
2020; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2021; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2022; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2023; AVX2-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
2024; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
2025; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
2026; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
2027; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
2028; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
2029; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
2030; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
2031; AVX2-NEXT:    retq
2032;
2033; AVX512F-LABEL: test_bitreverse_v64i8:
2034; AVX512F:       # %bb.0:
2035; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2036; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2037; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
2038; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2039; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
2040; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm5
2041; AVX512F-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
2042; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
2043; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
2044; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
2045; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2046; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
2047; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
2048; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
2049; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
2050; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2051; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm0
2052; AVX512F-NEXT:    retq
2053;
2054; AVX512BW-LABEL: test_bitreverse_v64i8:
2055; AVX512BW:       # %bb.0:
2056; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2057; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2058; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2059; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2060; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2061; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2062; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2063; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2064; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2065; AVX512BW-NEXT:    retq
2066;
2067; XOPAVX1-LABEL: test_bitreverse_v64i8:
2068; XOPAVX1:       # %bb.0:
2069; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2070; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
2071; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2072; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2073; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2074; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2075; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2076; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2077; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2078; XOPAVX1-NEXT:    retq
2079;
2080; XOPAVX2-LABEL: test_bitreverse_v64i8:
2081; XOPAVX2:       # %bb.0:
2082; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2083; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
2084; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2085; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2086; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2087; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2088; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2089; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2090; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2091; XOPAVX2-NEXT:    retq
2092;
2093; GFNISSE-LABEL: test_bitreverse_v64i8:
2094; GFNISSE:       # %bb.0:
2095; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
2096; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
2097; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
2098; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
2099; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm3
2100; GFNISSE-NEXT:    retq
2101;
2102; GFNIAVX-LABEL: test_bitreverse_v64i8:
2103; GFNIAVX:       # %bb.0:
2104; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
2105; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
2106; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2
2107; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
2108; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2109; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
2110; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2
2111; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
2112; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2113; GFNIAVX-NEXT:    retq
2114;
2115; GFNIAVX2-LABEL: test_bitreverse_v64i8:
2116; GFNIAVX2:       # %bb.0:
2117; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2118; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
2119; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
2120; GFNIAVX2-NEXT:    retq
2121;
2122; GFNIAVX512F-LABEL: test_bitreverse_v64i8:
2123; GFNIAVX512F:       # %bb.0:
2124; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2125; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2126; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
2127; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
2128; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2129; GFNIAVX512F-NEXT:    retq
2130;
2131; GFNIAVX512BW-LABEL: test_bitreverse_v64i8:
2132; GFNIAVX512BW:       # %bb.0:
2133; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0
2134; GFNIAVX512BW-NEXT:    retq
2135  %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
2136  ret <64 x i8> %b
2137}
2138
; Bit-reverse each of the 32 i16 lanes. Every subtarget first swaps the two
; bytes of each i16 (pshufb / pshuflw+pshufhw / vpperm byte swap), then
; reverses the bits within each byte -- via shift+mask trees on SSE2, nibble
; lookup-table pshufb on SSSE3/AVX/AVX512, vpperm bit-reverse selectors on
; XOP, and a single gf2p8affineqb (matrix 0x8040201008040201) on GFNI
; targets. The assertion comments below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them rather than editing by
; hand.
define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psllw $4, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    pandn %xmm5, %xmm6
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    pand %xmm10, %xmm6
; SSE2-NEXT:    psllw $2, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    pand %xmm6, %xmm7
; SSE2-NEXT:    paddb %xmm7, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    pand %xmm9, %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm7
; SSE2-NEXT:    psrlw $8, %xmm7
; SSE2-NEXT:    psllw $8, %xmm1
; SSE2-NEXT:    por %xmm7, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm7
; SSE2-NEXT:    psllw $4, %xmm7
; SSE2-NEXT:    movdqa %xmm3, %xmm5
; SSE2-NEXT:    pandn %xmm7, %xmm5
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    pand %xmm10, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    pand %xmm6, %xmm5
; SSE2-NEXT:    paddb %xmm5, %xmm5
; SSE2-NEXT:    pand %xmm9, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    psrlw $8, %xmm5
; SSE2-NEXT:    psllw $8, %xmm2
; SSE2-NEXT:    por %xmm5, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    psllw $4, %xmm5
; SSE2-NEXT:    movdqa %xmm3, %xmm7
; SSE2-NEXT:    pandn %xmm5, %xmm7
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    por %xmm7, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    pand %xmm10, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    por %xmm5, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    pand %xmm6, %xmm5
; SSE2-NEXT:    paddb %xmm5, %xmm5
; SSE2-NEXT:    pand %xmm9, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm2
; SSE2-NEXT:    por %xmm5, %xmm2
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    psrlw $8, %xmm5
; SSE2-NEXT:    psllw $8, %xmm4
; SSE2-NEXT:    por %xmm5, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    psllw $4, %xmm5
; SSE2-NEXT:    psrlw $4, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm5, %xmm3
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm10
; SSE2-NEXT:    psllw $2, %xmm10
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    psrlw $2, %xmm3
; SSE2-NEXT:    por %xmm10, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm6
; SSE2-NEXT:    paddb %xmm6, %xmm6
; SSE2-NEXT:    pand %xmm9, %xmm3
; SSE2-NEXT:    psrlw $1, %xmm3
; SSE2-NEXT:    por %xmm6, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v32i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm5
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    pshufb %xmm8, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm9, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    por %xmm6, %xmm0
; SSSE3-NEXT:    pshufb %xmm8, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm1, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm5, %xmm1
; SSSE3-NEXT:    por %xmm6, %xmm1
; SSSE3-NEXT:    pshufb %xmm8, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm5, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm5
; SSSE3-NEXT:    pshufb %xmm2, %xmm5
; SSSE3-NEXT:    por %xmm6, %xmm5
; SSSE3-NEXT:    pshufb %xmm8, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm3
; SSSE3-NEXT:    pand %xmm9, %xmm3
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    por %xmm7, %xmm4
; SSSE3-NEXT:    movdqa %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm3
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v32i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_bitreverse_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_bitreverse_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v32i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v32i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v32i16:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v32i16:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v32i16:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0
; GFNIAVX512BW-NEXT:    retq
  ; The IR under test -- a single call to the bitreverse intrinsic.
  %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
  ret <32 x i16> %b
}
2477
2478define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
2479; SSE2-LABEL: test_bitreverse_v16i32:
2480; SSE2:       # %bb.0:
2481; SSE2-NEXT:    movdqa %xmm3, %xmm11
2482; SSE2-NEXT:    pxor %xmm10, %xmm10
2483; SSE2-NEXT:    movdqa %xmm0, %xmm3
2484; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
2485; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
2486; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
2487; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
2488; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2489; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2490; SSE2-NEXT:    packuswb %xmm3, %xmm0
2491; SSE2-NEXT:    movdqa %xmm0, %xmm5
2492; SSE2-NEXT:    psllw $4, %xmm5
2493; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2494; SSE2-NEXT:    movdqa %xmm3, %xmm7
2495; SSE2-NEXT:    pandn %xmm5, %xmm7
2496; SSE2-NEXT:    psrlw $4, %xmm0
2497; SSE2-NEXT:    pand %xmm3, %xmm0
2498; SSE2-NEXT:    por %xmm7, %xmm0
2499; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2500; SSE2-NEXT:    movdqa %xmm0, %xmm7
2501; SSE2-NEXT:    pand %xmm5, %xmm7
2502; SSE2-NEXT:    psllw $2, %xmm7
2503; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
2504; SSE2-NEXT:    pand %xmm8, %xmm0
2505; SSE2-NEXT:    psrlw $2, %xmm0
2506; SSE2-NEXT:    por %xmm7, %xmm0
2507; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2508; SSE2-NEXT:    movdqa %xmm0, %xmm6
2509; SSE2-NEXT:    pand %xmm7, %xmm6
2510; SSE2-NEXT:    paddb %xmm6, %xmm6
2511; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
2512; SSE2-NEXT:    pand %xmm9, %xmm0
2513; SSE2-NEXT:    psrlw $1, %xmm0
2514; SSE2-NEXT:    por %xmm6, %xmm0
2515; SSE2-NEXT:    movdqa %xmm1, %xmm6
2516; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
2517; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
2518; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
2519; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
2520; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2521; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2522; SSE2-NEXT:    packuswb %xmm6, %xmm1
2523; SSE2-NEXT:    movdqa %xmm1, %xmm6
2524; SSE2-NEXT:    psllw $4, %xmm6
2525; SSE2-NEXT:    movdqa %xmm3, %xmm4
2526; SSE2-NEXT:    pandn %xmm6, %xmm4
2527; SSE2-NEXT:    psrlw $4, %xmm1
2528; SSE2-NEXT:    pand %xmm3, %xmm1
2529; SSE2-NEXT:    por %xmm4, %xmm1
2530; SSE2-NEXT:    movdqa %xmm1, %xmm4
2531; SSE2-NEXT:    pand %xmm5, %xmm4
2532; SSE2-NEXT:    psllw $2, %xmm4
2533; SSE2-NEXT:    pand %xmm8, %xmm1
2534; SSE2-NEXT:    psrlw $2, %xmm1
2535; SSE2-NEXT:    por %xmm4, %xmm1
2536; SSE2-NEXT:    movdqa %xmm1, %xmm4
2537; SSE2-NEXT:    pand %xmm7, %xmm4
2538; SSE2-NEXT:    paddb %xmm4, %xmm4
2539; SSE2-NEXT:    pand %xmm9, %xmm1
2540; SSE2-NEXT:    psrlw $1, %xmm1
2541; SSE2-NEXT:    por %xmm4, %xmm1
2542; SSE2-NEXT:    movdqa %xmm2, %xmm4
2543; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
2544; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2545; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2546; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
2547; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
2548; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
2549; SSE2-NEXT:    packuswb %xmm4, %xmm2
2550; SSE2-NEXT:    movdqa %xmm2, %xmm4
2551; SSE2-NEXT:    psllw $4, %xmm4
2552; SSE2-NEXT:    movdqa %xmm3, %xmm6
2553; SSE2-NEXT:    pandn %xmm4, %xmm6
2554; SSE2-NEXT:    psrlw $4, %xmm2
2555; SSE2-NEXT:    pand %xmm3, %xmm2
2556; SSE2-NEXT:    por %xmm6, %xmm2
2557; SSE2-NEXT:    movdqa %xmm2, %xmm4
2558; SSE2-NEXT:    pand %xmm5, %xmm4
2559; SSE2-NEXT:    psllw $2, %xmm4
2560; SSE2-NEXT:    pand %xmm8, %xmm2
2561; SSE2-NEXT:    psrlw $2, %xmm2
2562; SSE2-NEXT:    por %xmm4, %xmm2
2563; SSE2-NEXT:    movdqa %xmm2, %xmm4
2564; SSE2-NEXT:    pand %xmm7, %xmm4
2565; SSE2-NEXT:    paddb %xmm4, %xmm4
2566; SSE2-NEXT:    pand %xmm9, %xmm2
2567; SSE2-NEXT:    psrlw $1, %xmm2
2568; SSE2-NEXT:    por %xmm4, %xmm2
2569; SSE2-NEXT:    movdqa %xmm11, %xmm4
2570; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
2571; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2572; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2573; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
2574; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm11[3,2,1,0,4,5,6,7]
2575; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
2576; SSE2-NEXT:    packuswb %xmm4, %xmm6
2577; SSE2-NEXT:    movdqa %xmm6, %xmm4
2578; SSE2-NEXT:    psllw $4, %xmm4
2579; SSE2-NEXT:    psrlw $4, %xmm6
2580; SSE2-NEXT:    pand %xmm3, %xmm6
2581; SSE2-NEXT:    pandn %xmm4, %xmm3
2582; SSE2-NEXT:    por %xmm6, %xmm3
2583; SSE2-NEXT:    pand %xmm3, %xmm5
2584; SSE2-NEXT:    psllw $2, %xmm5
2585; SSE2-NEXT:    pand %xmm8, %xmm3
2586; SSE2-NEXT:    psrlw $2, %xmm3
2587; SSE2-NEXT:    por %xmm5, %xmm3
2588; SSE2-NEXT:    pand %xmm3, %xmm7
2589; SSE2-NEXT:    paddb %xmm7, %xmm7
2590; SSE2-NEXT:    pand %xmm9, %xmm3
2591; SSE2-NEXT:    psrlw $1, %xmm3
2592; SSE2-NEXT:    por %xmm7, %xmm3
2593; SSE2-NEXT:    retq
2594;
2595; SSSE3-LABEL: test_bitreverse_v16i32:
2596; SSSE3:       # %bb.0:
2597; SSSE3-NEXT:    movdqa %xmm1, %xmm5
2598; SSSE3-NEXT:    movdqa %xmm0, %xmm1
2599; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2600; SSSE3-NEXT:    pshufb %xmm8, %xmm1
2601; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2602; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2603; SSSE3-NEXT:    pand %xmm9, %xmm0
2604; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2605; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2606; SSSE3-NEXT:    pshufb %xmm0, %xmm6
2607; SSSE3-NEXT:    psrlw $4, %xmm1
2608; SSSE3-NEXT:    pand %xmm9, %xmm1
2609; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2610; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2611; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2612; SSSE3-NEXT:    por %xmm6, %xmm0
2613; SSSE3-NEXT:    pshufb %xmm8, %xmm5
2614; SSSE3-NEXT:    movdqa %xmm5, %xmm1
2615; SSSE3-NEXT:    pand %xmm9, %xmm1
2616; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2617; SSSE3-NEXT:    pshufb %xmm1, %xmm6
2618; SSSE3-NEXT:    psrlw $4, %xmm5
2619; SSSE3-NEXT:    pand %xmm9, %xmm5
2620; SSSE3-NEXT:    movdqa %xmm4, %xmm1
2621; SSSE3-NEXT:    pshufb %xmm5, %xmm1
2622; SSSE3-NEXT:    por %xmm6, %xmm1
2623; SSSE3-NEXT:    pshufb %xmm8, %xmm2
2624; SSSE3-NEXT:    movdqa %xmm2, %xmm5
2625; SSSE3-NEXT:    pand %xmm9, %xmm5
2626; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2627; SSSE3-NEXT:    pshufb %xmm5, %xmm6
2628; SSSE3-NEXT:    psrlw $4, %xmm2
2629; SSSE3-NEXT:    pand %xmm9, %xmm2
2630; SSSE3-NEXT:    movdqa %xmm4, %xmm5
2631; SSSE3-NEXT:    pshufb %xmm2, %xmm5
2632; SSSE3-NEXT:    por %xmm6, %xmm5
2633; SSSE3-NEXT:    pshufb %xmm8, %xmm3
2634; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2635; SSSE3-NEXT:    pand %xmm9, %xmm2
2636; SSSE3-NEXT:    pshufb %xmm2, %xmm7
2637; SSSE3-NEXT:    psrlw $4, %xmm3
2638; SSSE3-NEXT:    pand %xmm9, %xmm3
2639; SSSE3-NEXT:    pshufb %xmm3, %xmm4
2640; SSSE3-NEXT:    por %xmm7, %xmm4
2641; SSSE3-NEXT:    movdqa %xmm5, %xmm2
2642; SSSE3-NEXT:    movdqa %xmm4, %xmm3
2643; SSSE3-NEXT:    retq
2644;
2645; AVX1-LABEL: test_bitreverse_v16i32:
2646; AVX1:       # %bb.0:
2647; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2648; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2649; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2650; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2651; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2652; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2653; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2654; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2655; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2656; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2657; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2658; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2659; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2660; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
2661; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2662; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
2663; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2664; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
2665; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
2666; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2667; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2668; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2669; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2670; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2671; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2672; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2673; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2674; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2675; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2676; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
2677; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
2678; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
2679; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2680; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
2681; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
2682; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2683; AVX1-NEXT:    retq
2684;
2685; AVX2-LABEL: test_bitreverse_v16i32:
2686; AVX2:       # %bb.0:
2687; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2688; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2689; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2690; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
2691; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2692; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2693; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2694; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
2695; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2696; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
2697; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
2698; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2699; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
2700; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2701; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
2702; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2703; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
2704; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
2705; AVX2-NEXT:    retq
2706;
2707; AVX512F-LABEL: test_bitreverse_v16i32:
2708; AVX512F:       # %bb.0:
2709; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2710; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2711; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2712; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2713; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
2714; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2715; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2716; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2717; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
2718; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2719; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
2720; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
2721; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
2722; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2723; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
2724; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
2725; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
2726; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
2727; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2728; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
2729; AVX512F-NEXT:    retq
2730;
2731; AVX512BW-LABEL: test_bitreverse_v16i32:
2732; AVX512BW:       # %bb.0:
2733; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
2734; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2735; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2736; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2737; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2738; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2739; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2740; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2741; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2742; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2743; AVX512BW-NEXT:    retq
2744;
2745; XOPAVX1-LABEL: test_bitreverse_v16i32:
2746; XOPAVX1:       # %bb.0:
2747; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2748; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
2749; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2750; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2751; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2752; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2753; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2754; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2755; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2756; XOPAVX1-NEXT:    retq
2757;
2758; XOPAVX2-LABEL: test_bitreverse_v16i32:
2759; XOPAVX2:       # %bb.0:
2760; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2761; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
2762; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2763; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2764; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2765; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2766; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2767; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2768; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2769; XOPAVX2-NEXT:    retq
2770;
2771; GFNISSE-LABEL: test_bitreverse_v16i32:
2772; GFNISSE:       # %bb.0:
2773; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2774; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
2775; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
2776; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
2777; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
2778; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
2779; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
2780; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
2781; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
2782; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
2783; GFNISSE-NEXT:    retq
2784;
2785; GFNIAVX-LABEL: test_bitreverse_v16i32:
2786; GFNIAVX:       # %bb.0:
2787; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
2788; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2789; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2790; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
2791; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2792; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2793; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
2794; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2795; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
2796; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2797; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2798; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2799; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
2800; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2801; GFNIAVX-NEXT:    retq
2802;
2803; GFNIAVX2-LABEL: test_bitreverse_v16i32:
2804; GFNIAVX2:       # %bb.0:
2805; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2806; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2807; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2808; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2809; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2810; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2811; GFNIAVX2-NEXT:    retq
2812;
2813; GFNIAVX512F-LABEL: test_bitreverse_v16i32:
2814; GFNIAVX512F:       # %bb.0:
2815; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2816; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2817; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2818; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2819; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2820; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2821; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2822; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2823; GFNIAVX512F-NEXT:    retq
2824;
2825; GFNIAVX512BW-LABEL: test_bitreverse_v16i32:
2826; GFNIAVX512BW:       # %bb.0:
2827; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
2828; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0
2829; GFNIAVX512BW-NEXT:    retq
2830  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
2831  ret <16 x i32> %b
2832}
2833
define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate rather than editing by hand.
; Bit-reverses each of the eight i64 lanes (4 xmm / 2 ymm / 1 zmm operands):
;  * SSE2 reverses bytes within each i64 via punpck + pshufd/pshuflw/pshufhw
;    shuffles, then reverses bits within each byte using psllw/psrlw plus
;    mask (15/51/204/85/170 byte patterns) and por steps.
;  * SSSE3/AVX/AVX2/AVX512 reverse bytes with (v)pshufb [7,6,...,9,8] and
;    reverse bits with two pshufb nibble lookup tables combined by (v)por.
;  * XOP needs only vpperm (selector bytes 87..88) plus 128-bit
;    extract/insert per half.
;  * GFNI does the per-byte bit reversal with a single gf2p8affineqb against
;    the matrix constant 9241421688590303745 = 0x8040201008040201.
; SSE2-LABEL: test_bitreverse_v8i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm3, %xmm11
; SSE2-NEXT:    pxor %xmm10, %xmm10
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psllw $4, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    movdqa %xmm3, %xmm7
; SSE2-NEXT:    pandn %xmm5, %xmm7
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    pand %xmm5, %xmm7
; SSE2-NEXT:    psllw $2, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    pand %xmm7, %xmm6
; SSE2-NEXT:    paddb %xmm6, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT:    pand %xmm9, %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    psllw $4, %xmm6
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm6, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm5, %xmm4
; SSE2-NEXT:    psllw $2, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm4
; SSE2-NEXT:    paddb %xmm4, %xmm4
; SSE2-NEXT:    pand %xmm9, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psllw $4, %xmm4
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    pandn %xmm4, %xmm6
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pand %xmm5, %xmm4
; SSE2-NEXT:    psllw $2, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm4
; SSE2-NEXT:    paddb %xmm4, %xmm4
; SSE2-NEXT:    pand %xmm9, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm2
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm11, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm11[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm4, %xmm6
; SSE2-NEXT:    movdqa %xmm6, %xmm4
; SSE2-NEXT:    psllw $4, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm6
; SSE2-NEXT:    pand %xmm3, %xmm6
; SSE2-NEXT:    pandn %xmm4, %xmm3
; SSE2-NEXT:    por %xmm6, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    psllw $2, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    psrlw $2, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm7
; SSE2-NEXT:    paddb %xmm7, %xmm7
; SSE2-NEXT:    pand %xmm9, %xmm3
; SSE2-NEXT:    psrlw $1, %xmm3
; SSE2-NEXT:    por %xmm7, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm5
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    pshufb %xmm8, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm9, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    por %xmm6, %xmm0
; SSSE3-NEXT:    pshufb %xmm8, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm1, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm5, %xmm1
; SSSE3-NEXT:    por %xmm6, %xmm1
; SSSE3-NEXT:    pshufb %xmm8, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm5, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm5
; SSSE3-NEXT:    pshufb %xmm2, %xmm5
; SSSE3-NEXT:    por %xmm6, %xmm5
; SSSE3-NEXT:    pshufb %xmm8, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm3
; SSSE3-NEXT:    pand %xmm9, %xmm3
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    por %xmm7, %xmm4
; SSSE3-NEXT:    movdqa %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm3
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_bitreverse_v8i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_bitreverse_v8i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v8i64:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v8i64:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v8i64:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v8i64:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v8i64:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0
; GFNIAVX512BW-NEXT:    retq
  ; IR under test: lane-wise 64-bit bit reversal.
  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
  ret <8 x i64> %b
}
3197
3198;
3199; Constant Folding
3200;
3201
define i32 @fold_bitreverse_i32() nounwind {
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py.
; bitreverse(0xFF00FF00) = 0x00FF00FF (16711935): the call is folded at compile
; time, so every RUN configuration emits the same single movl (shared ALL prefix).
; ALL-LABEL: fold_bitreverse_i32:
; ALL:       # %bb.0:
; ALL-NEXT:    movl $16711935, %eax # imm = 0xFF00FF
; ALL-NEXT:    retq
  ; 4278255360 = 0xFF00FF00
  %b = call i32 @llvm.bitreverse.i32(i32 4278255360)
  ret i32 %b
}
3210
define <16 x i8> @fold_bitreverse_v16i8() nounwind {
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py.
; Input lanes alternate k and -(k+1) for k = 0,2,4,...; the call constant-folds,
; so each configuration emits just one constant-pool vector load (e.g.
; bitreverse(0) = 0, bitreverse(0xFF) = 255, bitreverse(2) = 64,
; bitreverse(0xFD) = 191 -- matching the vector below).
; SSE-LABEL: fold_bitreverse_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_bitreverse_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; AVX-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: fold_bitreverse_v16i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: fold_bitreverse_v16i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: fold_bitreverse_v16i8:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: fold_bitreverse_v16i8:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: fold_bitreverse_v16i8:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; GFNIAVX512BW-NEXT:    retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
  ret <16 x i8> %b
}
3254
; A bitreverse of a constant <16 x i16> vector must constant-fold to pure
; constant loads: two 128-bit loads on SSE-class targets (the 256-bit result
; is split across xmm0/xmm1), a single 256-bit load where AVX is available.
define <16 x i16> @fold_bitreverse_v16i16() nounwind {
; SSE-LABEL: fold_bitreverse_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_bitreverse_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; AVX-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: fold_bitreverse_v16i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
; GFNISSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: fold_bitreverse_v16i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: fold_bitreverse_v16i16:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: fold_bitreverse_v16i16:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: fold_bitreverse_v16i16:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; GFNIAVX512BW-NEXT:    retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
  ret <16 x i16> %b
}
3300
; A bitreverse of a constant <16 x i32> vector must constant-fold to pure
; constant loads: the 512-bit result is split into four 128-bit loads on
; SSE-class targets, two 256-bit loads on AVX/AVX2/XOP, and a single 512-bit
; load on AVX512-class targets.
define <16 x i32> @fold_bitreverse_v16i32() nounwind {
; SSE-LABEL: fold_bitreverse_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
; SSE-NEXT:    retq
;
; AVX1-LABEL: fold_bitreverse_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: fold_bitreverse_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX2-NEXT:    retq
;
; AVX512-LABEL: fold_bitreverse_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX512-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; XOP-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: fold_bitreverse_v16i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
; GFNISSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
; GFNISSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
; GFNISSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: fold_bitreverse_v16i32:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: fold_bitreverse_v16i32:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: fold_bitreverse_v16i32:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: fold_bitreverse_v16i32:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; GFNIAVX512BW-NEXT:    retq
  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
  ret <16 x i32> %b
}
3365
3366declare i8 @llvm.bitreverse.i8(i8) readnone
3367declare i16 @llvm.bitreverse.i16(i16) readnone
3368declare i32 @llvm.bitreverse.i32(i32) readnone
3369declare i64 @llvm.bitreverse.i64(i64) readnone
3370
3371declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
3372declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
3373declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
3374declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone
3375
3376declare <32 x i8>  @llvm.bitreverse.v32i8(<32 x i8>) readnone
3377declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
3378declare <8 x i32>  @llvm.bitreverse.v8i32(<8 x i32>) readnone
3379declare <4 x i64>  @llvm.bitreverse.v4i64(<4 x i64>) readnone
3380
3381declare <64 x i8>  @llvm.bitreverse.v64i8(<64 x i8>) readnone
3382declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
3383declare <16 x i32>  @llvm.bitreverse.v16i32(<16 x i32>) readnone
3384declare <8 x i64>  @llvm.bitreverse.v8i64(<8 x i64>) readnone
3385