1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512F
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512BW
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNISSE
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX
12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX2
13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512F
14; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512BW
15
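16; Make sure llc does not crash when both avx512bw and xop are enabled. The RUN line below is not piped to FileCheck, so only successful compilation is tested.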
17; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw
18
; Scalar i8 bit reversal through the llvm.bitreverse.i8 intrinsic.
; Expected lowering recorded by the checks below
;  - SSE, AVX and every GFNI configuration share one scalar sequence that
;    rotates the byte by 4 (nibble swap), then swaps bit pairs using mask 51
;    (0x33) and finally swaps adjacent bits using mask 85 (0x55).
;  - XOP configurations move the value into xmm0, apply a single vpperm with a
;    constant-pool operand and move the result back to eax.
19define i8 @test_bitreverse_i8(i8 %a) nounwind {
20; SSE-LABEL: test_bitreverse_i8:
21; SSE:       # %bb.0:
22; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
23; SSE-NEXT:    rolb $4, %dil
24; SSE-NEXT:    movl %edi, %eax
25; SSE-NEXT:    andb $51, %al
26; SSE-NEXT:    shlb $2, %al
27; SSE-NEXT:    shrb $2, %dil
28; SSE-NEXT:    andb $51, %dil
29; SSE-NEXT:    orb %al, %dil
30; SSE-NEXT:    movl %edi, %eax
31; SSE-NEXT:    andb $85, %al
32; SSE-NEXT:    addb %al, %al
33; SSE-NEXT:    shrb %dil
34; SSE-NEXT:    andb $85, %dil
35; SSE-NEXT:    addl %edi, %eax
36; SSE-NEXT:    # kill: def $al killed $al killed $eax
37; SSE-NEXT:    retq
38;
39; AVX-LABEL: test_bitreverse_i8:
40; AVX:       # %bb.0:
41; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
42; AVX-NEXT:    rolb $4, %dil
43; AVX-NEXT:    movl %edi, %eax
44; AVX-NEXT:    andb $51, %al
45; AVX-NEXT:    shlb $2, %al
46; AVX-NEXT:    shrb $2, %dil
47; AVX-NEXT:    andb $51, %dil
48; AVX-NEXT:    orb %al, %dil
49; AVX-NEXT:    movl %edi, %eax
50; AVX-NEXT:    andb $85, %al
51; AVX-NEXT:    addb %al, %al
52; AVX-NEXT:    shrb %dil
53; AVX-NEXT:    andb $85, %dil
54; AVX-NEXT:    addl %edi, %eax
55; AVX-NEXT:    # kill: def $al killed $al killed $eax
56; AVX-NEXT:    retq
57;
58; XOP-LABEL: test_bitreverse_i8:
59; XOP:       # %bb.0:
60; XOP-NEXT:    vmovd %edi, %xmm0
61; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
62; XOP-NEXT:    vmovd %xmm0, %eax
63; XOP-NEXT:    # kill: def $al killed $al killed $eax
64; XOP-NEXT:    retq
65;
66; GFNISSE-LABEL: test_bitreverse_i8:
67; GFNISSE:       # %bb.0:
68; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
69; GFNISSE-NEXT:    rolb $4, %dil
70; GFNISSE-NEXT:    movl %edi, %eax
71; GFNISSE-NEXT:    andb $51, %al
72; GFNISSE-NEXT:    shlb $2, %al
73; GFNISSE-NEXT:    shrb $2, %dil
74; GFNISSE-NEXT:    andb $51, %dil
75; GFNISSE-NEXT:    orb %al, %dil
76; GFNISSE-NEXT:    movl %edi, %eax
77; GFNISSE-NEXT:    andb $85, %al
78; GFNISSE-NEXT:    addb %al, %al
79; GFNISSE-NEXT:    shrb %dil
80; GFNISSE-NEXT:    andb $85, %dil
81; GFNISSE-NEXT:    addl %edi, %eax
82; GFNISSE-NEXT:    # kill: def $al killed $al killed $eax
83; GFNISSE-NEXT:    retq
84;
85; GFNIAVX-LABEL: test_bitreverse_i8:
86; GFNIAVX:       # %bb.0:
87; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
88; GFNIAVX-NEXT:    rolb $4, %dil
89; GFNIAVX-NEXT:    movl %edi, %eax
90; GFNIAVX-NEXT:    andb $51, %al
91; GFNIAVX-NEXT:    shlb $2, %al
92; GFNIAVX-NEXT:    shrb $2, %dil
93; GFNIAVX-NEXT:    andb $51, %dil
94; GFNIAVX-NEXT:    orb %al, %dil
95; GFNIAVX-NEXT:    movl %edi, %eax
96; GFNIAVX-NEXT:    andb $85, %al
97; GFNIAVX-NEXT:    addb %al, %al
98; GFNIAVX-NEXT:    shrb %dil
99; GFNIAVX-NEXT:    andb $85, %dil
100; GFNIAVX-NEXT:    addl %edi, %eax
101; GFNIAVX-NEXT:    # kill: def $al killed $al killed $eax
102; GFNIAVX-NEXT:    retq
103;
104; GFNIAVX2-LABEL: test_bitreverse_i8:
105; GFNIAVX2:       # %bb.0:
106; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
107; GFNIAVX2-NEXT:    rolb $4, %dil
108; GFNIAVX2-NEXT:    movl %edi, %eax
109; GFNIAVX2-NEXT:    andb $51, %al
110; GFNIAVX2-NEXT:    shlb $2, %al
111; GFNIAVX2-NEXT:    shrb $2, %dil
112; GFNIAVX2-NEXT:    andb $51, %dil
113; GFNIAVX2-NEXT:    orb %al, %dil
114; GFNIAVX2-NEXT:    movl %edi, %eax
115; GFNIAVX2-NEXT:    andb $85, %al
116; GFNIAVX2-NEXT:    addb %al, %al
117; GFNIAVX2-NEXT:    shrb %dil
118; GFNIAVX2-NEXT:    andb $85, %dil
119; GFNIAVX2-NEXT:    addl %edi, %eax
120; GFNIAVX2-NEXT:    # kill: def $al killed $al killed $eax
121; GFNIAVX2-NEXT:    retq
122;
123; GFNIAVX512F-LABEL: test_bitreverse_i8:
124; GFNIAVX512F:       # %bb.0:
125; GFNIAVX512F-NEXT:    # kill: def $edi killed $edi def $rdi
126; GFNIAVX512F-NEXT:    rolb $4, %dil
127; GFNIAVX512F-NEXT:    movl %edi, %eax
128; GFNIAVX512F-NEXT:    andb $51, %al
129; GFNIAVX512F-NEXT:    shlb $2, %al
130; GFNIAVX512F-NEXT:    shrb $2, %dil
131; GFNIAVX512F-NEXT:    andb $51, %dil
132; GFNIAVX512F-NEXT:    orb %al, %dil
133; GFNIAVX512F-NEXT:    movl %edi, %eax
134; GFNIAVX512F-NEXT:    andb $85, %al
135; GFNIAVX512F-NEXT:    addb %al, %al
136; GFNIAVX512F-NEXT:    shrb %dil
137; GFNIAVX512F-NEXT:    andb $85, %dil
138; GFNIAVX512F-NEXT:    addl %edi, %eax
139; GFNIAVX512F-NEXT:    # kill: def $al killed $al killed $eax
140; GFNIAVX512F-NEXT:    retq
141;
142; GFNIAVX512BW-LABEL: test_bitreverse_i8:
143; GFNIAVX512BW:       # %bb.0:
144; GFNIAVX512BW-NEXT:    # kill: def $edi killed $edi def $rdi
145; GFNIAVX512BW-NEXT:    rolb $4, %dil
146; GFNIAVX512BW-NEXT:    movl %edi, %eax
147; GFNIAVX512BW-NEXT:    andb $51, %al
148; GFNIAVX512BW-NEXT:    shlb $2, %al
149; GFNIAVX512BW-NEXT:    shrb $2, %dil
150; GFNIAVX512BW-NEXT:    andb $51, %dil
151; GFNIAVX512BW-NEXT:    orb %al, %dil
152; GFNIAVX512BW-NEXT:    movl %edi, %eax
153; GFNIAVX512BW-NEXT:    andb $85, %al
154; GFNIAVX512BW-NEXT:    addb %al, %al
155; GFNIAVX512BW-NEXT:    shrb %dil
156; GFNIAVX512BW-NEXT:    andb $85, %dil
157; GFNIAVX512BW-NEXT:    addl %edi, %eax
158; GFNIAVX512BW-NEXT:    # kill: def $al killed $al killed $eax
159; GFNIAVX512BW-NEXT:    retq
160  %b = call i8 @llvm.bitreverse.i8(i8 %a)
161  ret i8 %b
162}
163
; Scalar i16 bit reversal through the llvm.bitreverse.i16 intrinsic.
; Expected lowering recorded by the checks below
;  - SSE, AVX and every GFNI configuration share one scalar sequence that
;    rotates the word by 8 (byte swap), then swaps nibbles (mask 0xF0F),
;    bit pairs (mask 0x3333) and adjacent bits (mask 0x5555); the last two
;    merges are folded into scaled leal instructions.
;  - XOP configurations use vmovd, a single vpperm with a constant-pool
;    operand, and vmovd back to eax.
164define i16 @test_bitreverse_i16(i16 %a) nounwind {
165; SSE-LABEL: test_bitreverse_i16:
166; SSE:       # %bb.0:
167; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
168; SSE-NEXT:    rolw $8, %di
169; SSE-NEXT:    movl %edi, %eax
170; SSE-NEXT:    andl $3855, %eax # imm = 0xF0F
171; SSE-NEXT:    shll $4, %eax
172; SSE-NEXT:    shrl $4, %edi
173; SSE-NEXT:    andl $3855, %edi # imm = 0xF0F
174; SSE-NEXT:    orl %eax, %edi
175; SSE-NEXT:    movl %edi, %eax
176; SSE-NEXT:    andl $13107, %eax # imm = 0x3333
177; SSE-NEXT:    shrl $2, %edi
178; SSE-NEXT:    andl $13107, %edi # imm = 0x3333
179; SSE-NEXT:    leal (%rdi,%rax,4), %eax
180; SSE-NEXT:    movl %eax, %ecx
181; SSE-NEXT:    andl $21845, %ecx # imm = 0x5555
182; SSE-NEXT:    shrl %eax
183; SSE-NEXT:    andl $21845, %eax # imm = 0x5555
184; SSE-NEXT:    leal (%rax,%rcx,2), %eax
185; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
186; SSE-NEXT:    retq
187;
188; AVX-LABEL: test_bitreverse_i16:
189; AVX:       # %bb.0:
190; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
191; AVX-NEXT:    rolw $8, %di
192; AVX-NEXT:    movl %edi, %eax
193; AVX-NEXT:    andl $3855, %eax # imm = 0xF0F
194; AVX-NEXT:    shll $4, %eax
195; AVX-NEXT:    shrl $4, %edi
196; AVX-NEXT:    andl $3855, %edi # imm = 0xF0F
197; AVX-NEXT:    orl %eax, %edi
198; AVX-NEXT:    movl %edi, %eax
199; AVX-NEXT:    andl $13107, %eax # imm = 0x3333
200; AVX-NEXT:    shrl $2, %edi
201; AVX-NEXT:    andl $13107, %edi # imm = 0x3333
202; AVX-NEXT:    leal (%rdi,%rax,4), %eax
203; AVX-NEXT:    movl %eax, %ecx
204; AVX-NEXT:    andl $21845, %ecx # imm = 0x5555
205; AVX-NEXT:    shrl %eax
206; AVX-NEXT:    andl $21845, %eax # imm = 0x5555
207; AVX-NEXT:    leal (%rax,%rcx,2), %eax
208; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
209; AVX-NEXT:    retq
210;
211; XOP-LABEL: test_bitreverse_i16:
212; XOP:       # %bb.0:
213; XOP-NEXT:    vmovd %edi, %xmm0
214; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
215; XOP-NEXT:    vmovd %xmm0, %eax
216; XOP-NEXT:    # kill: def $ax killed $ax killed $eax
217; XOP-NEXT:    retq
218;
219; GFNISSE-LABEL: test_bitreverse_i16:
220; GFNISSE:       # %bb.0:
221; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
222; GFNISSE-NEXT:    rolw $8, %di
223; GFNISSE-NEXT:    movl %edi, %eax
224; GFNISSE-NEXT:    andl $3855, %eax # imm = 0xF0F
225; GFNISSE-NEXT:    shll $4, %eax
226; GFNISSE-NEXT:    shrl $4, %edi
227; GFNISSE-NEXT:    andl $3855, %edi # imm = 0xF0F
228; GFNISSE-NEXT:    orl %eax, %edi
229; GFNISSE-NEXT:    movl %edi, %eax
230; GFNISSE-NEXT:    andl $13107, %eax # imm = 0x3333
231; GFNISSE-NEXT:    shrl $2, %edi
232; GFNISSE-NEXT:    andl $13107, %edi # imm = 0x3333
233; GFNISSE-NEXT:    leal (%rdi,%rax,4), %eax
234; GFNISSE-NEXT:    movl %eax, %ecx
235; GFNISSE-NEXT:    andl $21845, %ecx # imm = 0x5555
236; GFNISSE-NEXT:    shrl %eax
237; GFNISSE-NEXT:    andl $21845, %eax # imm = 0x5555
238; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
239; GFNISSE-NEXT:    # kill: def $ax killed $ax killed $eax
240; GFNISSE-NEXT:    retq
241;
242; GFNIAVX-LABEL: test_bitreverse_i16:
243; GFNIAVX:       # %bb.0:
244; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
245; GFNIAVX-NEXT:    rolw $8, %di
246; GFNIAVX-NEXT:    movl %edi, %eax
247; GFNIAVX-NEXT:    andl $3855, %eax # imm = 0xF0F
248; GFNIAVX-NEXT:    shll $4, %eax
249; GFNIAVX-NEXT:    shrl $4, %edi
250; GFNIAVX-NEXT:    andl $3855, %edi # imm = 0xF0F
251; GFNIAVX-NEXT:    orl %eax, %edi
252; GFNIAVX-NEXT:    movl %edi, %eax
253; GFNIAVX-NEXT:    andl $13107, %eax # imm = 0x3333
254; GFNIAVX-NEXT:    shrl $2, %edi
255; GFNIAVX-NEXT:    andl $13107, %edi # imm = 0x3333
256; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
257; GFNIAVX-NEXT:    movl %eax, %ecx
258; GFNIAVX-NEXT:    andl $21845, %ecx # imm = 0x5555
259; GFNIAVX-NEXT:    shrl %eax
260; GFNIAVX-NEXT:    andl $21845, %eax # imm = 0x5555
261; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
262; GFNIAVX-NEXT:    # kill: def $ax killed $ax killed $eax
263; GFNIAVX-NEXT:    retq
264;
265; GFNIAVX2-LABEL: test_bitreverse_i16:
266; GFNIAVX2:       # %bb.0:
267; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
268; GFNIAVX2-NEXT:    rolw $8, %di
269; GFNIAVX2-NEXT:    movl %edi, %eax
270; GFNIAVX2-NEXT:    andl $3855, %eax # imm = 0xF0F
271; GFNIAVX2-NEXT:    shll $4, %eax
272; GFNIAVX2-NEXT:    shrl $4, %edi
273; GFNIAVX2-NEXT:    andl $3855, %edi # imm = 0xF0F
274; GFNIAVX2-NEXT:    orl %eax, %edi
275; GFNIAVX2-NEXT:    movl %edi, %eax
276; GFNIAVX2-NEXT:    andl $13107, %eax # imm = 0x3333
277; GFNIAVX2-NEXT:    shrl $2, %edi
278; GFNIAVX2-NEXT:    andl $13107, %edi # imm = 0x3333
279; GFNIAVX2-NEXT:    leal (%rdi,%rax,4), %eax
280; GFNIAVX2-NEXT:    movl %eax, %ecx
281; GFNIAVX2-NEXT:    andl $21845, %ecx # imm = 0x5555
282; GFNIAVX2-NEXT:    shrl %eax
283; GFNIAVX2-NEXT:    andl $21845, %eax # imm = 0x5555
284; GFNIAVX2-NEXT:    leal (%rax,%rcx,2), %eax
285; GFNIAVX2-NEXT:    # kill: def $ax killed $ax killed $eax
286; GFNIAVX2-NEXT:    retq
287;
288; GFNIAVX512F-LABEL: test_bitreverse_i16:
289; GFNIAVX512F:       # %bb.0:
290; GFNIAVX512F-NEXT:    # kill: def $edi killed $edi def $rdi
291; GFNIAVX512F-NEXT:    rolw $8, %di
292; GFNIAVX512F-NEXT:    movl %edi, %eax
293; GFNIAVX512F-NEXT:    andl $3855, %eax # imm = 0xF0F
294; GFNIAVX512F-NEXT:    shll $4, %eax
295; GFNIAVX512F-NEXT:    shrl $4, %edi
296; GFNIAVX512F-NEXT:    andl $3855, %edi # imm = 0xF0F
297; GFNIAVX512F-NEXT:    orl %eax, %edi
298; GFNIAVX512F-NEXT:    movl %edi, %eax
299; GFNIAVX512F-NEXT:    andl $13107, %eax # imm = 0x3333
300; GFNIAVX512F-NEXT:    shrl $2, %edi
301; GFNIAVX512F-NEXT:    andl $13107, %edi # imm = 0x3333
302; GFNIAVX512F-NEXT:    leal (%rdi,%rax,4), %eax
303; GFNIAVX512F-NEXT:    movl %eax, %ecx
304; GFNIAVX512F-NEXT:    andl $21845, %ecx # imm = 0x5555
305; GFNIAVX512F-NEXT:    shrl %eax
306; GFNIAVX512F-NEXT:    andl $21845, %eax # imm = 0x5555
307; GFNIAVX512F-NEXT:    leal (%rax,%rcx,2), %eax
308; GFNIAVX512F-NEXT:    # kill: def $ax killed $ax killed $eax
309; GFNIAVX512F-NEXT:    retq
310;
311; GFNIAVX512BW-LABEL: test_bitreverse_i16:
312; GFNIAVX512BW:       # %bb.0:
313; GFNIAVX512BW-NEXT:    # kill: def $edi killed $edi def $rdi
314; GFNIAVX512BW-NEXT:    rolw $8, %di
315; GFNIAVX512BW-NEXT:    movl %edi, %eax
316; GFNIAVX512BW-NEXT:    andl $3855, %eax # imm = 0xF0F
317; GFNIAVX512BW-NEXT:    shll $4, %eax
318; GFNIAVX512BW-NEXT:    shrl $4, %edi
319; GFNIAVX512BW-NEXT:    andl $3855, %edi # imm = 0xF0F
320; GFNIAVX512BW-NEXT:    orl %eax, %edi
321; GFNIAVX512BW-NEXT:    movl %edi, %eax
322; GFNIAVX512BW-NEXT:    andl $13107, %eax # imm = 0x3333
323; GFNIAVX512BW-NEXT:    shrl $2, %edi
324; GFNIAVX512BW-NEXT:    andl $13107, %edi # imm = 0x3333
325; GFNIAVX512BW-NEXT:    leal (%rdi,%rax,4), %eax
326; GFNIAVX512BW-NEXT:    movl %eax, %ecx
327; GFNIAVX512BW-NEXT:    andl $21845, %ecx # imm = 0x5555
328; GFNIAVX512BW-NEXT:    shrl %eax
329; GFNIAVX512BW-NEXT:    andl $21845, %eax # imm = 0x5555
330; GFNIAVX512BW-NEXT:    leal (%rax,%rcx,2), %eax
331; GFNIAVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
332; GFNIAVX512BW-NEXT:    retq
333  %b = call i16 @llvm.bitreverse.i16(i16 %a)
334  ret i16 %b
335}
336
; Scalar i32 bit reversal through the llvm.bitreverse.i32 intrinsic.
; Expected lowering recorded by the checks below
;  - SSE, AVX and every GFNI configuration share one scalar sequence that
;    starts with bswapl, then swaps nibbles (mask 0xF0F0F0F), bit pairs
;    (mask 0x33333333) and adjacent bits (mask 0x55555555); the last two
;    merges are folded into scaled leal instructions.
;  - XOP configurations use vmovd, a single vpperm with a constant-pool
;    operand, and vmovd back to eax.
337define i32 @test_bitreverse_i32(i32 %a) nounwind {
338; SSE-LABEL: test_bitreverse_i32:
339; SSE:       # %bb.0:
340; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
341; SSE-NEXT:    bswapl %edi
342; SSE-NEXT:    movl %edi, %eax
343; SSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
344; SSE-NEXT:    shll $4, %eax
345; SSE-NEXT:    shrl $4, %edi
346; SSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
347; SSE-NEXT:    orl %eax, %edi
348; SSE-NEXT:    movl %edi, %eax
349; SSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
350; SSE-NEXT:    shrl $2, %edi
351; SSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
352; SSE-NEXT:    leal (%rdi,%rax,4), %eax
353; SSE-NEXT:    movl %eax, %ecx
354; SSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
355; SSE-NEXT:    shrl %eax
356; SSE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
357; SSE-NEXT:    leal (%rax,%rcx,2), %eax
358; SSE-NEXT:    retq
359;
360; AVX-LABEL: test_bitreverse_i32:
361; AVX:       # %bb.0:
362; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
363; AVX-NEXT:    bswapl %edi
364; AVX-NEXT:    movl %edi, %eax
365; AVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
366; AVX-NEXT:    shll $4, %eax
367; AVX-NEXT:    shrl $4, %edi
368; AVX-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
369; AVX-NEXT:    orl %eax, %edi
370; AVX-NEXT:    movl %edi, %eax
371; AVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
372; AVX-NEXT:    shrl $2, %edi
373; AVX-NEXT:    andl $858993459, %edi # imm = 0x33333333
374; AVX-NEXT:    leal (%rdi,%rax,4), %eax
375; AVX-NEXT:    movl %eax, %ecx
376; AVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
377; AVX-NEXT:    shrl %eax
378; AVX-NEXT:    andl $1431655765, %eax # imm = 0x55555555
379; AVX-NEXT:    leal (%rax,%rcx,2), %eax
380; AVX-NEXT:    retq
381;
382; XOP-LABEL: test_bitreverse_i32:
383; XOP:       # %bb.0:
384; XOP-NEXT:    vmovd %edi, %xmm0
385; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
386; XOP-NEXT:    vmovd %xmm0, %eax
387; XOP-NEXT:    retq
388;
389; GFNISSE-LABEL: test_bitreverse_i32:
390; GFNISSE:       # %bb.0:
391; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
392; GFNISSE-NEXT:    bswapl %edi
393; GFNISSE-NEXT:    movl %edi, %eax
394; GFNISSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
395; GFNISSE-NEXT:    shll $4, %eax
396; GFNISSE-NEXT:    shrl $4, %edi
397; GFNISSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
398; GFNISSE-NEXT:    orl %eax, %edi
399; GFNISSE-NEXT:    movl %edi, %eax
400; GFNISSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
401; GFNISSE-NEXT:    shrl $2, %edi
402; GFNISSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
403; GFNISSE-NEXT:    leal (%rdi,%rax,4), %eax
404; GFNISSE-NEXT:    movl %eax, %ecx
405; GFNISSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
406; GFNISSE-NEXT:    shrl %eax
407; GFNISSE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
408; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
409; GFNISSE-NEXT:    retq
410;
411; GFNIAVX-LABEL: test_bitreverse_i32:
412; GFNIAVX:       # %bb.0:
413; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
414; GFNIAVX-NEXT:    bswapl %edi
415; GFNIAVX-NEXT:    movl %edi, %eax
416; GFNIAVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
417; GFNIAVX-NEXT:    shll $4, %eax
418; GFNIAVX-NEXT:    shrl $4, %edi
419; GFNIAVX-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
420; GFNIAVX-NEXT:    orl %eax, %edi
421; GFNIAVX-NEXT:    movl %edi, %eax
422; GFNIAVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
423; GFNIAVX-NEXT:    shrl $2, %edi
424; GFNIAVX-NEXT:    andl $858993459, %edi # imm = 0x33333333
425; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
426; GFNIAVX-NEXT:    movl %eax, %ecx
427; GFNIAVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
428; GFNIAVX-NEXT:    shrl %eax
429; GFNIAVX-NEXT:    andl $1431655765, %eax # imm = 0x55555555
430; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
431; GFNIAVX-NEXT:    retq
432;
433; GFNIAVX2-LABEL: test_bitreverse_i32:
434; GFNIAVX2:       # %bb.0:
435; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
436; GFNIAVX2-NEXT:    bswapl %edi
437; GFNIAVX2-NEXT:    movl %edi, %eax
438; GFNIAVX2-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
439; GFNIAVX2-NEXT:    shll $4, %eax
440; GFNIAVX2-NEXT:    shrl $4, %edi
441; GFNIAVX2-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
442; GFNIAVX2-NEXT:    orl %eax, %edi
443; GFNIAVX2-NEXT:    movl %edi, %eax
444; GFNIAVX2-NEXT:    andl $858993459, %eax # imm = 0x33333333
445; GFNIAVX2-NEXT:    shrl $2, %edi
446; GFNIAVX2-NEXT:    andl $858993459, %edi # imm = 0x33333333
447; GFNIAVX2-NEXT:    leal (%rdi,%rax,4), %eax
448; GFNIAVX2-NEXT:    movl %eax, %ecx
449; GFNIAVX2-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
450; GFNIAVX2-NEXT:    shrl %eax
451; GFNIAVX2-NEXT:    andl $1431655765, %eax # imm = 0x55555555
452; GFNIAVX2-NEXT:    leal (%rax,%rcx,2), %eax
453; GFNIAVX2-NEXT:    retq
454;
455; GFNIAVX512F-LABEL: test_bitreverse_i32:
456; GFNIAVX512F:       # %bb.0:
457; GFNIAVX512F-NEXT:    # kill: def $edi killed $edi def $rdi
458; GFNIAVX512F-NEXT:    bswapl %edi
459; GFNIAVX512F-NEXT:    movl %edi, %eax
460; GFNIAVX512F-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
461; GFNIAVX512F-NEXT:    shll $4, %eax
462; GFNIAVX512F-NEXT:    shrl $4, %edi
463; GFNIAVX512F-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
464; GFNIAVX512F-NEXT:    orl %eax, %edi
465; GFNIAVX512F-NEXT:    movl %edi, %eax
466; GFNIAVX512F-NEXT:    andl $858993459, %eax # imm = 0x33333333
467; GFNIAVX512F-NEXT:    shrl $2, %edi
468; GFNIAVX512F-NEXT:    andl $858993459, %edi # imm = 0x33333333
469; GFNIAVX512F-NEXT:    leal (%rdi,%rax,4), %eax
470; GFNIAVX512F-NEXT:    movl %eax, %ecx
471; GFNIAVX512F-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
472; GFNIAVX512F-NEXT:    shrl %eax
473; GFNIAVX512F-NEXT:    andl $1431655765, %eax # imm = 0x55555555
474; GFNIAVX512F-NEXT:    leal (%rax,%rcx,2), %eax
475; GFNIAVX512F-NEXT:    retq
476;
477; GFNIAVX512BW-LABEL: test_bitreverse_i32:
478; GFNIAVX512BW:       # %bb.0:
479; GFNIAVX512BW-NEXT:    # kill: def $edi killed $edi def $rdi
480; GFNIAVX512BW-NEXT:    bswapl %edi
481; GFNIAVX512BW-NEXT:    movl %edi, %eax
482; GFNIAVX512BW-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
483; GFNIAVX512BW-NEXT:    shll $4, %eax
484; GFNIAVX512BW-NEXT:    shrl $4, %edi
485; GFNIAVX512BW-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
486; GFNIAVX512BW-NEXT:    orl %eax, %edi
487; GFNIAVX512BW-NEXT:    movl %edi, %eax
488; GFNIAVX512BW-NEXT:    andl $858993459, %eax # imm = 0x33333333
489; GFNIAVX512BW-NEXT:    shrl $2, %edi
490; GFNIAVX512BW-NEXT:    andl $858993459, %edi # imm = 0x33333333
491; GFNIAVX512BW-NEXT:    leal (%rdi,%rax,4), %eax
492; GFNIAVX512BW-NEXT:    movl %eax, %ecx
493; GFNIAVX512BW-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
494; GFNIAVX512BW-NEXT:    shrl %eax
495; GFNIAVX512BW-NEXT:    andl $1431655765, %eax # imm = 0x55555555
496; GFNIAVX512BW-NEXT:    leal (%rax,%rcx,2), %eax
497; GFNIAVX512BW-NEXT:    retq
498  %b = call i32 @llvm.bitreverse.i32(i32 %a)
499  ret i32 %b
500}
501
; Scalar i64 bit reversal through the llvm.bitreverse.i64 intrinsic.
; Expected lowering recorded by the checks below
;  - SSE, AVX and every GFNI configuration share one scalar sequence that
;    starts with bswapq, then swaps nibbles, bit pairs and adjacent bits.
;    Each 64-bit mask (0x0F0F..., 0x3333..., 0x5555...) is materialised with
;    movabsq, and the last two merges are folded into scaled leaq
;    instructions.
;  - XOP configurations use vmovq, a single vpperm with a constant-pool
;    operand, and vmovq back to rax.
502define i64 @test_bitreverse_i64(i64 %a) nounwind {
503; SSE-LABEL: test_bitreverse_i64:
504; SSE:       # %bb.0:
505; SSE-NEXT:    bswapq %rdi
506; SSE-NEXT:    movq %rdi, %rax
507; SSE-NEXT:    shrq $4, %rax
508; SSE-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
509; SSE-NEXT:    andq %rcx, %rax
510; SSE-NEXT:    andq %rcx, %rdi
511; SSE-NEXT:    shlq $4, %rdi
512; SSE-NEXT:    orq %rax, %rdi
513; SSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
514; SSE-NEXT:    movq %rdi, %rcx
515; SSE-NEXT:    andq %rax, %rcx
516; SSE-NEXT:    shrq $2, %rdi
517; SSE-NEXT:    andq %rax, %rdi
518; SSE-NEXT:    leaq (%rdi,%rcx,4), %rax
519; SSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
520; SSE-NEXT:    movq %rax, %rdx
521; SSE-NEXT:    andq %rcx, %rdx
522; SSE-NEXT:    shrq %rax
523; SSE-NEXT:    andq %rcx, %rax
524; SSE-NEXT:    leaq (%rax,%rdx,2), %rax
525; SSE-NEXT:    retq
526;
527; AVX-LABEL: test_bitreverse_i64:
528; AVX:       # %bb.0:
529; AVX-NEXT:    bswapq %rdi
530; AVX-NEXT:    movq %rdi, %rax
531; AVX-NEXT:    shrq $4, %rax
532; AVX-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
533; AVX-NEXT:    andq %rcx, %rax
534; AVX-NEXT:    andq %rcx, %rdi
535; AVX-NEXT:    shlq $4, %rdi
536; AVX-NEXT:    orq %rax, %rdi
537; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
538; AVX-NEXT:    movq %rdi, %rcx
539; AVX-NEXT:    andq %rax, %rcx
540; AVX-NEXT:    shrq $2, %rdi
541; AVX-NEXT:    andq %rax, %rdi
542; AVX-NEXT:    leaq (%rdi,%rcx,4), %rax
543; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
544; AVX-NEXT:    movq %rax, %rdx
545; AVX-NEXT:    andq %rcx, %rdx
546; AVX-NEXT:    shrq %rax
547; AVX-NEXT:    andq %rcx, %rax
548; AVX-NEXT:    leaq (%rax,%rdx,2), %rax
549; AVX-NEXT:    retq
550;
551; XOP-LABEL: test_bitreverse_i64:
552; XOP:       # %bb.0:
553; XOP-NEXT:    vmovq %rdi, %xmm0
554; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
555; XOP-NEXT:    vmovq %xmm0, %rax
556; XOP-NEXT:    retq
557;
558; GFNISSE-LABEL: test_bitreverse_i64:
559; GFNISSE:       # %bb.0:
560; GFNISSE-NEXT:    bswapq %rdi
561; GFNISSE-NEXT:    movq %rdi, %rax
562; GFNISSE-NEXT:    shrq $4, %rax
563; GFNISSE-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
564; GFNISSE-NEXT:    andq %rcx, %rax
565; GFNISSE-NEXT:    andq %rcx, %rdi
566; GFNISSE-NEXT:    shlq $4, %rdi
567; GFNISSE-NEXT:    orq %rax, %rdi
568; GFNISSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
569; GFNISSE-NEXT:    movq %rdi, %rcx
570; GFNISSE-NEXT:    andq %rax, %rcx
571; GFNISSE-NEXT:    shrq $2, %rdi
572; GFNISSE-NEXT:    andq %rax, %rdi
573; GFNISSE-NEXT:    leaq (%rdi,%rcx,4), %rax
574; GFNISSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
575; GFNISSE-NEXT:    movq %rax, %rdx
576; GFNISSE-NEXT:    andq %rcx, %rdx
577; GFNISSE-NEXT:    shrq %rax
578; GFNISSE-NEXT:    andq %rcx, %rax
579; GFNISSE-NEXT:    leaq (%rax,%rdx,2), %rax
580; GFNISSE-NEXT:    retq
581;
582; GFNIAVX-LABEL: test_bitreverse_i64:
583; GFNIAVX:       # %bb.0:
584; GFNIAVX-NEXT:    bswapq %rdi
585; GFNIAVX-NEXT:    movq %rdi, %rax
586; GFNIAVX-NEXT:    shrq $4, %rax
587; GFNIAVX-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
588; GFNIAVX-NEXT:    andq %rcx, %rax
589; GFNIAVX-NEXT:    andq %rcx, %rdi
590; GFNIAVX-NEXT:    shlq $4, %rdi
591; GFNIAVX-NEXT:    orq %rax, %rdi
592; GFNIAVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
593; GFNIAVX-NEXT:    movq %rdi, %rcx
594; GFNIAVX-NEXT:    andq %rax, %rcx
595; GFNIAVX-NEXT:    shrq $2, %rdi
596; GFNIAVX-NEXT:    andq %rax, %rdi
597; GFNIAVX-NEXT:    leaq (%rdi,%rcx,4), %rax
598; GFNIAVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
599; GFNIAVX-NEXT:    movq %rax, %rdx
600; GFNIAVX-NEXT:    andq %rcx, %rdx
601; GFNIAVX-NEXT:    shrq %rax
602; GFNIAVX-NEXT:    andq %rcx, %rax
603; GFNIAVX-NEXT:    leaq (%rax,%rdx,2), %rax
604; GFNIAVX-NEXT:    retq
605;
606; GFNIAVX2-LABEL: test_bitreverse_i64:
607; GFNIAVX2:       # %bb.0:
608; GFNIAVX2-NEXT:    bswapq %rdi
609; GFNIAVX2-NEXT:    movq %rdi, %rax
610; GFNIAVX2-NEXT:    shrq $4, %rax
611; GFNIAVX2-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
612; GFNIAVX2-NEXT:    andq %rcx, %rax
613; GFNIAVX2-NEXT:    andq %rcx, %rdi
614; GFNIAVX2-NEXT:    shlq $4, %rdi
615; GFNIAVX2-NEXT:    orq %rax, %rdi
616; GFNIAVX2-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
617; GFNIAVX2-NEXT:    movq %rdi, %rcx
618; GFNIAVX2-NEXT:    andq %rax, %rcx
619; GFNIAVX2-NEXT:    shrq $2, %rdi
620; GFNIAVX2-NEXT:    andq %rax, %rdi
621; GFNIAVX2-NEXT:    leaq (%rdi,%rcx,4), %rax
622; GFNIAVX2-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
623; GFNIAVX2-NEXT:    movq %rax, %rdx
624; GFNIAVX2-NEXT:    andq %rcx, %rdx
625; GFNIAVX2-NEXT:    shrq %rax
626; GFNIAVX2-NEXT:    andq %rcx, %rax
627; GFNIAVX2-NEXT:    leaq (%rax,%rdx,2), %rax
628; GFNIAVX2-NEXT:    retq
629;
630; GFNIAVX512F-LABEL: test_bitreverse_i64:
631; GFNIAVX512F:       # %bb.0:
632; GFNIAVX512F-NEXT:    bswapq %rdi
633; GFNIAVX512F-NEXT:    movq %rdi, %rax
634; GFNIAVX512F-NEXT:    shrq $4, %rax
635; GFNIAVX512F-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
636; GFNIAVX512F-NEXT:    andq %rcx, %rax
637; GFNIAVX512F-NEXT:    andq %rcx, %rdi
638; GFNIAVX512F-NEXT:    shlq $4, %rdi
639; GFNIAVX512F-NEXT:    orq %rax, %rdi
640; GFNIAVX512F-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
641; GFNIAVX512F-NEXT:    movq %rdi, %rcx
642; GFNIAVX512F-NEXT:    andq %rax, %rcx
643; GFNIAVX512F-NEXT:    shrq $2, %rdi
644; GFNIAVX512F-NEXT:    andq %rax, %rdi
645; GFNIAVX512F-NEXT:    leaq (%rdi,%rcx,4), %rax
646; GFNIAVX512F-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
647; GFNIAVX512F-NEXT:    movq %rax, %rdx
648; GFNIAVX512F-NEXT:    andq %rcx, %rdx
649; GFNIAVX512F-NEXT:    shrq %rax
650; GFNIAVX512F-NEXT:    andq %rcx, %rax
651; GFNIAVX512F-NEXT:    leaq (%rax,%rdx,2), %rax
652; GFNIAVX512F-NEXT:    retq
653;
654; GFNIAVX512BW-LABEL: test_bitreverse_i64:
655; GFNIAVX512BW:       # %bb.0:
656; GFNIAVX512BW-NEXT:    bswapq %rdi
657; GFNIAVX512BW-NEXT:    movq %rdi, %rax
658; GFNIAVX512BW-NEXT:    shrq $4, %rax
659; GFNIAVX512BW-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
660; GFNIAVX512BW-NEXT:    andq %rcx, %rax
661; GFNIAVX512BW-NEXT:    andq %rcx, %rdi
662; GFNIAVX512BW-NEXT:    shlq $4, %rdi
663; GFNIAVX512BW-NEXT:    orq %rax, %rdi
664; GFNIAVX512BW-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
665; GFNIAVX512BW-NEXT:    movq %rdi, %rcx
666; GFNIAVX512BW-NEXT:    andq %rax, %rcx
667; GFNIAVX512BW-NEXT:    shrq $2, %rdi
668; GFNIAVX512BW-NEXT:    andq %rax, %rdi
669; GFNIAVX512BW-NEXT:    leaq (%rdi,%rcx,4), %rax
670; GFNIAVX512BW-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
671; GFNIAVX512BW-NEXT:    movq %rax, %rdx
672; GFNIAVX512BW-NEXT:    andq %rcx, %rdx
673; GFNIAVX512BW-NEXT:    shrq %rax
674; GFNIAVX512BW-NEXT:    andq %rcx, %rax
675; GFNIAVX512BW-NEXT:    leaq (%rax,%rdx,2), %rax
676; GFNIAVX512BW-NEXT:    retq
677  %b = call i64 @llvm.bitreverse.i64(i64 %a)
678  ret i64 %b
679}
680
; Vector <16 x i8> bit reversal through the llvm.bitreverse.v16i8 intrinsic.
; Expected lowering recorded by the checks below
;  - With only SSE2, a shift-and-mask sequence swaps nibbles, then bit pairs
;    (mask 51 per byte), then adjacent bits (mask 85 per byte).
;  - With SSSE3 or any AVX level, the low and high nibbles are each looked up
;    in a 16-entry pshufb table and the two results are OR-ed together.
;  - With XOP, a single vpperm with a constant-pool operand.
;  - With GFNI, a single gf2p8affineqb (or its VEX form vgf2p8affineqb) with
;    immediate 0 and a constant-pool matrix operand.
681define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
682; SSE2-LABEL: test_bitreverse_v16i8:
683; SSE2:       # %bb.0:
684; SSE2-NEXT:    movdqa %xmm0, %xmm1
685; SSE2-NEXT:    psllw $4, %xmm1
686; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
687; SSE2-NEXT:    psrlw $4, %xmm0
688; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
689; SSE2-NEXT:    por %xmm1, %xmm0
690; SSE2-NEXT:    movdqa %xmm0, %xmm1
691; SSE2-NEXT:    psrlw $2, %xmm1
692; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
693; SSE2-NEXT:    pand %xmm2, %xmm1
694; SSE2-NEXT:    pand %xmm2, %xmm0
695; SSE2-NEXT:    psllw $2, %xmm0
696; SSE2-NEXT:    por %xmm1, %xmm0
697; SSE2-NEXT:    movdqa %xmm0, %xmm1
698; SSE2-NEXT:    psrlw $1, %xmm1
699; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
700; SSE2-NEXT:    pand %xmm2, %xmm1
701; SSE2-NEXT:    pand %xmm2, %xmm0
702; SSE2-NEXT:    paddb %xmm0, %xmm0
703; SSE2-NEXT:    por %xmm1, %xmm0
704; SSE2-NEXT:    retq
705;
706; SSSE3-LABEL: test_bitreverse_v16i8:
707; SSSE3:       # %bb.0:
708; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
709; SSSE3-NEXT:    movdqa %xmm0, %xmm2
710; SSSE3-NEXT:    pand %xmm1, %xmm2
711; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
712; SSSE3-NEXT:    pshufb %xmm2, %xmm3
713; SSSE3-NEXT:    psrlw $4, %xmm0
714; SSSE3-NEXT:    pand %xmm1, %xmm0
715; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
716; SSSE3-NEXT:    pshufb %xmm0, %xmm1
717; SSSE3-NEXT:    por %xmm3, %xmm1
718; SSSE3-NEXT:    movdqa %xmm1, %xmm0
719; SSSE3-NEXT:    retq
720;
721; AVX-LABEL: test_bitreverse_v16i8:
722; AVX:       # %bb.0:
723; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
724; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
725; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
726; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
727; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
728; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
729; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
730; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
731; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
732; AVX-NEXT:    retq
733;
734; XOP-LABEL: test_bitreverse_v16i8:
735; XOP:       # %bb.0:
736; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
737; XOP-NEXT:    retq
738;
739; GFNISSE-LABEL: test_bitreverse_v16i8:
740; GFNISSE:       # %bb.0:
741; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
742; GFNISSE-NEXT:    retq
743;
744; GFNIAVX-LABEL: test_bitreverse_v16i8:
745; GFNIAVX:       # %bb.0:
746; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
747; GFNIAVX-NEXT:    retq
748;
749; GFNIAVX2-LABEL: test_bitreverse_v16i8:
750; GFNIAVX2:       # %bb.0:
751; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
752; GFNIAVX2-NEXT:    retq
753;
754; GFNIAVX512F-LABEL: test_bitreverse_v16i8:
755; GFNIAVX512F:       # %bb.0:
756; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
757; GFNIAVX512F-NEXT:    retq
758;
759; GFNIAVX512BW-LABEL: test_bitreverse_v16i8:
760; GFNIAVX512BW:       # %bb.0:
761; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
762; GFNIAVX512BW-NEXT:    retq
763  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
764  ret <16 x i8> %b
765}
766
; Bit-reverses every i16 lane of a 128-bit vector with @llvm.bitreverse.v8i16.
; The assertions inside the body are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Lowering expected by the checks below
;  - SSE2 swaps the two bytes of each word (psrlw/psllw $8 + por), then swaps
;    nibbles, bit pairs and single bits with shift/mask/or steps.
;  - SSSE3 and AVX swap the bytes with pshufb, then reverse each byte through
;    two 16-entry pshufb table lookups (one per nibble) that are ORed together.
;  - XOP expects a single vpperm whose selector comes from the constant pool.
;  - The GFNI prefixes swap the bytes with (v)pshufb and then apply
;    (v)gf2p8affineqb with a constant-pool operand.
767define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
768; SSE2-LABEL: test_bitreverse_v8i16:
769; SSE2:       # %bb.0:
770; SSE2-NEXT:    movdqa %xmm0, %xmm1
771; SSE2-NEXT:    psrlw $8, %xmm1
772; SSE2-NEXT:    psllw $8, %xmm0
773; SSE2-NEXT:    por %xmm1, %xmm0
774; SSE2-NEXT:    movdqa %xmm0, %xmm1
775; SSE2-NEXT:    psllw $4, %xmm1
776; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
777; SSE2-NEXT:    psrlw $4, %xmm0
778; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
779; SSE2-NEXT:    por %xmm1, %xmm0
780; SSE2-NEXT:    movdqa %xmm0, %xmm1
781; SSE2-NEXT:    psrlw $2, %xmm1
782; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
783; SSE2-NEXT:    pand %xmm2, %xmm1
784; SSE2-NEXT:    pand %xmm2, %xmm0
785; SSE2-NEXT:    psllw $2, %xmm0
786; SSE2-NEXT:    por %xmm1, %xmm0
787; SSE2-NEXT:    movdqa %xmm0, %xmm1
788; SSE2-NEXT:    psrlw $1, %xmm1
789; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
790; SSE2-NEXT:    pand %xmm2, %xmm1
791; SSE2-NEXT:    pand %xmm2, %xmm0
792; SSE2-NEXT:    paddb %xmm0, %xmm0
793; SSE2-NEXT:    por %xmm1, %xmm0
794; SSE2-NEXT:    retq
795;
796; SSSE3-LABEL: test_bitreverse_v8i16:
797; SSSE3:       # %bb.0:
798; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
799; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
800; SSSE3-NEXT:    movdqa %xmm0, %xmm2
801; SSSE3-NEXT:    pand %xmm1, %xmm2
802; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
803; SSSE3-NEXT:    pshufb %xmm2, %xmm3
804; SSSE3-NEXT:    psrlw $4, %xmm0
805; SSSE3-NEXT:    pand %xmm1, %xmm0
806; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
807; SSSE3-NEXT:    pshufb %xmm0, %xmm1
808; SSSE3-NEXT:    por %xmm3, %xmm1
809; SSSE3-NEXT:    movdqa %xmm1, %xmm0
810; SSSE3-NEXT:    retq
811;
812; AVX-LABEL: test_bitreverse_v8i16:
813; AVX:       # %bb.0:
814; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
815; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
816; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
817; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
818; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
819; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
820; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
821; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
822; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
823; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
824; AVX-NEXT:    retq
825;
826; XOP-LABEL: test_bitreverse_v8i16:
827; XOP:       # %bb.0:
828; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
829; XOP-NEXT:    retq
830;
831; GFNISSE-LABEL: test_bitreverse_v8i16:
832; GFNISSE:       # %bb.0:
833; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
834; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
835; GFNISSE-NEXT:    retq
836;
837; GFNIAVX-LABEL: test_bitreverse_v8i16:
838; GFNIAVX:       # %bb.0:
839; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
840; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
841; GFNIAVX-NEXT:    retq
842;
843; GFNIAVX2-LABEL: test_bitreverse_v8i16:
844; GFNIAVX2:       # %bb.0:
845; GFNIAVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
846; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
847; GFNIAVX2-NEXT:    retq
848;
849; GFNIAVX512F-LABEL: test_bitreverse_v8i16:
850; GFNIAVX512F:       # %bb.0:
851; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
852; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
853; GFNIAVX512F-NEXT:    retq
854;
855; GFNIAVX512BW-LABEL: test_bitreverse_v8i16:
856; GFNIAVX512BW:       # %bb.0:
857; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
858; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
859; GFNIAVX512BW-NEXT:    retq
860  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
861  ret <8 x i16> %b
862}
863
; Bit-reverses every i32 lane of a 128-bit vector with @llvm.bitreverse.v4i32.
; The assertions inside the body are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Lowering expected by the checks below
;  - SSE2 reverses the byte order of each dword by unpacking bytes to words
;    against a zero register, reversing the words with pshuflw/pshufhw and
;    repacking with packuswb; it then swaps nibbles, bit pairs and single bits
;    with shift/mask/or steps.
;  - SSSE3 and AVX reverse the bytes with pshufb, then reverse each byte
;    through two 16-entry pshufb table lookups that are ORed together.
;  - XOP expects a single vpperm whose selector comes from the constant pool.
;  - The GFNI prefixes reverse the bytes with (v)pshufb and then apply
;    (v)gf2p8affineqb with a constant-pool operand.
864define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
865; SSE2-LABEL: test_bitreverse_v4i32:
866; SSE2:       # %bb.0:
867; SSE2-NEXT:    pxor %xmm1, %xmm1
868; SSE2-NEXT:    movdqa %xmm0, %xmm2
869; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
870; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
871; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
872; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
873; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
874; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
875; SSE2-NEXT:    packuswb %xmm2, %xmm0
876; SSE2-NEXT:    movdqa %xmm0, %xmm1
877; SSE2-NEXT:    psllw $4, %xmm1
878; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
879; SSE2-NEXT:    psrlw $4, %xmm0
880; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
881; SSE2-NEXT:    por %xmm1, %xmm0
882; SSE2-NEXT:    movdqa %xmm0, %xmm1
883; SSE2-NEXT:    psrlw $2, %xmm1
884; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
885; SSE2-NEXT:    pand %xmm2, %xmm1
886; SSE2-NEXT:    pand %xmm2, %xmm0
887; SSE2-NEXT:    psllw $2, %xmm0
888; SSE2-NEXT:    por %xmm1, %xmm0
889; SSE2-NEXT:    movdqa %xmm0, %xmm1
890; SSE2-NEXT:    psrlw $1, %xmm1
891; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
892; SSE2-NEXT:    pand %xmm2, %xmm1
893; SSE2-NEXT:    pand %xmm2, %xmm0
894; SSE2-NEXT:    paddb %xmm0, %xmm0
895; SSE2-NEXT:    por %xmm1, %xmm0
896; SSE2-NEXT:    retq
897;
898; SSSE3-LABEL: test_bitreverse_v4i32:
899; SSSE3:       # %bb.0:
900; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
901; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
902; SSSE3-NEXT:    movdqa %xmm0, %xmm2
903; SSSE3-NEXT:    pand %xmm1, %xmm2
904; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
905; SSSE3-NEXT:    pshufb %xmm2, %xmm3
906; SSSE3-NEXT:    psrlw $4, %xmm0
907; SSSE3-NEXT:    pand %xmm1, %xmm0
908; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
909; SSSE3-NEXT:    pshufb %xmm0, %xmm1
910; SSSE3-NEXT:    por %xmm3, %xmm1
911; SSSE3-NEXT:    movdqa %xmm1, %xmm0
912; SSSE3-NEXT:    retq
913;
914; AVX-LABEL: test_bitreverse_v4i32:
915; AVX:       # %bb.0:
916; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
917; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
918; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
919; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
920; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
921; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
922; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
923; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
924; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
925; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
926; AVX-NEXT:    retq
927;
928; XOP-LABEL: test_bitreverse_v4i32:
929; XOP:       # %bb.0:
930; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
931; XOP-NEXT:    retq
932;
933; GFNISSE-LABEL: test_bitreverse_v4i32:
934; GFNISSE:       # %bb.0:
935; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
936; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
937; GFNISSE-NEXT:    retq
938;
939; GFNIAVX-LABEL: test_bitreverse_v4i32:
940; GFNIAVX:       # %bb.0:
941; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
942; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
943; GFNIAVX-NEXT:    retq
944;
945; GFNIAVX2-LABEL: test_bitreverse_v4i32:
946; GFNIAVX2:       # %bb.0:
947; GFNIAVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
948; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
949; GFNIAVX2-NEXT:    retq
950;
951; GFNIAVX512F-LABEL: test_bitreverse_v4i32:
952; GFNIAVX512F:       # %bb.0:
953; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
954; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
955; GFNIAVX512F-NEXT:    retq
956;
957; GFNIAVX512BW-LABEL: test_bitreverse_v4i32:
958; GFNIAVX512BW:       # %bb.0:
959; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
960; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
961; GFNIAVX512BW-NEXT:    retq
962  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
963  ret <4 x i32> %b
964}
965
; Bit-reverses every i64 lane of a 128-bit vector with @llvm.bitreverse.v2i64.
; The assertions inside the body are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Lowering expected by the checks below
;  - SSE2 reverses the byte order of each qword by unpacking bytes to words
;    against a zero register, swapping dword pairs with pshufd, reversing the
;    words with pshuflw/pshufhw and repacking with packuswb; it then swaps
;    nibbles, bit pairs and single bits with shift/mask/or steps.
;  - SSSE3 and AVX reverse the bytes with pshufb, then reverse each byte
;    through two 16-entry pshufb table lookups that are ORed together.
;  - XOP expects a single vpperm whose selector comes from the constant pool.
;  - The GFNI prefixes reverse the bytes with (v)pshufb and then apply
;    (v)gf2p8affineqb with a constant-pool operand.
966define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
967; SSE2-LABEL: test_bitreverse_v2i64:
968; SSE2:       # %bb.0:
969; SSE2-NEXT:    pxor %xmm1, %xmm1
970; SSE2-NEXT:    movdqa %xmm0, %xmm2
971; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
972; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
973; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
974; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
975; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
976; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
977; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
978; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
979; SSE2-NEXT:    packuswb %xmm2, %xmm0
980; SSE2-NEXT:    movdqa %xmm0, %xmm1
981; SSE2-NEXT:    psllw $4, %xmm1
982; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
983; SSE2-NEXT:    psrlw $4, %xmm0
984; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
985; SSE2-NEXT:    por %xmm1, %xmm0
986; SSE2-NEXT:    movdqa %xmm0, %xmm1
987; SSE2-NEXT:    psrlw $2, %xmm1
988; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
989; SSE2-NEXT:    pand %xmm2, %xmm1
990; SSE2-NEXT:    pand %xmm2, %xmm0
991; SSE2-NEXT:    psllw $2, %xmm0
992; SSE2-NEXT:    por %xmm1, %xmm0
993; SSE2-NEXT:    movdqa %xmm0, %xmm1
994; SSE2-NEXT:    psrlw $1, %xmm1
995; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
996; SSE2-NEXT:    pand %xmm2, %xmm1
997; SSE2-NEXT:    pand %xmm2, %xmm0
998; SSE2-NEXT:    paddb %xmm0, %xmm0
999; SSE2-NEXT:    por %xmm1, %xmm0
1000; SSE2-NEXT:    retq
1001;
1002; SSSE3-LABEL: test_bitreverse_v2i64:
1003; SSSE3:       # %bb.0:
1004; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1005; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1006; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1007; SSSE3-NEXT:    pand %xmm1, %xmm2
1008; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1009; SSSE3-NEXT:    pshufb %xmm2, %xmm3
1010; SSSE3-NEXT:    psrlw $4, %xmm0
1011; SSSE3-NEXT:    pand %xmm1, %xmm0
1012; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1013; SSSE3-NEXT:    pshufb %xmm0, %xmm1
1014; SSSE3-NEXT:    por %xmm3, %xmm1
1015; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1016; SSSE3-NEXT:    retq
1017;
1018; AVX-LABEL: test_bitreverse_v2i64:
1019; AVX:       # %bb.0:
1020; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1021; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1022; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
1023; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1024; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1025; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
1026; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
1027; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1028; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
1029; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
1030; AVX-NEXT:    retq
1031;
1032; XOP-LABEL: test_bitreverse_v2i64:
1033; XOP:       # %bb.0:
1034; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
1035; XOP-NEXT:    retq
1036;
1037; GFNISSE-LABEL: test_bitreverse_v2i64:
1038; GFNISSE:       # %bb.0:
1039; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1040; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1041; GFNISSE-NEXT:    retq
1042;
1043; GFNIAVX-LABEL: test_bitreverse_v2i64:
1044; GFNIAVX:       # %bb.0:
1045; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1046; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1047; GFNIAVX-NEXT:    retq
1048;
1049; GFNIAVX2-LABEL: test_bitreverse_v2i64:
1050; GFNIAVX2:       # %bb.0:
1051; GFNIAVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1052; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1053; GFNIAVX2-NEXT:    retq
1054;
1055; GFNIAVX512F-LABEL: test_bitreverse_v2i64:
1056; GFNIAVX512F:       # %bb.0:
1057; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1058; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1059; GFNIAVX512F-NEXT:    retq
1060;
1061; GFNIAVX512BW-LABEL: test_bitreverse_v2i64:
1062; GFNIAVX512BW:       # %bb.0:
1063; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1064; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1065; GFNIAVX512BW-NEXT:    retq
1066  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
1067  ret <2 x i64> %b
1068}
1069
; Bit-reverses every i8 lane of a 256-bit vector with @llvm.bitreverse.v32i8.
; The assertions inside the body are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Lowering expected by the checks below (no byte shuffle is needed for i8 lanes)
;  - SSE2 and SSSE3 handle the two 128-bit halves in xmm0 and xmm1 separately,
;    sharing the mask/table constants between both halves.
;  - AVX1, XOPAVX1, XOPAVX2 and GFNIAVX split the ymm input into 128-bit halves
;    with vextractf128/vextracti128, process each half and rejoin them with
;    vinsertf128/vinserti128.
;  - AVX2 and AVX512 run the two-table vpshufb nibble lookup on the full ymm.
;  - GFNIAVX2, GFNIAVX512F and GFNIAVX512BW expect one ymm vgf2p8affineqb whose
;    operand is a vpbroadcastq of 9241421688590303745 (0x8040201008040201).
1070define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
1071; SSE2-LABEL: test_bitreverse_v32i8:
1072; SSE2:       # %bb.0:
1073; SSE2-NEXT:    movdqa %xmm1, %xmm2
1074; SSE2-NEXT:    movdqa %xmm0, %xmm3
1075; SSE2-NEXT:    psllw $4, %xmm3
1076; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1077; SSE2-NEXT:    movdqa %xmm1, %xmm4
1078; SSE2-NEXT:    pandn %xmm3, %xmm4
1079; SSE2-NEXT:    psrlw $4, %xmm0
1080; SSE2-NEXT:    pand %xmm1, %xmm0
1081; SSE2-NEXT:    por %xmm4, %xmm0
1082; SSE2-NEXT:    movdqa %xmm0, %xmm4
1083; SSE2-NEXT:    psrlw $2, %xmm4
1084; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1085; SSE2-NEXT:    pand %xmm3, %xmm4
1086; SSE2-NEXT:    pand %xmm3, %xmm0
1087; SSE2-NEXT:    psllw $2, %xmm0
1088; SSE2-NEXT:    por %xmm4, %xmm0
1089; SSE2-NEXT:    movdqa %xmm0, %xmm5
1090; SSE2-NEXT:    psrlw $1, %xmm5
1091; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1092; SSE2-NEXT:    pand %xmm4, %xmm5
1093; SSE2-NEXT:    pand %xmm4, %xmm0
1094; SSE2-NEXT:    paddb %xmm0, %xmm0
1095; SSE2-NEXT:    por %xmm5, %xmm0
1096; SSE2-NEXT:    movdqa %xmm2, %xmm5
1097; SSE2-NEXT:    psllw $4, %xmm5
1098; SSE2-NEXT:    psrlw $4, %xmm2
1099; SSE2-NEXT:    pand %xmm1, %xmm2
1100; SSE2-NEXT:    pandn %xmm5, %xmm1
1101; SSE2-NEXT:    por %xmm2, %xmm1
1102; SSE2-NEXT:    movdqa %xmm1, %xmm2
1103; SSE2-NEXT:    psrlw $2, %xmm2
1104; SSE2-NEXT:    pand %xmm3, %xmm2
1105; SSE2-NEXT:    pand %xmm3, %xmm1
1106; SSE2-NEXT:    psllw $2, %xmm1
1107; SSE2-NEXT:    por %xmm2, %xmm1
1108; SSE2-NEXT:    movdqa %xmm1, %xmm2
1109; SSE2-NEXT:    psrlw $1, %xmm2
1110; SSE2-NEXT:    pand %xmm4, %xmm2
1111; SSE2-NEXT:    pand %xmm4, %xmm1
1112; SSE2-NEXT:    paddb %xmm1, %xmm1
1113; SSE2-NEXT:    por %xmm2, %xmm1
1114; SSE2-NEXT:    retq
1115;
1116; SSSE3-LABEL: test_bitreverse_v32i8:
1117; SSSE3:       # %bb.0:
1118; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1119; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1120; SSSE3-NEXT:    pand %xmm4, %xmm2
1121; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1122; SSSE3-NEXT:    movdqa %xmm5, %xmm6
1123; SSSE3-NEXT:    pshufb %xmm2, %xmm6
1124; SSSE3-NEXT:    psrlw $4, %xmm0
1125; SSSE3-NEXT:    pand %xmm4, %xmm0
1126; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1127; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1128; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1129; SSSE3-NEXT:    por %xmm6, %xmm3
1130; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1131; SSSE3-NEXT:    pand %xmm4, %xmm0
1132; SSSE3-NEXT:    pshufb %xmm0, %xmm5
1133; SSSE3-NEXT:    psrlw $4, %xmm1
1134; SSSE3-NEXT:    pand %xmm4, %xmm1
1135; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1136; SSSE3-NEXT:    por %xmm5, %xmm2
1137; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1138; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1139; SSSE3-NEXT:    retq
1140;
1141; AVX1-LABEL: test_bitreverse_v32i8:
1142; AVX1:       # %bb.0:
1143; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1144; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1145; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
1146; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1147; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
1148; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1149; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
1150; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1151; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
1152; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
1153; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
1154; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
1155; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1156; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
1157; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
1158; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
1159; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1160; AVX1-NEXT:    retq
1161;
1162; AVX2-LABEL: test_bitreverse_v32i8:
1163; AVX2:       # %bb.0:
1164; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1165; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1166; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1167; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1168; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1169; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1170; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1171; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1172; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1173; AVX2-NEXT:    retq
1174;
1175; AVX512-LABEL: test_bitreverse_v32i8:
1176; AVX512:       # %bb.0:
1177; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1178; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1179; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1180; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1181; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1182; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1183; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1184; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1185; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1186; AVX512-NEXT:    retq
1187;
1188; XOPAVX1-LABEL: test_bitreverse_v32i8:
1189; XOPAVX1:       # %bb.0:
1190; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1191; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1192; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1193; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1194; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1195; XOPAVX1-NEXT:    retq
1196;
1197; XOPAVX2-LABEL: test_bitreverse_v32i8:
1198; XOPAVX2:       # %bb.0:
1199; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1200; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1201; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1202; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1203; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1204; XOPAVX2-NEXT:    retq
1205;
1206; GFNISSE-LABEL: test_bitreverse_v32i8:
1207; GFNISSE:       # %bb.0:
1208; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
1209; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
1210; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
1211; GFNISSE-NEXT:    retq
1212;
1213; GFNIAVX-LABEL: test_bitreverse_v32i8:
1214; GFNIAVX:       # %bb.0:
1215; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
1216; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
1217; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm2, %xmm1, %xmm1
1218; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm2, %xmm0, %xmm0
1219; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1220; GFNIAVX-NEXT:    retq
1221;
1222; GFNIAVX2-LABEL: test_bitreverse_v32i8:
1223; GFNIAVX2:       # %bb.0:
1224; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1225; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1226; GFNIAVX2-NEXT:    retq
1227;
1228; GFNIAVX512F-LABEL: test_bitreverse_v32i8:
1229; GFNIAVX512F:       # %bb.0:
1230; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1231; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1232; GFNIAVX512F-NEXT:    retq
1233;
1234; GFNIAVX512BW-LABEL: test_bitreverse_v32i8:
1235; GFNIAVX512BW:       # %bb.0:
1236; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1237; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1238; GFNIAVX512BW-NEXT:    retq
1239  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
1240  ret <32 x i8> %b
1241}
1242
; Bit-reverses every i16 lane of a 256-bit vector with @llvm.bitreverse.v16i16.
; The assertions inside the body are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Lowering expected by the checks below
;  - SSE2 handles xmm0 and xmm1 separately; each half swaps the bytes of every
;    word (psrlw/psllw $8 + por) and then swaps nibbles, bit pairs and bits.
;  - SSSE3 handles both halves with a shared pshufb byte-swap mask followed by
;    the two-table pshufb nibble lookup.
;  - AVX1, XOPAVX1, XOPAVX2 and GFNIAVX split the ymm input into 128-bit halves
;    with vextractf128/vextracti128, process each half and rejoin them with
;    vinsertf128/vinserti128.
;  - AVX2 and AVX512 byte-swap with a ymm vpshufb and run the nibble lookup on
;    the full register.
;  - GFNIAVX2, GFNIAVX512F and GFNIAVX512BW byte-swap with a ymm vpshufb and
;    then expect one vgf2p8affineqb with a vpbroadcastq constant operand.
1243define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
1244; SSE2-LABEL: test_bitreverse_v16i16:
1245; SSE2:       # %bb.0:
1246; SSE2-NEXT:    movdqa %xmm1, %xmm2
1247; SSE2-NEXT:    movdqa %xmm0, %xmm1
1248; SSE2-NEXT:    psrlw $8, %xmm1
1249; SSE2-NEXT:    psllw $8, %xmm0
1250; SSE2-NEXT:    por %xmm1, %xmm0
1251; SSE2-NEXT:    movdqa %xmm0, %xmm3
1252; SSE2-NEXT:    psllw $4, %xmm3
1253; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1254; SSE2-NEXT:    movdqa %xmm1, %xmm4
1255; SSE2-NEXT:    pandn %xmm3, %xmm4
1256; SSE2-NEXT:    psrlw $4, %xmm0
1257; SSE2-NEXT:    pand %xmm1, %xmm0
1258; SSE2-NEXT:    por %xmm4, %xmm0
1259; SSE2-NEXT:    movdqa %xmm0, %xmm4
1260; SSE2-NEXT:    psrlw $2, %xmm4
1261; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1262; SSE2-NEXT:    pand %xmm3, %xmm4
1263; SSE2-NEXT:    pand %xmm3, %xmm0
1264; SSE2-NEXT:    psllw $2, %xmm0
1265; SSE2-NEXT:    por %xmm4, %xmm0
1266; SSE2-NEXT:    movdqa %xmm0, %xmm5
1267; SSE2-NEXT:    psrlw $1, %xmm5
1268; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1269; SSE2-NEXT:    pand %xmm4, %xmm5
1270; SSE2-NEXT:    pand %xmm4, %xmm0
1271; SSE2-NEXT:    paddb %xmm0, %xmm0
1272; SSE2-NEXT:    por %xmm5, %xmm0
1273; SSE2-NEXT:    movdqa %xmm2, %xmm5
1274; SSE2-NEXT:    psrlw $8, %xmm5
1275; SSE2-NEXT:    psllw $8, %xmm2
1276; SSE2-NEXT:    por %xmm5, %xmm2
1277; SSE2-NEXT:    movdqa %xmm2, %xmm5
1278; SSE2-NEXT:    psllw $4, %xmm5
1279; SSE2-NEXT:    psrlw $4, %xmm2
1280; SSE2-NEXT:    pand %xmm1, %xmm2
1281; SSE2-NEXT:    pandn %xmm5, %xmm1
1282; SSE2-NEXT:    por %xmm2, %xmm1
1283; SSE2-NEXT:    movdqa %xmm1, %xmm2
1284; SSE2-NEXT:    psrlw $2, %xmm2
1285; SSE2-NEXT:    pand %xmm3, %xmm2
1286; SSE2-NEXT:    pand %xmm3, %xmm1
1287; SSE2-NEXT:    psllw $2, %xmm1
1288; SSE2-NEXT:    por %xmm2, %xmm1
1289; SSE2-NEXT:    movdqa %xmm1, %xmm2
1290; SSE2-NEXT:    psrlw $1, %xmm2
1291; SSE2-NEXT:    pand %xmm4, %xmm2
1292; SSE2-NEXT:    pand %xmm4, %xmm1
1293; SSE2-NEXT:    paddb %xmm1, %xmm1
1294; SSE2-NEXT:    por %xmm2, %xmm1
1295; SSE2-NEXT:    retq
1296;
1297; SSSE3-LABEL: test_bitreverse_v16i16:
1298; SSSE3:       # %bb.0:
1299; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1300; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1301; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1302; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1303; SSSE3-NEXT:    pand %xmm5, %xmm2
1304; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1305; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1306; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1307; SSSE3-NEXT:    psrlw $4, %xmm0
1308; SSSE3-NEXT:    pand %xmm5, %xmm0
1309; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1310; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1311; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1312; SSSE3-NEXT:    por %xmm7, %xmm3
1313; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1314; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1315; SSSE3-NEXT:    pand %xmm5, %xmm0
1316; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1317; SSSE3-NEXT:    psrlw $4, %xmm1
1318; SSSE3-NEXT:    pand %xmm5, %xmm1
1319; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1320; SSSE3-NEXT:    por %xmm6, %xmm2
1321; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1322; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1323; SSSE3-NEXT:    retq
1324;
1325; AVX1-LABEL: test_bitreverse_v16i16:
1326; AVX1:       # %bb.0:
1327; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1328; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1329; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1330; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1331; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1332; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1333; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1334; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1335; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1336; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1337; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1338; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1339; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1340; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1341; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1342; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1343; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1344; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1345; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1346; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1347; AVX1-NEXT:    retq
1348;
1349; AVX2-LABEL: test_bitreverse_v16i16:
1350; AVX2:       # %bb.0:
1351; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1352; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1353; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1354; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1355; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1356; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1357; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1358; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1359; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1360; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1361; AVX2-NEXT:    retq
1362;
1363; AVX512-LABEL: test_bitreverse_v16i16:
1364; AVX512:       # %bb.0:
1365; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1366; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1367; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1368; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1369; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1370; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1371; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1372; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1373; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1374; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1375; AVX512-NEXT:    retq
1376;
1377; XOPAVX1-LABEL: test_bitreverse_v16i16:
1378; XOPAVX1:       # %bb.0:
1379; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1380; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1381; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1382; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1383; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1384; XOPAVX1-NEXT:    retq
1385;
1386; XOPAVX2-LABEL: test_bitreverse_v16i16:
1387; XOPAVX2:       # %bb.0:
1388; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1389; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1390; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1391; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1392; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1393; XOPAVX2-NEXT:    retq
1394;
1395; GFNISSE-LABEL: test_bitreverse_v16i16:
1396; GFNISSE:       # %bb.0:
1397; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1398; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
1399; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1400; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
1401; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
1402; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
1403; GFNISSE-NEXT:    retq
1404;
1405; GFNIAVX-LABEL: test_bitreverse_v16i16:
1406; GFNIAVX:       # %bb.0:
1407; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
1408; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1409; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1410; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1411; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
1412; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1413; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
1414; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1415; GFNIAVX-NEXT:    retq
1416;
1417; GFNIAVX2-LABEL: test_bitreverse_v16i16:
1418; GFNIAVX2:       # %bb.0:
1419; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1420; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1421; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1422; GFNIAVX2-NEXT:    retq
1423;
1424; GFNIAVX512F-LABEL: test_bitreverse_v16i16:
1425; GFNIAVX512F:       # %bb.0:
1426; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1427; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1428; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1429; GFNIAVX512F-NEXT:    retq
1430;
1431; GFNIAVX512BW-LABEL: test_bitreverse_v16i16:
1432; GFNIAVX512BW:       # %bb.0:
1433; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1434; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1435; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1436; GFNIAVX512BW-NEXT:    retq
1437  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
1438  ret <16 x i16> %b
1439}
1440
; Bit-reverses every element of a 256-bit <8 x i32> vector with
; @llvm.bitreverse.v8i32 and pins the llc output of each configuration listed
; in the header of this file.
;
; What the recorded assembly shows, per feature level.
;   * SSE2 alone. The bytes of each i32 are put in reverse order with
;     punpckhbw/punpcklbw, pshuflw, pshufhw and packuswb; the bits inside each
;     byte are then swapped in three shift-and-mask rounds (mask 15 for
;     nibbles, 51 for bit pairs, 85 for single bits).
;   * SSSE3, AVX1, AVX2 and the AVX512 runs. A pshufb reverses the bytes of
;     each i32, then two 16-entry pshufb lookups (one per nibble) are ORed.
;   * XOP runs. One vpperm per 128-bit half with a constant selector.
;   * GFNI runs. A byte-reversing pshufb followed by gf2p8affineqb with the
;     matrix constant 9241421688590303745 (0x8040201008040201).
; Several configurations process the two 128-bit halves separately (xmm
; registers in the checks) instead of using a single ymm operation.
;
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1441define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
1442; SSE2-LABEL: test_bitreverse_v8i32:
1443; SSE2:       # %bb.0:
1444; SSE2-NEXT:    movdqa %xmm1, %xmm2
1445; SSE2-NEXT:    pxor %xmm3, %xmm3
1446; SSE2-NEXT:    movdqa %xmm0, %xmm1
1447; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
1448; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1449; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1450; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1451; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1452; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1453; SSE2-NEXT:    packuswb %xmm1, %xmm0
1454; SSE2-NEXT:    movdqa %xmm0, %xmm4
1455; SSE2-NEXT:    psllw $4, %xmm4
1456; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1457; SSE2-NEXT:    movdqa %xmm1, %xmm5
1458; SSE2-NEXT:    pandn %xmm4, %xmm5
1459; SSE2-NEXT:    psrlw $4, %xmm0
1460; SSE2-NEXT:    pand %xmm1, %xmm0
1461; SSE2-NEXT:    por %xmm5, %xmm0
1462; SSE2-NEXT:    movdqa %xmm0, %xmm5
1463; SSE2-NEXT:    psrlw $2, %xmm5
1464; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1465; SSE2-NEXT:    pand %xmm4, %xmm5
1466; SSE2-NEXT:    pand %xmm4, %xmm0
1467; SSE2-NEXT:    psllw $2, %xmm0
1468; SSE2-NEXT:    por %xmm5, %xmm0
1469; SSE2-NEXT:    movdqa %xmm0, %xmm6
1470; SSE2-NEXT:    psrlw $1, %xmm6
1471; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1472; SSE2-NEXT:    pand %xmm5, %xmm6
1473; SSE2-NEXT:    pand %xmm5, %xmm0
1474; SSE2-NEXT:    paddb %xmm0, %xmm0
1475; SSE2-NEXT:    por %xmm6, %xmm0
1476; SSE2-NEXT:    movdqa %xmm2, %xmm6
1477; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
1478; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1479; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1480; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
1481; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1482; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1483; SSE2-NEXT:    packuswb %xmm6, %xmm2
1484; SSE2-NEXT:    movdqa %xmm2, %xmm3
1485; SSE2-NEXT:    psllw $4, %xmm3
1486; SSE2-NEXT:    psrlw $4, %xmm2
1487; SSE2-NEXT:    pand %xmm1, %xmm2
1488; SSE2-NEXT:    pandn %xmm3, %xmm1
1489; SSE2-NEXT:    por %xmm2, %xmm1
1490; SSE2-NEXT:    movdqa %xmm1, %xmm2
1491; SSE2-NEXT:    psrlw $2, %xmm2
1492; SSE2-NEXT:    pand %xmm4, %xmm2
1493; SSE2-NEXT:    pand %xmm4, %xmm1
1494; SSE2-NEXT:    psllw $2, %xmm1
1495; SSE2-NEXT:    por %xmm2, %xmm1
1496; SSE2-NEXT:    movdqa %xmm1, %xmm2
1497; SSE2-NEXT:    psrlw $1, %xmm2
1498; SSE2-NEXT:    pand %xmm5, %xmm2
1499; SSE2-NEXT:    pand %xmm5, %xmm1
1500; SSE2-NEXT:    paddb %xmm1, %xmm1
1501; SSE2-NEXT:    por %xmm2, %xmm1
1502; SSE2-NEXT:    retq
1503;
1504; SSSE3-LABEL: test_bitreverse_v8i32:
1505; SSSE3:       # %bb.0:
1506; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1507; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1508; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1509; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1510; SSSE3-NEXT:    pand %xmm5, %xmm2
1511; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1512; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1513; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1514; SSSE3-NEXT:    psrlw $4, %xmm0
1515; SSSE3-NEXT:    pand %xmm5, %xmm0
1516; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1517; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1518; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1519; SSSE3-NEXT:    por %xmm7, %xmm3
1520; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1521; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1522; SSSE3-NEXT:    pand %xmm5, %xmm0
1523; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1524; SSSE3-NEXT:    psrlw $4, %xmm1
1525; SSSE3-NEXT:    pand %xmm5, %xmm1
1526; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1527; SSSE3-NEXT:    por %xmm6, %xmm2
1528; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1529; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1530; SSSE3-NEXT:    retq
1531;
1532; AVX1-LABEL: test_bitreverse_v8i32:
1533; AVX1:       # %bb.0:
1534; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1535; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1536; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1537; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1538; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1539; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1540; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1541; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1542; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1543; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1544; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1545; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1546; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1547; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1548; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1549; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1550; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1551; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1552; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1553; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1554; AVX1-NEXT:    retq
1555;
1556; AVX2-LABEL: test_bitreverse_v8i32:
1557; AVX2:       # %bb.0:
1558; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1559; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1560; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1561; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1562; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1563; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1564; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1565; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1566; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1567; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1568; AVX2-NEXT:    retq
1569;
1570; AVX512-LABEL: test_bitreverse_v8i32:
1571; AVX512:       # %bb.0:
1572; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1573; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1574; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1575; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1576; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1577; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1578; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1579; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1580; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1581; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1582; AVX512-NEXT:    retq
1583;
1584; XOPAVX1-LABEL: test_bitreverse_v8i32:
1585; XOPAVX1:       # %bb.0:
1586; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1587; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1588; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1589; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1590; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1591; XOPAVX1-NEXT:    retq
1592;
1593; XOPAVX2-LABEL: test_bitreverse_v8i32:
1594; XOPAVX2:       # %bb.0:
1595; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1596; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1597; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1598; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1599; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1600; XOPAVX2-NEXT:    retq
1601;
1602; GFNISSE-LABEL: test_bitreverse_v8i32:
1603; GFNISSE:       # %bb.0:
1604; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1605; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
1606; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1607; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
1608; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
1609; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
1610; GFNISSE-NEXT:    retq
1611;
1612; GFNIAVX-LABEL: test_bitreverse_v8i32:
1613; GFNIAVX:       # %bb.0:
1614; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
1615; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1616; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1617; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1618; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
1619; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1620; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
1621; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1622; GFNIAVX-NEXT:    retq
1623;
1624; GFNIAVX2-LABEL: test_bitreverse_v8i32:
1625; GFNIAVX2:       # %bb.0:
1626; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1627; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1628; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1629; GFNIAVX2-NEXT:    retq
1630;
1631; GFNIAVX512F-LABEL: test_bitreverse_v8i32:
1632; GFNIAVX512F:       # %bb.0:
1633; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1634; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1635; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1636; GFNIAVX512F-NEXT:    retq
1637;
1638; GFNIAVX512BW-LABEL: test_bitreverse_v8i32:
1639; GFNIAVX512BW:       # %bb.0:
1640; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1641; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1642; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1643; GFNIAVX512BW-NEXT:    retq
; Intrinsic under test; its result is returned directly.
1644  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
1645  ret <8 x i32> %b
1646}
1647
; Bit-reverses every element of a 256-bit <4 x i64> vector with
; @llvm.bitreverse.v4i64 and pins the llc output of each configuration listed
; in the header of this file.
;
; What the recorded assembly shows, per feature level.
;   * SSE2 alone. The bytes of each i64 are put in reverse order with
;     punpckhbw/punpcklbw, pshufd, pshuflw, pshufhw and packuswb; the bits
;     inside each byte are then swapped in three shift-and-mask rounds
;     (mask 15 for nibbles, 51 for bit pairs, 85 for single bits).
;   * SSSE3, AVX1, AVX2 and the AVX512 runs. A pshufb reverses the bytes of
;     each i64, then two 16-entry pshufb lookups (one per nibble) are ORed.
;   * XOP runs. One vpperm per 128-bit half with a constant selector.
;   * GFNI runs. A byte-reversing pshufb followed by gf2p8affineqb with the
;     matrix constant 9241421688590303745 (0x8040201008040201).
; Several configurations process the two 128-bit halves separately (xmm
; registers in the checks) instead of using a single ymm operation.
;
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1648define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
1649; SSE2-LABEL: test_bitreverse_v4i64:
1650; SSE2:       # %bb.0:
1651; SSE2-NEXT:    movdqa %xmm1, %xmm2
1652; SSE2-NEXT:    pxor %xmm3, %xmm3
1653; SSE2-NEXT:    movdqa %xmm0, %xmm1
1654; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
1655; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1656; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1657; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1658; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1659; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1660; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1661; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1662; SSE2-NEXT:    packuswb %xmm1, %xmm0
1663; SSE2-NEXT:    movdqa %xmm0, %xmm4
1664; SSE2-NEXT:    psllw $4, %xmm4
1665; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1666; SSE2-NEXT:    movdqa %xmm1, %xmm5
1667; SSE2-NEXT:    pandn %xmm4, %xmm5
1668; SSE2-NEXT:    psrlw $4, %xmm0
1669; SSE2-NEXT:    pand %xmm1, %xmm0
1670; SSE2-NEXT:    por %xmm5, %xmm0
1671; SSE2-NEXT:    movdqa %xmm0, %xmm5
1672; SSE2-NEXT:    psrlw $2, %xmm5
1673; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1674; SSE2-NEXT:    pand %xmm4, %xmm5
1675; SSE2-NEXT:    pand %xmm4, %xmm0
1676; SSE2-NEXT:    psllw $2, %xmm0
1677; SSE2-NEXT:    por %xmm5, %xmm0
1678; SSE2-NEXT:    movdqa %xmm0, %xmm6
1679; SSE2-NEXT:    psrlw $1, %xmm6
1680; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1681; SSE2-NEXT:    pand %xmm5, %xmm6
1682; SSE2-NEXT:    pand %xmm5, %xmm0
1683; SSE2-NEXT:    paddb %xmm0, %xmm0
1684; SSE2-NEXT:    por %xmm6, %xmm0
1685; SSE2-NEXT:    movdqa %xmm2, %xmm6
1686; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
1687; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
1688; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1689; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1690; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
1691; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
1692; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1693; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1694; SSE2-NEXT:    packuswb %xmm6, %xmm2
1695; SSE2-NEXT:    movdqa %xmm2, %xmm3
1696; SSE2-NEXT:    psllw $4, %xmm3
1697; SSE2-NEXT:    psrlw $4, %xmm2
1698; SSE2-NEXT:    pand %xmm1, %xmm2
1699; SSE2-NEXT:    pandn %xmm3, %xmm1
1700; SSE2-NEXT:    por %xmm2, %xmm1
1701; SSE2-NEXT:    movdqa %xmm1, %xmm2
1702; SSE2-NEXT:    psrlw $2, %xmm2
1703; SSE2-NEXT:    pand %xmm4, %xmm2
1704; SSE2-NEXT:    pand %xmm4, %xmm1
1705; SSE2-NEXT:    psllw $2, %xmm1
1706; SSE2-NEXT:    por %xmm2, %xmm1
1707; SSE2-NEXT:    movdqa %xmm1, %xmm2
1708; SSE2-NEXT:    psrlw $1, %xmm2
1709; SSE2-NEXT:    pand %xmm5, %xmm2
1710; SSE2-NEXT:    pand %xmm5, %xmm1
1711; SSE2-NEXT:    paddb %xmm1, %xmm1
1712; SSE2-NEXT:    por %xmm2, %xmm1
1713; SSE2-NEXT:    retq
1714;
1715; SSSE3-LABEL: test_bitreverse_v4i64:
1716; SSSE3:       # %bb.0:
1717; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1718; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1719; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1720; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1721; SSSE3-NEXT:    pand %xmm5, %xmm2
1722; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1723; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1724; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1725; SSSE3-NEXT:    psrlw $4, %xmm0
1726; SSSE3-NEXT:    pand %xmm5, %xmm0
1727; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1728; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1729; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1730; SSSE3-NEXT:    por %xmm7, %xmm3
1731; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1732; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1733; SSSE3-NEXT:    pand %xmm5, %xmm0
1734; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1735; SSSE3-NEXT:    psrlw $4, %xmm1
1736; SSSE3-NEXT:    pand %xmm5, %xmm1
1737; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1738; SSSE3-NEXT:    por %xmm6, %xmm2
1739; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1740; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1741; SSSE3-NEXT:    retq
1742;
1743; AVX1-LABEL: test_bitreverse_v4i64:
1744; AVX1:       # %bb.0:
1745; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1746; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1747; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1748; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1749; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1750; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1751; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1752; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1753; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1754; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1755; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1756; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1757; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1758; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1759; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1760; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1761; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1762; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1763; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1764; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1765; AVX1-NEXT:    retq
1766;
1767; AVX2-LABEL: test_bitreverse_v4i64:
1768; AVX2:       # %bb.0:
1769; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1770; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1771; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1772; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1773; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1774; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1775; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1776; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1777; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1778; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1779; AVX2-NEXT:    retq
1780;
1781; AVX512-LABEL: test_bitreverse_v4i64:
1782; AVX512:       # %bb.0:
1783; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1784; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1785; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1786; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1787; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1788; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1789; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1790; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1791; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1792; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1793; AVX512-NEXT:    retq
1794;
1795; XOPAVX1-LABEL: test_bitreverse_v4i64:
1796; XOPAVX1:       # %bb.0:
1797; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1798; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1799; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1800; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1801; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1802; XOPAVX1-NEXT:    retq
1803;
1804; XOPAVX2-LABEL: test_bitreverse_v4i64:
1805; XOPAVX2:       # %bb.0:
1806; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1807; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1808; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1809; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1810; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1811; XOPAVX2-NEXT:    retq
1812;
1813; GFNISSE-LABEL: test_bitreverse_v4i64:
1814; GFNISSE:       # %bb.0:
1815; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1816; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
1817; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1818; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
1819; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
1820; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
1821; GFNISSE-NEXT:    retq
1822;
1823; GFNIAVX-LABEL: test_bitreverse_v4i64:
1824; GFNIAVX:       # %bb.0:
1825; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
1826; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1827; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1828; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1829; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
1830; GFNIAVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1831; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
1832; GFNIAVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1833; GFNIAVX-NEXT:    retq
1834;
1835; GFNIAVX2-LABEL: test_bitreverse_v4i64:
1836; GFNIAVX2:       # %bb.0:
1837; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1838; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1839; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1840; GFNIAVX2-NEXT:    retq
1841;
1842; GFNIAVX512F-LABEL: test_bitreverse_v4i64:
1843; GFNIAVX512F:       # %bb.0:
1844; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1845; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1846; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1847; GFNIAVX512F-NEXT:    retq
1848;
1849; GFNIAVX512BW-LABEL: test_bitreverse_v4i64:
1850; GFNIAVX512BW:       # %bb.0:
1851; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1852; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1853; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1854; GFNIAVX512BW-NEXT:    retq
; Intrinsic under test; its result is returned directly.
1855  %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
1856  ret <4 x i64> %b
1857}
1858
1859define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
1860; SSE2-LABEL: test_bitreverse_v64i8:
1861; SSE2:       # %bb.0:
1862; SSE2-NEXT:    movdqa %xmm3, %xmm4
1863; SSE2-NEXT:    movdqa %xmm0, %xmm5
1864; SSE2-NEXT:    psllw $4, %xmm5
1865; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1866; SSE2-NEXT:    movdqa %xmm3, %xmm6
1867; SSE2-NEXT:    pandn %xmm5, %xmm6
1868; SSE2-NEXT:    psrlw $4, %xmm0
1869; SSE2-NEXT:    pand %xmm3, %xmm0
1870; SSE2-NEXT:    por %xmm6, %xmm0
1871; SSE2-NEXT:    movdqa %xmm0, %xmm6
1872; SSE2-NEXT:    psrlw $2, %xmm6
1873; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1874; SSE2-NEXT:    pand %xmm8, %xmm6
1875; SSE2-NEXT:    pand %xmm8, %xmm0
1876; SSE2-NEXT:    psllw $2, %xmm0
1877; SSE2-NEXT:    por %xmm6, %xmm0
1878; SSE2-NEXT:    movdqa %xmm0, %xmm7
1879; SSE2-NEXT:    psrlw $1, %xmm7
1880; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1881; SSE2-NEXT:    pand %xmm6, %xmm7
1882; SSE2-NEXT:    pand %xmm6, %xmm0
1883; SSE2-NEXT:    paddb %xmm0, %xmm0
1884; SSE2-NEXT:    por %xmm7, %xmm0
1885; SSE2-NEXT:    movdqa %xmm1, %xmm7
1886; SSE2-NEXT:    psllw $4, %xmm7
1887; SSE2-NEXT:    movdqa %xmm3, %xmm5
1888; SSE2-NEXT:    pandn %xmm7, %xmm5
1889; SSE2-NEXT:    psrlw $4, %xmm1
1890; SSE2-NEXT:    pand %xmm3, %xmm1
1891; SSE2-NEXT:    por %xmm5, %xmm1
1892; SSE2-NEXT:    movdqa %xmm1, %xmm5
1893; SSE2-NEXT:    psrlw $2, %xmm5
1894; SSE2-NEXT:    pand %xmm8, %xmm5
1895; SSE2-NEXT:    pand %xmm8, %xmm1
1896; SSE2-NEXT:    psllw $2, %xmm1
1897; SSE2-NEXT:    por %xmm5, %xmm1
1898; SSE2-NEXT:    movdqa %xmm1, %xmm5
1899; SSE2-NEXT:    psrlw $1, %xmm5
1900; SSE2-NEXT:    pand %xmm6, %xmm5
1901; SSE2-NEXT:    pand %xmm6, %xmm1
1902; SSE2-NEXT:    paddb %xmm1, %xmm1
1903; SSE2-NEXT:    por %xmm5, %xmm1
1904; SSE2-NEXT:    movdqa %xmm2, %xmm5
1905; SSE2-NEXT:    psllw $4, %xmm5
1906; SSE2-NEXT:    movdqa %xmm3, %xmm7
1907; SSE2-NEXT:    pandn %xmm5, %xmm7
1908; SSE2-NEXT:    psrlw $4, %xmm2
1909; SSE2-NEXT:    pand %xmm3, %xmm2
1910; SSE2-NEXT:    por %xmm7, %xmm2
1911; SSE2-NEXT:    movdqa %xmm2, %xmm5
1912; SSE2-NEXT:    psrlw $2, %xmm5
1913; SSE2-NEXT:    pand %xmm8, %xmm5
1914; SSE2-NEXT:    pand %xmm8, %xmm2
1915; SSE2-NEXT:    psllw $2, %xmm2
1916; SSE2-NEXT:    por %xmm5, %xmm2
1917; SSE2-NEXT:    movdqa %xmm2, %xmm5
1918; SSE2-NEXT:    psrlw $1, %xmm5
1919; SSE2-NEXT:    pand %xmm6, %xmm5
1920; SSE2-NEXT:    pand %xmm6, %xmm2
1921; SSE2-NEXT:    paddb %xmm2, %xmm2
1922; SSE2-NEXT:    por %xmm5, %xmm2
1923; SSE2-NEXT:    movdqa %xmm4, %xmm5
1924; SSE2-NEXT:    psllw $4, %xmm5
1925; SSE2-NEXT:    psrlw $4, %xmm4
1926; SSE2-NEXT:    pand %xmm3, %xmm4
1927; SSE2-NEXT:    pandn %xmm5, %xmm3
1928; SSE2-NEXT:    por %xmm4, %xmm3
1929; SSE2-NEXT:    movdqa %xmm3, %xmm4
1930; SSE2-NEXT:    psrlw $2, %xmm4
1931; SSE2-NEXT:    pand %xmm8, %xmm4
1932; SSE2-NEXT:    pand %xmm8, %xmm3
1933; SSE2-NEXT:    psllw $2, %xmm3
1934; SSE2-NEXT:    por %xmm4, %xmm3
1935; SSE2-NEXT:    movdqa %xmm3, %xmm4
1936; SSE2-NEXT:    psrlw $1, %xmm4
1937; SSE2-NEXT:    pand %xmm6, %xmm4
1938; SSE2-NEXT:    pand %xmm6, %xmm3
1939; SSE2-NEXT:    paddb %xmm3, %xmm3
1940; SSE2-NEXT:    por %xmm4, %xmm3
1941; SSE2-NEXT:    retq
1942;
1943; SSSE3-LABEL: test_bitreverse_v64i8:
1944; SSSE3:       # %bb.0:
1945; SSSE3-NEXT:    movdqa %xmm0, %xmm5
1946; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1947; SSSE3-NEXT:    pand %xmm8, %xmm0
1948; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1949; SSSE3-NEXT:    movdqa %xmm9, %xmm6
1950; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1951; SSSE3-NEXT:    psrlw $4, %xmm5
1952; SSSE3-NEXT:    pand %xmm8, %xmm5
1953; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1954; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1955; SSSE3-NEXT:    pshufb %xmm5, %xmm0
1956; SSSE3-NEXT:    por %xmm6, %xmm0
1957; SSSE3-NEXT:    movdqa %xmm1, %xmm5
1958; SSSE3-NEXT:    pand %xmm8, %xmm5
1959; SSSE3-NEXT:    movdqa %xmm9, %xmm6
1960; SSSE3-NEXT:    pshufb %xmm5, %xmm6
1961; SSSE3-NEXT:    psrlw $4, %xmm1
1962; SSSE3-NEXT:    pand %xmm8, %xmm1
1963; SSSE3-NEXT:    movdqa %xmm4, %xmm5
1964; SSSE3-NEXT:    pshufb %xmm1, %xmm5
1965; SSSE3-NEXT:    por %xmm6, %xmm5
1966; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1967; SSSE3-NEXT:    pand %xmm8, %xmm1
1968; SSSE3-NEXT:    movdqa %xmm9, %xmm7
1969; SSSE3-NEXT:    pshufb %xmm1, %xmm7
1970; SSSE3-NEXT:    psrlw $4, %xmm2
1971; SSSE3-NEXT:    pand %xmm8, %xmm2
1972; SSSE3-NEXT:    movdqa %xmm4, %xmm6
1973; SSSE3-NEXT:    pshufb %xmm2, %xmm6
1974; SSSE3-NEXT:    por %xmm7, %xmm6
1975; SSSE3-NEXT:    movdqa %xmm3, %xmm1
1976; SSSE3-NEXT:    pand %xmm8, %xmm1
1977; SSSE3-NEXT:    pshufb %xmm1, %xmm9
1978; SSSE3-NEXT:    psrlw $4, %xmm3
1979; SSSE3-NEXT:    pand %xmm8, %xmm3
1980; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1981; SSSE3-NEXT:    por %xmm9, %xmm4
1982; SSSE3-NEXT:    movdqa %xmm5, %xmm1
1983; SSSE3-NEXT:    movdqa %xmm6, %xmm2
1984; SSSE3-NEXT:    movdqa %xmm4, %xmm3
1985; SSSE3-NEXT:    retq
1986;
1987; AVX1-LABEL: test_bitreverse_v64i8:
1988; AVX1:       # %bb.0:
1989; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1990; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1991; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
1992; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1993; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1994; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1995; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1996; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1997; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
1998; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
1999; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm4
2000; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
2001; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
2002; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
2003; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
2004; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
2005; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2006; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2007; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
2008; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
2009; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2010; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
2011; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
2012; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
2013; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
2014; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
2015; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
2016; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
2017; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
2018; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
2019; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2020; AVX1-NEXT:    retq
2021;
2022; AVX2-LABEL: test_bitreverse_v64i8:
2023; AVX2:       # %bb.0:
2024; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2025; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
2026; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2027; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
2028; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2029; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2030; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2031; AVX2-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
2032; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
2033; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
2034; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
2035; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
2036; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
2037; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
2038; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
2039; AVX2-NEXT:    retq
2040;
2041; AVX512F-LABEL: test_bitreverse_v64i8:
2042; AVX512F:       # %bb.0:
2043; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2044; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2045; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
2046; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2047; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
2048; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm5
2049; AVX512F-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
2050; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
2051; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
2052; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
2053; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2054; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
2055; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
2056; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
2057; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
2058; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2059; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm0
2060; AVX512F-NEXT:    retq
2061;
2062; AVX512BW-LABEL: test_bitreverse_v64i8:
2063; AVX512BW:       # %bb.0:
2064; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2065; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2066; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2067; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2068; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2069; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2070; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2071; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2072; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2073; AVX512BW-NEXT:    retq
2074;
2075; XOPAVX1-LABEL: test_bitreverse_v64i8:
2076; XOPAVX1:       # %bb.0:
2077; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2078; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
2079; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2080; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2081; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2082; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2083; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2084; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2085; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2086; XOPAVX1-NEXT:    retq
2087;
2088; XOPAVX2-LABEL: test_bitreverse_v64i8:
2089; XOPAVX2:       # %bb.0:
2090; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2091; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
2092; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2093; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2094; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2095; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2096; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2097; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2098; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2099; XOPAVX2-NEXT:    retq
2100;
2101; GFNISSE-LABEL: test_bitreverse_v64i8:
2102; GFNISSE:       # %bb.0:
2103; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
2104; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
2105; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
2106; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
2107; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm3
2108; GFNISSE-NEXT:    retq
2109;
2110; GFNIAVX-LABEL: test_bitreverse_v64i8:
2111; GFNIAVX:       # %bb.0:
2112; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
2113; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
2114; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2
2115; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
2116; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2117; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
2118; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2
2119; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
2120; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2121; GFNIAVX-NEXT:    retq
2122;
2123; GFNIAVX2-LABEL: test_bitreverse_v64i8:
2124; GFNIAVX2:       # %bb.0:
2125; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2126; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
2127; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
2128; GFNIAVX2-NEXT:    retq
2129;
2130; GFNIAVX512F-LABEL: test_bitreverse_v64i8:
2131; GFNIAVX512F:       # %bb.0:
2132; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2133; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2134; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
2135; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
2136; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2137; GFNIAVX512F-NEXT:    retq
2138;
2139; GFNIAVX512BW-LABEL: test_bitreverse_v64i8:
2140; GFNIAVX512BW:       # %bb.0:
2141; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2142; GFNIAVX512BW-NEXT:    retq
2143  %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
2144  ret <64 x i8> %b
2145}
2146
; test_bitreverse_v32i16: codegen test for bit reversal of a <32 x i16> vector.
; The function body is a single call to @llvm.bitreverse.v32i16 on %a whose
; result is returned unchanged.
;
; The CHECK lines inside the function are autogenerated (see the NOTE at the
; top of this file, utils/update_llc_test_checks.py); regenerate them with that
; script instead of editing them by hand. There is one group per leaf
; check-prefix used by the RUN lines: SSE2, SSSE3, AVX1, AVX2, AVX512F,
; AVX512BW, XOPAVX1, XOPAVX2, GFNISSE, GFNIAVX, GFNIAVX2, GFNIAVX512F and
; GFNIAVX512BW.
;
; Shape of the expected code, as recorded in the CHECK lines below:
;   - SSE2: psrlw/psllw $8 + por on each register, followed by mask-and-shift
;     steps by 4, 2 and 1 using the 15, 51 and 85 byte masks.
;   - SSSE3/AVX1/AVX2/AVX512F/AVX512BW: a pshufb with the
;     [1,0,3,2,...] pattern, then two pshufb table lookups indexed by the
;     "& 15" and ">> 4, & 15" values, combined with por.
;   - XOPAVX1/XOPAVX2: vpperm per 128-bit half with a single constant selector.
;   - GFNI*: the same [1,0,3,2,...] pshufb followed by gf2p8affineqb $0 with a
;     splatted 64-bit constant.
2147define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
2148; SSE2-LABEL: test_bitreverse_v32i16:
2149; SSE2:       # %bb.0:
2150; SSE2-NEXT:    movdqa %xmm3, %xmm4
2151; SSE2-NEXT:    movdqa %xmm0, %xmm3
2152; SSE2-NEXT:    psrlw $8, %xmm3
2153; SSE2-NEXT:    psllw $8, %xmm0
2154; SSE2-NEXT:    por %xmm3, %xmm0
2155; SSE2-NEXT:    movdqa %xmm0, %xmm5
2156; SSE2-NEXT:    psllw $4, %xmm5
2157; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2158; SSE2-NEXT:    movdqa %xmm3, %xmm6
2159; SSE2-NEXT:    pandn %xmm5, %xmm6
2160; SSE2-NEXT:    psrlw $4, %xmm0
2161; SSE2-NEXT:    pand %xmm3, %xmm0
2162; SSE2-NEXT:    por %xmm6, %xmm0
2163; SSE2-NEXT:    movdqa %xmm0, %xmm6
2164; SSE2-NEXT:    psrlw $2, %xmm6
2165; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2166; SSE2-NEXT:    pand %xmm8, %xmm6
2167; SSE2-NEXT:    pand %xmm8, %xmm0
2168; SSE2-NEXT:    psllw $2, %xmm0
2169; SSE2-NEXT:    por %xmm6, %xmm0
2170; SSE2-NEXT:    movdqa %xmm0, %xmm7
2171; SSE2-NEXT:    psrlw $1, %xmm7
2172; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2173; SSE2-NEXT:    pand %xmm6, %xmm7
2174; SSE2-NEXT:    pand %xmm6, %xmm0
2175; SSE2-NEXT:    paddb %xmm0, %xmm0
2176; SSE2-NEXT:    por %xmm7, %xmm0
2177; SSE2-NEXT:    movdqa %xmm1, %xmm7
2178; SSE2-NEXT:    psrlw $8, %xmm7
2179; SSE2-NEXT:    psllw $8, %xmm1
2180; SSE2-NEXT:    por %xmm7, %xmm1
2181; SSE2-NEXT:    movdqa %xmm1, %xmm7
2182; SSE2-NEXT:    psllw $4, %xmm7
2183; SSE2-NEXT:    movdqa %xmm3, %xmm5
2184; SSE2-NEXT:    pandn %xmm7, %xmm5
2185; SSE2-NEXT:    psrlw $4, %xmm1
2186; SSE2-NEXT:    pand %xmm3, %xmm1
2187; SSE2-NEXT:    por %xmm5, %xmm1
2188; SSE2-NEXT:    movdqa %xmm1, %xmm5
2189; SSE2-NEXT:    psrlw $2, %xmm5
2190; SSE2-NEXT:    pand %xmm8, %xmm5
2191; SSE2-NEXT:    pand %xmm8, %xmm1
2192; SSE2-NEXT:    psllw $2, %xmm1
2193; SSE2-NEXT:    por %xmm5, %xmm1
2194; SSE2-NEXT:    movdqa %xmm1, %xmm5
2195; SSE2-NEXT:    psrlw $1, %xmm5
2196; SSE2-NEXT:    pand %xmm6, %xmm5
2197; SSE2-NEXT:    pand %xmm6, %xmm1
2198; SSE2-NEXT:    paddb %xmm1, %xmm1
2199; SSE2-NEXT:    por %xmm5, %xmm1
2200; SSE2-NEXT:    movdqa %xmm2, %xmm5
2201; SSE2-NEXT:    psrlw $8, %xmm5
2202; SSE2-NEXT:    psllw $8, %xmm2
2203; SSE2-NEXT:    por %xmm5, %xmm2
2204; SSE2-NEXT:    movdqa %xmm2, %xmm5
2205; SSE2-NEXT:    psllw $4, %xmm5
2206; SSE2-NEXT:    movdqa %xmm3, %xmm7
2207; SSE2-NEXT:    pandn %xmm5, %xmm7
2208; SSE2-NEXT:    psrlw $4, %xmm2
2209; SSE2-NEXT:    pand %xmm3, %xmm2
2210; SSE2-NEXT:    por %xmm7, %xmm2
2211; SSE2-NEXT:    movdqa %xmm2, %xmm5
2212; SSE2-NEXT:    psrlw $2, %xmm5
2213; SSE2-NEXT:    pand %xmm8, %xmm5
2214; SSE2-NEXT:    pand %xmm8, %xmm2
2215; SSE2-NEXT:    psllw $2, %xmm2
2216; SSE2-NEXT:    por %xmm5, %xmm2
2217; SSE2-NEXT:    movdqa %xmm2, %xmm5
2218; SSE2-NEXT:    psrlw $1, %xmm5
2219; SSE2-NEXT:    pand %xmm6, %xmm5
2220; SSE2-NEXT:    pand %xmm6, %xmm2
2221; SSE2-NEXT:    paddb %xmm2, %xmm2
2222; SSE2-NEXT:    por %xmm5, %xmm2
2223; SSE2-NEXT:    movdqa %xmm4, %xmm5
2224; SSE2-NEXT:    psrlw $8, %xmm5
2225; SSE2-NEXT:    psllw $8, %xmm4
2226; SSE2-NEXT:    por %xmm5, %xmm4
2227; SSE2-NEXT:    movdqa %xmm4, %xmm5
2228; SSE2-NEXT:    psllw $4, %xmm5
2229; SSE2-NEXT:    psrlw $4, %xmm4
2230; SSE2-NEXT:    pand %xmm3, %xmm4
2231; SSE2-NEXT:    pandn %xmm5, %xmm3
2232; SSE2-NEXT:    por %xmm4, %xmm3
2233; SSE2-NEXT:    movdqa %xmm3, %xmm4
2234; SSE2-NEXT:    psrlw $2, %xmm4
2235; SSE2-NEXT:    pand %xmm8, %xmm4
2236; SSE2-NEXT:    pand %xmm8, %xmm3
2237; SSE2-NEXT:    psllw $2, %xmm3
2238; SSE2-NEXT:    por %xmm4, %xmm3
2239; SSE2-NEXT:    movdqa %xmm3, %xmm4
2240; SSE2-NEXT:    psrlw $1, %xmm4
2241; SSE2-NEXT:    pand %xmm6, %xmm4
2242; SSE2-NEXT:    pand %xmm6, %xmm3
2243; SSE2-NEXT:    paddb %xmm3, %xmm3
2244; SSE2-NEXT:    por %xmm4, %xmm3
2245; SSE2-NEXT:    retq
2246;
2247; SSSE3-LABEL: test_bitreverse_v32i16:
2248; SSSE3:       # %bb.0:
2249; SSSE3-NEXT:    movdqa %xmm1, %xmm5
2250; SSSE3-NEXT:    movdqa %xmm0, %xmm1
2251; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2252; SSSE3-NEXT:    pshufb %xmm8, %xmm1
2253; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2254; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2255; SSSE3-NEXT:    pand %xmm9, %xmm0
2256; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2257; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2258; SSSE3-NEXT:    pshufb %xmm0, %xmm6
2259; SSSE3-NEXT:    psrlw $4, %xmm1
2260; SSSE3-NEXT:    pand %xmm9, %xmm1
2261; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2262; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2263; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2264; SSSE3-NEXT:    por %xmm6, %xmm0
2265; SSSE3-NEXT:    pshufb %xmm8, %xmm5
2266; SSSE3-NEXT:    movdqa %xmm5, %xmm1
2267; SSSE3-NEXT:    pand %xmm9, %xmm1
2268; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2269; SSSE3-NEXT:    pshufb %xmm1, %xmm6
2270; SSSE3-NEXT:    psrlw $4, %xmm5
2271; SSSE3-NEXT:    pand %xmm9, %xmm5
2272; SSSE3-NEXT:    movdqa %xmm4, %xmm1
2273; SSSE3-NEXT:    pshufb %xmm5, %xmm1
2274; SSSE3-NEXT:    por %xmm6, %xmm1
2275; SSSE3-NEXT:    pshufb %xmm8, %xmm2
2276; SSSE3-NEXT:    movdqa %xmm2, %xmm5
2277; SSSE3-NEXT:    pand %xmm9, %xmm5
2278; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2279; SSSE3-NEXT:    pshufb %xmm5, %xmm6
2280; SSSE3-NEXT:    psrlw $4, %xmm2
2281; SSSE3-NEXT:    pand %xmm9, %xmm2
2282; SSSE3-NEXT:    movdqa %xmm4, %xmm5
2283; SSSE3-NEXT:    pshufb %xmm2, %xmm5
2284; SSSE3-NEXT:    por %xmm6, %xmm5
2285; SSSE3-NEXT:    pshufb %xmm8, %xmm3
2286; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2287; SSSE3-NEXT:    pand %xmm9, %xmm2
2288; SSSE3-NEXT:    pshufb %xmm2, %xmm7
2289; SSSE3-NEXT:    psrlw $4, %xmm3
2290; SSSE3-NEXT:    pand %xmm9, %xmm3
2291; SSSE3-NEXT:    pshufb %xmm3, %xmm4
2292; SSSE3-NEXT:    por %xmm7, %xmm4
2293; SSSE3-NEXT:    movdqa %xmm5, %xmm2
2294; SSSE3-NEXT:    movdqa %xmm4, %xmm3
2295; SSSE3-NEXT:    retq
2296;
2297; AVX1-LABEL: test_bitreverse_v32i16:
2298; AVX1:       # %bb.0:
2299; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2300; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2301; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2302; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2303; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2304; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2305; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2306; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2307; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2308; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2309; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2310; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2311; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2312; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
2313; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2314; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
2315; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2316; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
2317; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
2318; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2319; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2320; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2321; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2322; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2323; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2324; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2325; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2326; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2327; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2328; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
2329; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
2330; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
2331; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2332; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
2333; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
2334; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2335; AVX1-NEXT:    retq
2336;
2337; AVX2-LABEL: test_bitreverse_v32i16:
2338; AVX2:       # %bb.0:
2339; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2340; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2341; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2342; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
2343; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2344; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2345; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2346; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
2347; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2348; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
2349; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
2350; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2351; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
2352; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2353; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
2354; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2355; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
2356; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
2357; AVX2-NEXT:    retq
2358;
2359; AVX512F-LABEL: test_bitreverse_v32i16:
2360; AVX512F:       # %bb.0:
2361; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2362; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2363; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2364; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2365; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
2366; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2367; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2368; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2369; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
2370; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2371; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
2372; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
2373; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
2374; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2375; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
2376; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
2377; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
2378; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
2379; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2380; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
2381; AVX512F-NEXT:    retq
2382;
2383; AVX512BW-LABEL: test_bitreverse_v32i16:
2384; AVX512BW:       # %bb.0:
2385; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
2386; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2387; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2388; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2389; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2390; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2391; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2392; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2393; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2394; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2395; AVX512BW-NEXT:    retq
2396;
2397; XOPAVX1-LABEL: test_bitreverse_v32i16:
2398; XOPAVX1:       # %bb.0:
2399; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2400; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
2401; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2402; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2403; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2404; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2405; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2406; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2407; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2408; XOPAVX1-NEXT:    retq
2409;
2410; XOPAVX2-LABEL: test_bitreverse_v32i16:
2411; XOPAVX2:       # %bb.0:
2412; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2413; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
2414; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2415; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2416; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2417; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2418; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2419; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2420; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2421; XOPAVX2-NEXT:    retq
2422;
2423; GFNISSE-LABEL: test_bitreverse_v32i16:
2424; GFNISSE:       # %bb.0:
2425; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2426; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
2427; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
2428; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
2429; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
2430; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
2431; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
2432; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
2433; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
2434; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
2435; GFNISSE-NEXT:    retq
2436;
2437; GFNIAVX-LABEL: test_bitreverse_v32i16:
2438; GFNIAVX:       # %bb.0:
2439; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
2440; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2441; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2442; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
2443; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2444; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2445; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
2446; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2447; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
2448; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2449; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2450; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2451; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
2452; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2453; GFNIAVX-NEXT:    retq
2454;
2455; GFNIAVX2-LABEL: test_bitreverse_v32i16:
2456; GFNIAVX2:       # %bb.0:
2457; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2458; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2459; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2460; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2461; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2462; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2463; GFNIAVX2-NEXT:    retq
2464;
2465; GFNIAVX512F-LABEL: test_bitreverse_v32i16:
2466; GFNIAVX512F:       # %bb.0:
2467; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2468; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2469; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2470; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2471; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2472; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2473; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2474; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2475; GFNIAVX512F-NEXT:    retq
2476;
2477; GFNIAVX512BW-LABEL: test_bitreverse_v32i16:
2478; GFNIAVX512BW:       # %bb.0:
2479; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
2480; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2481; GFNIAVX512BW-NEXT:    retq
2482  %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
2483  ret <32 x i16> %b
2484}
2485
2486define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
2487; SSE2-LABEL: test_bitreverse_v16i32:
2488; SSE2:       # %bb.0:
2489; SSE2-NEXT:    movdqa %xmm3, %xmm4
2490; SSE2-NEXT:    pxor %xmm8, %xmm8
2491; SSE2-NEXT:    movdqa %xmm0, %xmm3
2492; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
2493; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
2494; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
2495; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
2496; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2497; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2498; SSE2-NEXT:    packuswb %xmm3, %xmm0
2499; SSE2-NEXT:    movdqa %xmm0, %xmm6
2500; SSE2-NEXT:    psllw $4, %xmm6
2501; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2502; SSE2-NEXT:    movdqa %xmm3, %xmm7
2503; SSE2-NEXT:    pandn %xmm6, %xmm7
2504; SSE2-NEXT:    psrlw $4, %xmm0
2505; SSE2-NEXT:    pand %xmm3, %xmm0
2506; SSE2-NEXT:    por %xmm7, %xmm0
2507; SSE2-NEXT:    movdqa %xmm0, %xmm7
2508; SSE2-NEXT:    psrlw $2, %xmm7
2509; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2510; SSE2-NEXT:    pand %xmm9, %xmm7
2511; SSE2-NEXT:    pand %xmm9, %xmm0
2512; SSE2-NEXT:    psllw $2, %xmm0
2513; SSE2-NEXT:    por %xmm7, %xmm0
2514; SSE2-NEXT:    movdqa %xmm0, %xmm5
2515; SSE2-NEXT:    psrlw $1, %xmm5
2516; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2517; SSE2-NEXT:    pand %xmm7, %xmm5
2518; SSE2-NEXT:    pand %xmm7, %xmm0
2519; SSE2-NEXT:    paddb %xmm0, %xmm0
2520; SSE2-NEXT:    por %xmm5, %xmm0
2521; SSE2-NEXT:    movdqa %xmm1, %xmm5
2522; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
2523; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
2524; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
2525; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
2526; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2527; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2528; SSE2-NEXT:    packuswb %xmm5, %xmm1
2529; SSE2-NEXT:    movdqa %xmm1, %xmm5
2530; SSE2-NEXT:    psllw $4, %xmm5
2531; SSE2-NEXT:    movdqa %xmm3, %xmm6
2532; SSE2-NEXT:    pandn %xmm5, %xmm6
2533; SSE2-NEXT:    psrlw $4, %xmm1
2534; SSE2-NEXT:    pand %xmm3, %xmm1
2535; SSE2-NEXT:    por %xmm6, %xmm1
2536; SSE2-NEXT:    movdqa %xmm1, %xmm5
2537; SSE2-NEXT:    psrlw $2, %xmm5
2538; SSE2-NEXT:    pand %xmm9, %xmm5
2539; SSE2-NEXT:    pand %xmm9, %xmm1
2540; SSE2-NEXT:    psllw $2, %xmm1
2541; SSE2-NEXT:    por %xmm5, %xmm1
2542; SSE2-NEXT:    movdqa %xmm1, %xmm5
2543; SSE2-NEXT:    psrlw $1, %xmm5
2544; SSE2-NEXT:    pand %xmm7, %xmm5
2545; SSE2-NEXT:    pand %xmm7, %xmm1
2546; SSE2-NEXT:    paddb %xmm1, %xmm1
2547; SSE2-NEXT:    por %xmm5, %xmm1
2548; SSE2-NEXT:    movdqa %xmm2, %xmm5
2549; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
2550; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
2551; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
2552; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
2553; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
2554; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
2555; SSE2-NEXT:    packuswb %xmm5, %xmm2
2556; SSE2-NEXT:    movdqa %xmm2, %xmm5
2557; SSE2-NEXT:    psllw $4, %xmm5
2558; SSE2-NEXT:    movdqa %xmm3, %xmm6
2559; SSE2-NEXT:    pandn %xmm5, %xmm6
2560; SSE2-NEXT:    psrlw $4, %xmm2
2561; SSE2-NEXT:    pand %xmm3, %xmm2
2562; SSE2-NEXT:    por %xmm6, %xmm2
2563; SSE2-NEXT:    movdqa %xmm2, %xmm5
2564; SSE2-NEXT:    psrlw $2, %xmm5
2565; SSE2-NEXT:    pand %xmm9, %xmm5
2566; SSE2-NEXT:    pand %xmm9, %xmm2
2567; SSE2-NEXT:    psllw $2, %xmm2
2568; SSE2-NEXT:    por %xmm5, %xmm2
2569; SSE2-NEXT:    movdqa %xmm2, %xmm5
2570; SSE2-NEXT:    psrlw $1, %xmm5
2571; SSE2-NEXT:    pand %xmm7, %xmm5
2572; SSE2-NEXT:    pand %xmm7, %xmm2
2573; SSE2-NEXT:    paddb %xmm2, %xmm2
2574; SSE2-NEXT:    por %xmm5, %xmm2
2575; SSE2-NEXT:    movdqa %xmm4, %xmm5
2576; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
2577; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
2578; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
2579; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
2580; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2581; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2582; SSE2-NEXT:    packuswb %xmm5, %xmm4
2583; SSE2-NEXT:    movdqa %xmm4, %xmm5
2584; SSE2-NEXT:    psllw $4, %xmm5
2585; SSE2-NEXT:    psrlw $4, %xmm4
2586; SSE2-NEXT:    pand %xmm3, %xmm4
2587; SSE2-NEXT:    pandn %xmm5, %xmm3
2588; SSE2-NEXT:    por %xmm4, %xmm3
2589; SSE2-NEXT:    movdqa %xmm3, %xmm4
2590; SSE2-NEXT:    psrlw $2, %xmm4
2591; SSE2-NEXT:    pand %xmm9, %xmm4
2592; SSE2-NEXT:    pand %xmm9, %xmm3
2593; SSE2-NEXT:    psllw $2, %xmm3
2594; SSE2-NEXT:    por %xmm4, %xmm3
2595; SSE2-NEXT:    movdqa %xmm3, %xmm4
2596; SSE2-NEXT:    psrlw $1, %xmm4
2597; SSE2-NEXT:    pand %xmm7, %xmm4
2598; SSE2-NEXT:    pand %xmm7, %xmm3
2599; SSE2-NEXT:    paddb %xmm3, %xmm3
2600; SSE2-NEXT:    por %xmm4, %xmm3
2601; SSE2-NEXT:    retq
2602;
2603; SSSE3-LABEL: test_bitreverse_v16i32:
2604; SSSE3:       # %bb.0:
2605; SSSE3-NEXT:    movdqa %xmm1, %xmm5
2606; SSSE3-NEXT:    movdqa %xmm0, %xmm1
2607; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2608; SSSE3-NEXT:    pshufb %xmm8, %xmm1
2609; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2610; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2611; SSSE3-NEXT:    pand %xmm9, %xmm0
2612; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2613; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2614; SSSE3-NEXT:    pshufb %xmm0, %xmm6
2615; SSSE3-NEXT:    psrlw $4, %xmm1
2616; SSSE3-NEXT:    pand %xmm9, %xmm1
2617; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2618; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2619; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2620; SSSE3-NEXT:    por %xmm6, %xmm0
2621; SSSE3-NEXT:    pshufb %xmm8, %xmm5
2622; SSSE3-NEXT:    movdqa %xmm5, %xmm1
2623; SSSE3-NEXT:    pand %xmm9, %xmm1
2624; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2625; SSSE3-NEXT:    pshufb %xmm1, %xmm6
2626; SSSE3-NEXT:    psrlw $4, %xmm5
2627; SSSE3-NEXT:    pand %xmm9, %xmm5
2628; SSSE3-NEXT:    movdqa %xmm4, %xmm1
2629; SSSE3-NEXT:    pshufb %xmm5, %xmm1
2630; SSSE3-NEXT:    por %xmm6, %xmm1
2631; SSSE3-NEXT:    pshufb %xmm8, %xmm2
2632; SSSE3-NEXT:    movdqa %xmm2, %xmm5
2633; SSSE3-NEXT:    pand %xmm9, %xmm5
2634; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2635; SSSE3-NEXT:    pshufb %xmm5, %xmm6
2636; SSSE3-NEXT:    psrlw $4, %xmm2
2637; SSSE3-NEXT:    pand %xmm9, %xmm2
2638; SSSE3-NEXT:    movdqa %xmm4, %xmm5
2639; SSSE3-NEXT:    pshufb %xmm2, %xmm5
2640; SSSE3-NEXT:    por %xmm6, %xmm5
2641; SSSE3-NEXT:    pshufb %xmm8, %xmm3
2642; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2643; SSSE3-NEXT:    pand %xmm9, %xmm2
2644; SSSE3-NEXT:    pshufb %xmm2, %xmm7
2645; SSSE3-NEXT:    psrlw $4, %xmm3
2646; SSSE3-NEXT:    pand %xmm9, %xmm3
2647; SSSE3-NEXT:    pshufb %xmm3, %xmm4
2648; SSSE3-NEXT:    por %xmm7, %xmm4
2649; SSSE3-NEXT:    movdqa %xmm5, %xmm2
2650; SSSE3-NEXT:    movdqa %xmm4, %xmm3
2651; SSSE3-NEXT:    retq
2652;
2653; AVX1-LABEL: test_bitreverse_v16i32:
2654; AVX1:       # %bb.0:
2655; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2656; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2657; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2658; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2659; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2660; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2661; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2662; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2663; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2664; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2665; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2666; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2667; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2668; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
2669; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2670; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
2671; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2672; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
2673; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
2674; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2675; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2676; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2677; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2678; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2679; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2680; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2681; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2682; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2683; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2684; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
2685; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
2686; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
2687; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2688; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
2689; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
2690; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2691; AVX1-NEXT:    retq
2692;
2693; AVX2-LABEL: test_bitreverse_v16i32:
2694; AVX2:       # %bb.0:
2695; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2696; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2697; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2698; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
2699; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2700; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2701; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2702; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
2703; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2704; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
2705; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
2706; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2707; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
2708; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2709; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
2710; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2711; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
2712; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
2713; AVX2-NEXT:    retq
2714;
2715; AVX512F-LABEL: test_bitreverse_v16i32:
2716; AVX512F:       # %bb.0:
2717; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2718; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2719; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2720; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2721; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
2722; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2723; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2724; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2725; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
2726; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2727; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
2728; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
2729; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
2730; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2731; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
2732; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
2733; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
2734; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
2735; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2736; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
2737; AVX512F-NEXT:    retq
2738;
2739; AVX512BW-LABEL: test_bitreverse_v16i32:
2740; AVX512BW:       # %bb.0:
2741; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
2742; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2743; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2744; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2745; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2746; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2747; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2748; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2749; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2750; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2751; AVX512BW-NEXT:    retq
2752;
2753; XOPAVX1-LABEL: test_bitreverse_v16i32:
2754; XOPAVX1:       # %bb.0:
2755; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2756; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
2757; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2758; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2759; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2760; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2761; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2762; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2763; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2764; XOPAVX1-NEXT:    retq
2765;
2766; XOPAVX2-LABEL: test_bitreverse_v16i32:
2767; XOPAVX2:       # %bb.0:
2768; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2769; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
2770; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2771; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2772; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2773; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2774; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2775; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2776; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2777; XOPAVX2-NEXT:    retq
2778;
2779; GFNISSE-LABEL: test_bitreverse_v16i32:
2780; GFNISSE:       # %bb.0:
2781; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2782; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
2783; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
2784; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
2785; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
2786; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
2787; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
2788; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
2789; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
2790; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
2791; GFNISSE-NEXT:    retq
2792;
2793; GFNIAVX-LABEL: test_bitreverse_v16i32:
2794; GFNIAVX:       # %bb.0:
2795; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
2796; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2797; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2798; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
2799; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2800; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2801; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
2802; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2803; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
2804; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2805; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2806; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2807; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
2808; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2809; GFNIAVX-NEXT:    retq
2810;
2811; GFNIAVX2-LABEL: test_bitreverse_v16i32:
2812; GFNIAVX2:       # %bb.0:
2813; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2814; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2815; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2816; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2817; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2818; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2819; GFNIAVX2-NEXT:    retq
2820;
2821; GFNIAVX512F-LABEL: test_bitreverse_v16i32:
2822; GFNIAVX512F:       # %bb.0:
2823; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2824; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2825; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2826; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2827; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2828; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2829; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2830; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2831; GFNIAVX512F-NEXT:    retq
2832;
2833; GFNIAVX512BW-LABEL: test_bitreverse_v16i32:
2834; GFNIAVX512BW:       # %bb.0:
2835; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
2836; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2837; GFNIAVX512BW-NEXT:    retq
2838  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
2839  ret <16 x i32> %b
2840}
2841
; Lowering test for llvm.bitreverse on <8 x i64>. The function body is a single
; intrinsic call on the argument; the autogenerated assertions inside it pin the
; exact instruction sequence expected for each FileCheck prefix named on the
; RUN lines at the top of the file. Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; Pattern visible in the expected output below:
;  - Sequences that use pshufb/vpshufb first reverse the byte order inside each
;    64-bit lane (shuffle mask 7,6,5,4,3,2,1,0,15,...,8) and then reverse the
;    bits of every byte, either through two 16-entry nibble lookup tables
;    combined with por/vpor, or through one gf2p8affineqb per vector.
;  - The SSE2 sequence has no byte shuffle; it uses unpack + word/dword shuffles
;    + pack, followed by shift-and-mask swaps with the masks 15, 51 and 85.
;  - The XOP sequences handle each 128-bit half with a single vpperm.
2842define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
2843; SSE2-LABEL: test_bitreverse_v8i64:
2844; SSE2:       # %bb.0:
2845; SSE2-NEXT:    movdqa %xmm3, %xmm4
2846; SSE2-NEXT:    pxor %xmm8, %xmm8
2847; SSE2-NEXT:    movdqa %xmm0, %xmm3
2848; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
2849; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
2850; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
2851; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
2852; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
2853; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2854; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2855; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2856; SSE2-NEXT:    packuswb %xmm3, %xmm0
2857; SSE2-NEXT:    movdqa %xmm0, %xmm6
2858; SSE2-NEXT:    psllw $4, %xmm6
2859; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2860; SSE2-NEXT:    movdqa %xmm3, %xmm7
2861; SSE2-NEXT:    pandn %xmm6, %xmm7
2862; SSE2-NEXT:    psrlw $4, %xmm0
2863; SSE2-NEXT:    pand %xmm3, %xmm0
2864; SSE2-NEXT:    por %xmm7, %xmm0
2865; SSE2-NEXT:    movdqa %xmm0, %xmm7
2866; SSE2-NEXT:    psrlw $2, %xmm7
2867; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2868; SSE2-NEXT:    pand %xmm9, %xmm7
2869; SSE2-NEXT:    pand %xmm9, %xmm0
2870; SSE2-NEXT:    psllw $2, %xmm0
2871; SSE2-NEXT:    por %xmm7, %xmm0
2872; SSE2-NEXT:    movdqa %xmm0, %xmm5
2873; SSE2-NEXT:    psrlw $1, %xmm5
2874; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2875; SSE2-NEXT:    pand %xmm7, %xmm5
2876; SSE2-NEXT:    pand %xmm7, %xmm0
2877; SSE2-NEXT:    paddb %xmm0, %xmm0
2878; SSE2-NEXT:    por %xmm5, %xmm0
2879; SSE2-NEXT:    movdqa %xmm1, %xmm5
2880; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
2881; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
2882; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
2883; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
2884; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
2885; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2886; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2887; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2888; SSE2-NEXT:    packuswb %xmm5, %xmm1
2889; SSE2-NEXT:    movdqa %xmm1, %xmm5
2890; SSE2-NEXT:    psllw $4, %xmm5
2891; SSE2-NEXT:    movdqa %xmm3, %xmm6
2892; SSE2-NEXT:    pandn %xmm5, %xmm6
2893; SSE2-NEXT:    psrlw $4, %xmm1
2894; SSE2-NEXT:    pand %xmm3, %xmm1
2895; SSE2-NEXT:    por %xmm6, %xmm1
2896; SSE2-NEXT:    movdqa %xmm1, %xmm5
2897; SSE2-NEXT:    psrlw $2, %xmm5
2898; SSE2-NEXT:    pand %xmm9, %xmm5
2899; SSE2-NEXT:    pand %xmm9, %xmm1
2900; SSE2-NEXT:    psllw $2, %xmm1
2901; SSE2-NEXT:    por %xmm5, %xmm1
2902; SSE2-NEXT:    movdqa %xmm1, %xmm5
2903; SSE2-NEXT:    psrlw $1, %xmm5
2904; SSE2-NEXT:    pand %xmm7, %xmm5
2905; SSE2-NEXT:    pand %xmm7, %xmm1
2906; SSE2-NEXT:    paddb %xmm1, %xmm1
2907; SSE2-NEXT:    por %xmm5, %xmm1
2908; SSE2-NEXT:    movdqa %xmm2, %xmm5
2909; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
2910; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
2911; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
2912; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
2913; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
2914; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
2915; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
2916; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
2917; SSE2-NEXT:    packuswb %xmm5, %xmm2
2918; SSE2-NEXT:    movdqa %xmm2, %xmm5
2919; SSE2-NEXT:    psllw $4, %xmm5
2920; SSE2-NEXT:    movdqa %xmm3, %xmm6
2921; SSE2-NEXT:    pandn %xmm5, %xmm6
2922; SSE2-NEXT:    psrlw $4, %xmm2
2923; SSE2-NEXT:    pand %xmm3, %xmm2
2924; SSE2-NEXT:    por %xmm6, %xmm2
2925; SSE2-NEXT:    movdqa %xmm2, %xmm5
2926; SSE2-NEXT:    psrlw $2, %xmm5
2927; SSE2-NEXT:    pand %xmm9, %xmm5
2928; SSE2-NEXT:    pand %xmm9, %xmm2
2929; SSE2-NEXT:    psllw $2, %xmm2
2930; SSE2-NEXT:    por %xmm5, %xmm2
2931; SSE2-NEXT:    movdqa %xmm2, %xmm5
2932; SSE2-NEXT:    psrlw $1, %xmm5
2933; SSE2-NEXT:    pand %xmm7, %xmm5
2934; SSE2-NEXT:    pand %xmm7, %xmm2
2935; SSE2-NEXT:    paddb %xmm2, %xmm2
2936; SSE2-NEXT:    por %xmm5, %xmm2
2937; SSE2-NEXT:    movdqa %xmm4, %xmm5
2938; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
2939; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
2940; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
2941; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
2942; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
2943; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2944; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2945; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2946; SSE2-NEXT:    packuswb %xmm5, %xmm4
2947; SSE2-NEXT:    movdqa %xmm4, %xmm5
2948; SSE2-NEXT:    psllw $4, %xmm5
2949; SSE2-NEXT:    psrlw $4, %xmm4
2950; SSE2-NEXT:    pand %xmm3, %xmm4
2951; SSE2-NEXT:    pandn %xmm5, %xmm3
2952; SSE2-NEXT:    por %xmm4, %xmm3
2953; SSE2-NEXT:    movdqa %xmm3, %xmm4
2954; SSE2-NEXT:    psrlw $2, %xmm4
2955; SSE2-NEXT:    pand %xmm9, %xmm4
2956; SSE2-NEXT:    pand %xmm9, %xmm3
2957; SSE2-NEXT:    psllw $2, %xmm3
2958; SSE2-NEXT:    por %xmm4, %xmm3
2959; SSE2-NEXT:    movdqa %xmm3, %xmm4
2960; SSE2-NEXT:    psrlw $1, %xmm4
2961; SSE2-NEXT:    pand %xmm7, %xmm4
2962; SSE2-NEXT:    pand %xmm7, %xmm3
2963; SSE2-NEXT:    paddb %xmm3, %xmm3
2964; SSE2-NEXT:    por %xmm4, %xmm3
2965; SSE2-NEXT:    retq
2966;
2967; SSSE3-LABEL: test_bitreverse_v8i64:
2968; SSSE3:       # %bb.0:
2969; SSSE3-NEXT:    movdqa %xmm1, %xmm5
2970; SSSE3-NEXT:    movdqa %xmm0, %xmm1
2971; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2972; SSSE3-NEXT:    pshufb %xmm8, %xmm1
2973; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2974; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2975; SSSE3-NEXT:    pand %xmm9, %xmm0
2976; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2977; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2978; SSSE3-NEXT:    pshufb %xmm0, %xmm6
2979; SSSE3-NEXT:    psrlw $4, %xmm1
2980; SSSE3-NEXT:    pand %xmm9, %xmm1
2981; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2982; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2983; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2984; SSSE3-NEXT:    por %xmm6, %xmm0
2985; SSSE3-NEXT:    pshufb %xmm8, %xmm5
2986; SSSE3-NEXT:    movdqa %xmm5, %xmm1
2987; SSSE3-NEXT:    pand %xmm9, %xmm1
2988; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2989; SSSE3-NEXT:    pshufb %xmm1, %xmm6
2990; SSSE3-NEXT:    psrlw $4, %xmm5
2991; SSSE3-NEXT:    pand %xmm9, %xmm5
2992; SSSE3-NEXT:    movdqa %xmm4, %xmm1
2993; SSSE3-NEXT:    pshufb %xmm5, %xmm1
2994; SSSE3-NEXT:    por %xmm6, %xmm1
2995; SSSE3-NEXT:    pshufb %xmm8, %xmm2
2996; SSSE3-NEXT:    movdqa %xmm2, %xmm5
2997; SSSE3-NEXT:    pand %xmm9, %xmm5
2998; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2999; SSSE3-NEXT:    pshufb %xmm5, %xmm6
3000; SSSE3-NEXT:    psrlw $4, %xmm2
3001; SSSE3-NEXT:    pand %xmm9, %xmm2
3002; SSSE3-NEXT:    movdqa %xmm4, %xmm5
3003; SSSE3-NEXT:    pshufb %xmm2, %xmm5
3004; SSSE3-NEXT:    por %xmm6, %xmm5
3005; SSSE3-NEXT:    pshufb %xmm8, %xmm3
3006; SSSE3-NEXT:    movdqa %xmm3, %xmm2
3007; SSSE3-NEXT:    pand %xmm9, %xmm2
3008; SSSE3-NEXT:    pshufb %xmm2, %xmm7
3009; SSSE3-NEXT:    psrlw $4, %xmm3
3010; SSSE3-NEXT:    pand %xmm9, %xmm3
3011; SSSE3-NEXT:    pshufb %xmm3, %xmm4
3012; SSSE3-NEXT:    por %xmm7, %xmm4
3013; SSSE3-NEXT:    movdqa %xmm5, %xmm2
3014; SSSE3-NEXT:    movdqa %xmm4, %xmm3
3015; SSSE3-NEXT:    retq
3016;
3017; AVX1-LABEL: test_bitreverse_v8i64:
3018; AVX1:       # %bb.0:
3019; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3020; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3021; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
3022; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3023; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
3024; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3025; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
3026; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
3027; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
3028; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3029; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
3030; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
3031; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
3032; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
3033; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
3034; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
3035; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
3036; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
3037; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
3038; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3039; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3040; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
3041; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
3042; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
3043; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
3044; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
3045; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
3046; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
3047; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
3048; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
3049; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
3050; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
3051; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
3052; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
3053; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
3054; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
3055; AVX1-NEXT:    retq
3056;
3057; AVX2-LABEL: test_bitreverse_v8i64:
3058; AVX2:       # %bb.0:
3059; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3060; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3061; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3062; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
3063; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3064; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
3065; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
3066; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
3067; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3068; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
3069; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
3070; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3071; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
3072; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
3073; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
3074; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
3075; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
3076; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
3077; AVX2-NEXT:    retq
3078;
3079; AVX512F-LABEL: test_bitreverse_v8i64:
3080; AVX512F:       # %bb.0:
3081; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
3082; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3083; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3084; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3085; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
3086; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3087; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
3088; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3089; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
3090; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
3091; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
3092; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
3093; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
3094; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3095; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
3096; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
3097; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
3098; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
3099; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3100; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
3101; AVX512F-NEXT:    retq
3102;
3103; AVX512BW-LABEL: test_bitreverse_v8i64:
3104; AVX512BW:       # %bb.0:
3105; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
3106; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3107; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
3108; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3109; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
3110; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
3111; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
3112; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3113; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
3114; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
3115; AVX512BW-NEXT:    retq
3116;
3117; XOPAVX1-LABEL: test_bitreverse_v8i64:
3118; XOPAVX1:       # %bb.0:
3119; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3120; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
3121; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
3122; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
3123; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3124; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3125; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
3126; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
3127; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
3128; XOPAVX1-NEXT:    retq
3129;
3130; XOPAVX2-LABEL: test_bitreverse_v8i64:
3131; XOPAVX2:       # %bb.0:
3132; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
3133; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
3134; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
3135; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
3136; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
3137; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
3138; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
3139; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
3140; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
3141; XOPAVX2-NEXT:    retq
3142;
3143; GFNISSE-LABEL: test_bitreverse_v8i64:
3144; GFNISSE:       # %bb.0:
3145; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3146; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
3147; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
3148; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
3149; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
3150; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
3151; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
3152; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
3153; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
3154; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
3155; GFNISSE-NEXT:    retq
3156;
3157; GFNIAVX-LABEL: test_bitreverse_v8i64:
3158; GFNIAVX:       # %bb.0:
3159; GFNIAVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
3160; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3161; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
3162; GFNIAVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
3163; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
3164; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
3165; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
3166; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3167; GFNIAVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
3168; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
3169; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
3170; GFNIAVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
3171; GFNIAVX-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
3172; GFNIAVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
3173; GFNIAVX-NEXT:    retq
3174;
3175; GFNIAVX2-LABEL: test_bitreverse_v8i64:
3176; GFNIAVX2:       # %bb.0:
3177; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3178; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3179; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
3180; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
3181; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3182; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
3183; GFNIAVX2-NEXT:    retq
3184;
3185; GFNIAVX512F-LABEL: test_bitreverse_v8i64:
3186; GFNIAVX512F:       # %bb.0:
3187; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
3188; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3189; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3190; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
3191; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
3192; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3193; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
3194; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3195; GFNIAVX512F-NEXT:    retq
3196;
3197; GFNIAVX512BW-LABEL: test_bitreverse_v8i64:
3198; GFNIAVX512BW:       # %bb.0:
3199; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
3200; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
3201; GFNIAVX512BW-NEXT:    retq
3202  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
3203  ret <8 x i64> %b
3204}
3205
3206;
3207; Constant Folding
3208;
3209
; Constant-folding test for the scalar intrinsic: llvm.bitreverse.i32 is called
; on the literal 4278255360 (0xFF00FF00). The shared expectation for every RUN
; line is just a movl of the immediate 16711935 (0xFF00FF) followed by retq,
; i.e. no bit-reversal instructions are expected in the output.
3210define i32 @fold_bitreverse_i32() nounwind {
3211; ALL-LABEL: fold_bitreverse_i32:
3212; ALL:       # %bb.0:
3213; ALL-NEXT:    movl $16711935, %eax # imm = 0xFF00FF
3214; ALL-NEXT:    retq
3215  %b = call i32 @llvm.bitreverse.i32(i32 4278255360)
3216  ret i32 %b
3217}
3218
; Constant-folding test for a 128-bit vector: llvm.bitreverse.v16i8 is called on
; a literal <16 x i8>. Every prefix expects a single movaps/vmovaps that
; materializes the already-reversed vector and then retq. Each expected element
; is the bit-reversed input byte printed as unsigned, e.g. input 2 gives 64 and
; input -3 (0xFD) gives 191 (0xBF).
3219define <16 x i8> @fold_bitreverse_v16i8() nounwind {
3220; SSE-LABEL: fold_bitreverse_v16i8:
3221; SSE:       # %bb.0:
3222; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3223; SSE-NEXT:    retq
3224;
3225; AVX-LABEL: fold_bitreverse_v16i8:
3226; AVX:       # %bb.0:
3227; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3228; AVX-NEXT:    retq
3229;
3230; XOP-LABEL: fold_bitreverse_v16i8:
3231; XOP:       # %bb.0:
3232; XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3233; XOP-NEXT:    retq
3234;
3235; GFNISSE-LABEL: fold_bitreverse_v16i8:
3236; GFNISSE:       # %bb.0:
3237; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3238; GFNISSE-NEXT:    retq
3239;
3240; GFNIAVX-LABEL: fold_bitreverse_v16i8:
3241; GFNIAVX:       # %bb.0:
3242; GFNIAVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3243; GFNIAVX-NEXT:    retq
3244;
3245; GFNIAVX2-LABEL: fold_bitreverse_v16i8:
3246; GFNIAVX2:       # %bb.0:
3247; GFNIAVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3248; GFNIAVX2-NEXT:    retq
3249;
3250; GFNIAVX512F-LABEL: fold_bitreverse_v16i8:
3251; GFNIAVX512F:       # %bb.0:
3252; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3253; GFNIAVX512F-NEXT:    retq
3254;
3255; GFNIAVX512BW-LABEL: fold_bitreverse_v16i8:
3256; GFNIAVX512BW:       # %bb.0:
3257; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
3258; GFNIAVX512BW-NEXT:    retq
3259  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
3260  ret <16 x i8> %b
3261}
3262
; Constant folding: llvm.bitreverse.v16i16 applied to a constant vector.
; Each i16 lane is bit-reversed independently (e.g. 2 = 0x0002 -> 0x4000 =
; 16384 and -3 = 0xFFFD -> 0xBFFF = 49151). The 256-bit result is expected as
; two xmm constant loads (xmm0, xmm1) under the SSE and GFNISSE prefixes and
; as a single ymm0 constant load under all other checked prefixes.
; The CHECK lines below are autogenerated FileCheck directives; regenerate
; them with utils/update_llc_test_checks.py rather than editing by hand.
3263define <16 x i16> @fold_bitreverse_v16i16() nounwind {
3264; SSE-LABEL: fold_bitreverse_v16i16:
3265; SSE:       # %bb.0:
3266; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
3267; SSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
3268; SSE-NEXT:    retq
3269;
3270; AVX-LABEL: fold_bitreverse_v16i16:
3271; AVX:       # %bb.0:
3272; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
3273; AVX-NEXT:    retq
3274;
3275; XOP-LABEL: fold_bitreverse_v16i16:
3276; XOP:       # %bb.0:
3277; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
3278; XOP-NEXT:    retq
3279;
3280; GFNISSE-LABEL: fold_bitreverse_v16i16:
3281; GFNISSE:       # %bb.0:
3282; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
3283; GFNISSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
3284; GFNISSE-NEXT:    retq
3285;
3286; GFNIAVX-LABEL: fold_bitreverse_v16i16:
3287; GFNIAVX:       # %bb.0:
3288; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
3289; GFNIAVX-NEXT:    retq
3290;
3291; GFNIAVX2-LABEL: fold_bitreverse_v16i16:
3292; GFNIAVX2:       # %bb.0:
3293; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
3294; GFNIAVX2-NEXT:    retq
3295;
3296; GFNIAVX512F-LABEL: fold_bitreverse_v16i16:
3297; GFNIAVX512F:       # %bb.0:
3298; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
3299; GFNIAVX512F-NEXT:    retq
3300;
3301; GFNIAVX512BW-LABEL: fold_bitreverse_v16i16:
3302; GFNIAVX512BW:       # %bb.0:
3303; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
3304; GFNIAVX512BW-NEXT:    retq
3305  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
3306  ret <16 x i16> %b
3307}
3308
; Constant folding: llvm.bitreverse.v16i32 applied to a constant vector.
; Each i32 lane is bit-reversed independently (e.g. 2 -> 0x40000000 =
; 1073741824 and -3 = 0xFFFFFFFD -> 0xBFFFFFFF = 3221225471). The 512-bit
; result is expected as:
;   - four xmm constant loads (xmm0-xmm3) under SSE and GFNISSE,
;   - two ymm constant loads (ymm0, ymm1) under AVX1, AVX2, XOP, GFNIAVX and
;     GFNIAVX2,
;   - one zmm0 constant load under AVX512, GFNIAVX512F and GFNIAVX512BW.
; The CHECK lines below are autogenerated FileCheck directives; regenerate
; them with utils/update_llc_test_checks.py rather than editing by hand.
3309define <16 x i32> @fold_bitreverse_v16i32() nounwind {
3310; SSE-LABEL: fold_bitreverse_v16i32:
3311; SSE:       # %bb.0:
3312; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
3313; SSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
3314; SSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
3315; SSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
3316; SSE-NEXT:    retq
3317;
3318; AVX1-LABEL: fold_bitreverse_v16i32:
3319; AVX1:       # %bb.0:
3320; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3321; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3322; AVX1-NEXT:    retq
3323;
3324; AVX2-LABEL: fold_bitreverse_v16i32:
3325; AVX2:       # %bb.0:
3326; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3327; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3328; AVX2-NEXT:    retq
3329;
3330; AVX512-LABEL: fold_bitreverse_v16i32:
3331; AVX512:       # %bb.0:
3332; AVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3333; AVX512-NEXT:    retq
3334;
3335; XOP-LABEL: fold_bitreverse_v16i32:
3336; XOP:       # %bb.0:
3337; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3338; XOP-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3339; XOP-NEXT:    retq
3340;
3341; GFNISSE-LABEL: fold_bitreverse_v16i32:
3342; GFNISSE:       # %bb.0:
3343; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
3344; GFNISSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
3345; GFNISSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
3346; GFNISSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
3347; GFNISSE-NEXT:    retq
3348;
3349; GFNIAVX-LABEL: fold_bitreverse_v16i32:
3350; GFNIAVX:       # %bb.0:
3351; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3352; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3353; GFNIAVX-NEXT:    retq
3354;
3355; GFNIAVX2-LABEL: fold_bitreverse_v16i32:
3356; GFNIAVX2:       # %bb.0:
3357; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3358; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3359; GFNIAVX2-NEXT:    retq
3360;
3361; GFNIAVX512F-LABEL: fold_bitreverse_v16i32:
3362; GFNIAVX512F:       # %bb.0:
3363; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3364; GFNIAVX512F-NEXT:    retq
3365;
3366; GFNIAVX512BW-LABEL: fold_bitreverse_v16i32:
3367; GFNIAVX512BW:       # %bb.0:
3368; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3369; GFNIAVX512BW-NEXT:    retq
3370  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
3371  ret <16 x i32> %b
3372}
3373
; Declarations of the llvm.bitreverse intrinsic overloads, grouped by operand
; width: scalar i8/i16/i32/i64, then 128-bit, 256-bit and 512-bit vectors of
; i8/i16/i32/i64 elements. All are declared readnone.
3374declare i8 @llvm.bitreverse.i8(i8) readnone
3375declare i16 @llvm.bitreverse.i16(i16) readnone
3376declare i32 @llvm.bitreverse.i32(i32) readnone
3377declare i64 @llvm.bitreverse.i64(i64) readnone
3378
3379declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
3380declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
3381declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
3382declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone
3383
3384declare <32 x i8>  @llvm.bitreverse.v32i8(<32 x i8>) readnone
3385declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
3386declare <8 x i32>  @llvm.bitreverse.v8i32(<8 x i32>) readnone
3387declare <4 x i64>  @llvm.bitreverse.v4i64(<4 x i64>) readnone
3388
3389declare <64 x i8>  @llvm.bitreverse.v64i8(<64 x i8>) readnone
3390declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
3391declare <16 x i32>  @llvm.bitreverse.v16i32(<16 x i32>) readnone
3392declare <8 x i64>  @llvm.bitreverse.v8i64(<8 x i64>) readnone
3393