; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX512,GFNIAVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX512,GFNIAVX512BW

; Make sure we don't crash with avx512bw and xop
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw

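; Scalar i8: non-XOP targets (including the GFNI prefixes) use the rolb $4
; nibble swap followed by masked shifts for the 2-bit and 1-bit swaps; XOP
; reverses the bits with a single VPPERM table lookup.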
define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE:       # %bb.0:
; SSE-NEXT:    rolb $4, %dil
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andb $51, %al
; SSE-NEXT:    shlb $2, %al
; SSE-NEXT:    shrb $2, %dil
; SSE-NEXT:    andb $51, %dil
; SSE-NEXT:    orb %dil, %al
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andb $85, %cl
; SSE-NEXT:    addb %cl, %cl
; SSE-NEXT:    shrb %al
; SSE-NEXT:    andb $85, %al
; SSE-NEXT:    orb %cl, %al
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX:       # %bb.0:
; AVX-NEXT:    rolb $4, %dil
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andb $51, %al
; AVX-NEXT:    shlb $2, %al
; AVX-NEXT:    shrb $2, %dil
; AVX-NEXT:    andb $51, %dil
; AVX-NEXT:    orb %dil, %al
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andb $85, %cl
; AVX-NEXT:    addb %cl, %cl
; AVX-NEXT:    shrb %al
; AVX-NEXT:    andb $85, %al
; AVX-NEXT:    orb %cl, %al
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    # kill: def $al killed $al killed $eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    rolb $4, %dil
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andb $51, %al
; GFNISSE-NEXT:    shlb $2, %al
; GFNISSE-NEXT:    shrb $2, %dil
; GFNISSE-NEXT:    andb $51, %dil
; GFNISSE-NEXT:    orb %dil, %al
; GFNISSE-NEXT:    movl %eax, %ecx
; GFNISSE-NEXT:    andb $85, %cl
; GFNISSE-NEXT:    addb %cl, %cl
; GFNISSE-NEXT:    shrb %al
; GFNISSE-NEXT:    andb $85, %al
; GFNISSE-NEXT:    orb %cl, %al
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    rolb $4, %dil
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andb $51, %al
; GFNIAVX-NEXT:    shlb $2, %al
; GFNIAVX-NEXT:    shrb $2, %dil
; GFNIAVX-NEXT:    andb $51, %dil
; GFNIAVX-NEXT:    orb %dil, %al
; GFNIAVX-NEXT:    movl %eax, %ecx
; GFNIAVX-NEXT:    andb $85, %cl
; GFNIAVX-NEXT:    addb %cl, %cl
; GFNIAVX-NEXT:    shrb %al
; GFNIAVX-NEXT:    andb $85, %al
; GFNIAVX-NEXT:    orb %cl, %al
; GFNIAVX-NEXT:    retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}

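; Scalar i16: a rolw $8 byte swap followed by the 0xF0F/0x3333/0x5555 masked
; swaps, with LEA folding the final shift+or steps; XOP is again a single VPPERM.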
define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    rolw $8, %di
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $3855, %eax # imm = 0xF0F
; SSE-NEXT:    shll $4, %eax
; SSE-NEXT:    shrl $4, %edi
; SSE-NEXT:    andl $3855, %edi # imm = 0xF0F
; SSE-NEXT:    orl %eax, %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $13107, %eax # imm = 0x3333
; SSE-NEXT:    shrl $2, %edi
; SSE-NEXT:    andl $13107, %edi # imm = 0x3333
; SSE-NEXT:    leal (%rdi,%rax,4), %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $21845, %ecx # imm = 0x5555
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    andl $21845, %eax # imm = 0x5555
; SSE-NEXT:    leal (%rax,%rcx,2), %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    rolw $8, %di
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $3855, %eax # imm = 0xF0F
; AVX-NEXT:    shll $4, %eax
; AVX-NEXT:    shrl $4, %edi
; AVX-NEXT:    andl $3855, %edi # imm = 0xF0F
; AVX-NEXT:    orl %eax, %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $13107, %eax # imm = 0x3333
; AVX-NEXT:    shrl $2, %edi
; AVX-NEXT:    andl $13107, %edi # imm = 0x3333
; AVX-NEXT:    leal (%rdi,%rax,4), %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $21845, %ecx # imm = 0x5555
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    andl $21845, %eax # imm = 0x5555
; AVX-NEXT:    leal (%rax,%rcx,2), %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    # kill: def $ax killed $ax killed $eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNISSE-NEXT:    rolw $8, %di
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andl $3855, %eax # imm = 0xF0F
; GFNISSE-NEXT:    shll $4, %eax
; GFNISSE-NEXT:    shrl $4, %edi
; GFNISSE-NEXT:    andl $3855, %edi # imm = 0xF0F
; GFNISSE-NEXT:    orl %eax, %edi
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andl $13107, %eax # imm = 0x3333
; GFNISSE-NEXT:    shrl $2, %edi
; GFNISSE-NEXT:    andl $13107, %edi # imm = 0x3333
; GFNISSE-NEXT:    leal (%rdi,%rax,4), %eax
; GFNISSE-NEXT:    movl %eax, %ecx
; GFNISSE-NEXT:    andl $21845, %ecx # imm = 0x5555
; GFNISSE-NEXT:    shrl %eax
; GFNISSE-NEXT:    andl $21845, %eax # imm = 0x5555
; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
; GFNISSE-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX-NEXT:    rolw $8, %di
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andl $3855, %eax # imm = 0xF0F
; GFNIAVX-NEXT:    shll $4, %eax
; GFNIAVX-NEXT:    shrl $4, %edi
; GFNIAVX-NEXT:    andl $3855, %edi # imm = 0xF0F
; GFNIAVX-NEXT:    orl %eax, %edi
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andl $13107, %eax # imm = 0x3333
; GFNIAVX-NEXT:    shrl $2, %edi
; GFNIAVX-NEXT:    andl $13107, %edi # imm = 0x3333
; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX-NEXT:    movl %eax, %ecx
; GFNIAVX-NEXT:    andl $21845, %ecx # imm = 0x5555
; GFNIAVX-NEXT:    shrl %eax
; GFNIAVX-NEXT:    andl $21845, %eax # imm = 0x5555
; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNIAVX-NEXT:    retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}

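; Scalar i32: bswapl plus the 0xF0F0F0F/0x33333333/0x55555555 masked swaps.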
define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    bswapl %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; SSE-NEXT:    shll $4, %eax
; SSE-NEXT:    shrl $4, %edi
; SSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; SSE-NEXT:    orl %eax, %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
; SSE-NEXT:    shrl $2, %edi
; SSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
; SSE-NEXT:    leal (%rdi,%rax,4), %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; SSE-NEXT:    leal (%rax,%rcx,2), %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    bswapl %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; AVX-NEXT:    shll $4, %eax
; AVX-NEXT:    shrl $4, %edi
; AVX-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; AVX-NEXT:    orl %eax, %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
; AVX-NEXT:    shrl $2, %edi
; AVX-NEXT:    andl $858993459, %edi # imm = 0x33333333
; AVX-NEXT:    leal (%rdi,%rax,4), %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; AVX-NEXT:    leal (%rax,%rcx,2), %eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNISSE-NEXT:    bswapl %edi
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; GFNISSE-NEXT:    shll $4, %eax
; GFNISSE-NEXT:    shrl $4, %edi
; GFNISSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; GFNISSE-NEXT:    orl %eax, %edi
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
; GFNISSE-NEXT:    shrl $2, %edi
; GFNISSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
; GFNISSE-NEXT:    leal (%rdi,%rax,4), %eax
; GFNISSE-NEXT:    movl %eax, %ecx
; GFNISSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; GFNISSE-NEXT:    shrl %eax
; GFNISSE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i32:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX-NEXT:    bswapl %edi
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX-NEXT:    shll $4, %eax
; GFNIAVX-NEXT:    shrl $4, %edi
; GFNIAVX-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; GFNIAVX-NEXT:    orl %eax, %edi
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
; GFNIAVX-NEXT:    shrl $2, %edi
; GFNIAVX-NEXT:    andl $858993459, %edi # imm = 0x33333333
; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX-NEXT:    movl %eax, %ecx
; GFNIAVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; GFNIAVX-NEXT:    shrl %eax
; GFNIAVX-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX-NEXT:    retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}

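; Scalar i64: bswapq plus the same swaps with 64-bit masks materialized via movabsq.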
define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    bswapq %rdi
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    shrq $4, %rax
; SSE-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; SSE-NEXT:    andq %rcx, %rax
; SSE-NEXT:    andq %rcx, %rdi
; SSE-NEXT:    shlq $4, %rdi
; SSE-NEXT:    orq %rax, %rdi
; SSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    andq %rax, %rcx
; SSE-NEXT:    shrq $2, %rdi
; SSE-NEXT:    andq %rax, %rdi
; SSE-NEXT:    leaq (%rdi,%rcx,4), %rax
; SSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE-NEXT:    movq %rax, %rdx
; SSE-NEXT:    andq %rcx, %rdx
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    andq %rcx, %rax
; SSE-NEXT:    leaq (%rax,%rdx,2), %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    bswapq %rdi
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    shrq $4, %rax
; AVX-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; AVX-NEXT:    andq %rcx, %rax
; AVX-NEXT:    andq %rcx, %rdi
; AVX-NEXT:    shlq $4, %rdi
; AVX-NEXT:    orq %rax, %rdi
; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    andq %rax, %rcx
; AVX-NEXT:    shrq $2, %rdi
; AVX-NEXT:    andq %rax, %rdi
; AVX-NEXT:    leaq (%rdi,%rcx,4), %rax
; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    andq %rcx, %rdx
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    andq %rcx, %rax
; AVX-NEXT:    leaq (%rax,%rdx,2), %rax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovq %xmm0, %rax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i64:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    bswapq %rdi
; GFNISSE-NEXT:    movq %rdi, %rax
; GFNISSE-NEXT:    shrq $4, %rax
; GFNISSE-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; GFNISSE-NEXT:    andq %rcx, %rax
; GFNISSE-NEXT:    andq %rcx, %rdi
; GFNISSE-NEXT:    shlq $4, %rdi
; GFNISSE-NEXT:    orq %rax, %rdi
; GFNISSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNISSE-NEXT:    movq %rdi, %rcx
; GFNISSE-NEXT:    andq %rax, %rcx
; GFNISSE-NEXT:    shrq $2, %rdi
; GFNISSE-NEXT:    andq %rax, %rdi
; GFNISSE-NEXT:    leaq (%rdi,%rcx,4), %rax
; GFNISSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNISSE-NEXT:    movq %rax, %rdx
; GFNISSE-NEXT:    andq %rcx, %rdx
; GFNISSE-NEXT:    shrq %rax
; GFNISSE-NEXT:    andq %rcx, %rax
; GFNISSE-NEXT:    leaq (%rax,%rdx,2), %rax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i64:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    bswapq %rdi
; GFNIAVX-NEXT:    movq %rdi, %rax
; GFNIAVX-NEXT:    shrq $4, %rax
; GFNIAVX-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX-NEXT:    andq %rcx, %rax
; GFNIAVX-NEXT:    andq %rcx, %rdi
; GFNIAVX-NEXT:    shlq $4, %rdi
; GFNIAVX-NEXT:    orq %rax, %rdi
; GFNIAVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNIAVX-NEXT:    movq %rdi, %rcx
; GFNIAVX-NEXT:    andq %rax, %rcx
; GFNIAVX-NEXT:    shrq $2, %rdi
; GFNIAVX-NEXT:    andq %rax, %rdi
; GFNIAVX-NEXT:    leaq (%rdi,%rcx,4), %rax
; GFNIAVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNIAVX-NEXT:    movq %rax, %rdx
; GFNIAVX-NEXT:    andq %rcx, %rdx
; GFNIAVX-NEXT:    shrq %rax
; GFNIAVX-NEXT:    andq %rcx, %rax
; GFNIAVX-NEXT:    leaq (%rax,%rdx,2), %rax
; GFNIAVX-NEXT:    retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}

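; v16i8: SSE2 swaps nibble/2-bit/1-bit groups with shifts and masks; SSSE3/AVX
; use two PSHUFB nibble lookup tables; XOP is a single VPPERM; GFNI is a single
; GF2P8AFFINEQB with the bit-reversal matrix.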
define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v16i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v16i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
  ret <16 x i8> %b
}

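; v8i16: as v16i8, preceded by a byte swap within each 16-bit element
; (PSHUFB/VPSHUFB byte shuffle, or PSRLW/PSLLW/POR on SSE2).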
define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v8i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v8i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  ret <8 x i16> %b
}

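; v4i32: byte-reverse each 32-bit element (punpck/pshuflw/pshufhw/packuswb on
; SSE2, a single byte shuffle elsewhere), then reverse the bits within each byte.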
define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v4i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v4i32:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
  ret <4 x i32> %b
}

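; v2i64: same pattern with an 8-byte reversal per 64-bit element.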
define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v2i64:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v2i64:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
  ret <2 x i64> %b
}

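; 256-bit tests: SSE, XOP-AVX1/AVX2, GFNISSE and GFNI-AVX1 process the two
; 128-bit halves separately; AVX2/AVX512 and GFNI-AVX2/AVX512 operate on the
; full ymm register.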
define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrlw $1, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    psrlw $4, %xmm5
; SSE2-NEXT:    pand %xmm2, %xmm5
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v32i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm5, %xmm6
; SSSE3-NEXT:    pshufb %xmm2, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm6, %xmm3
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm5
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v32i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
; GFNISSE-NEXT:    retq
;
; GFNIAVX1-LABEL: test_bitreverse_v32i8:
; GFNIAVX1:       # %bb.0:
; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm2, %xmm1, %xmm1
; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm2, %xmm0, %xmm0
; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; GFNIAVX1-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v32i8:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512-LABEL: test_bitreverse_v32i8:
; GFNIAVX512:       # %bb.0:
; GFNIAVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX512-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX512-NEXT:    retq
  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
  ret <32 x i8> %b
}

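; v16i16: per-element byte swap followed by the per-byte bit reversal, split
; per 128-bit lane where the target lacks 256-bit shuffles.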
define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrlw $1, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    psrlw $8, %xmm5
; SSE2-NEXT:    psllw $8, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    psrlw $4, %xmm5
; SSE2-NEXT:    pand %xmm2, %xmm5
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm5, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm6, %xmm7
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm7, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm5, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v16i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
; GFNISSE-NEXT:    retq
;
; GFNIAVX1-LABEL: test_bitreverse_v16i16:
; GFNIAVX1:       # %bb.0:
; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; GFNIAVX1-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v16i16:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512-LABEL: test_bitreverse_v16i16:
; GFNIAVX512:       # %bb.0:
; GFNIAVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; GFNIAVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX512-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX512-NEXT:    retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
  ret <16 x i16> %b
}

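; v8i32: same as v16i16 but with a 4-byte-per-element reversal shuffle.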
1087define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
1088; SSE2-LABEL: test_bitreverse_v8i32:
1089; SSE2:       # %bb.0:
1090; SSE2-NEXT:    pxor %xmm2, %xmm2
1091; SSE2-NEXT:    movdqa %xmm0, %xmm3
1092; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1093; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1094; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
1095; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1096; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1097; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1098; SSE2-NEXT:    packuswb %xmm3, %xmm0
1099; SSE2-NEXT:    movdqa %xmm0, %xmm4
1100; SSE2-NEXT:    psrlw $4, %xmm4
1101; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1102; SSE2-NEXT:    pand %xmm3, %xmm4
1103; SSE2-NEXT:    pand %xmm3, %xmm0
1104; SSE2-NEXT:    psllw $4, %xmm0
1105; SSE2-NEXT:    por %xmm4, %xmm0
1106; SSE2-NEXT:    movdqa %xmm0, %xmm5
1107; SSE2-NEXT:    psrlw $2, %xmm5
1108; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1109; SSE2-NEXT:    pand %xmm4, %xmm5
1110; SSE2-NEXT:    pand %xmm4, %xmm0
1111; SSE2-NEXT:    psllw $2, %xmm0
1112; SSE2-NEXT:    por %xmm5, %xmm0
1113; SSE2-NEXT:    movdqa %xmm0, %xmm6
1114; SSE2-NEXT:    psrlw $1, %xmm6
1115; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1116; SSE2-NEXT:    pand %xmm5, %xmm6
1117; SSE2-NEXT:    pand %xmm5, %xmm0
1118; SSE2-NEXT:    paddb %xmm0, %xmm0
1119; SSE2-NEXT:    por %xmm6, %xmm0
1120; SSE2-NEXT:    movdqa %xmm1, %xmm6
1121; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
1122; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1123; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1124; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1125; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1126; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1127; SSE2-NEXT:    packuswb %xmm6, %xmm1
1128; SSE2-NEXT:    movdqa %xmm1, %xmm2
1129; SSE2-NEXT:    psrlw $4, %xmm2
1130; SSE2-NEXT:    pand %xmm3, %xmm2
1131; SSE2-NEXT:    pand %xmm3, %xmm1
1132; SSE2-NEXT:    psllw $4, %xmm1
1133; SSE2-NEXT:    por %xmm2, %xmm1
1134; SSE2-NEXT:    movdqa %xmm1, %xmm2
1135; SSE2-NEXT:    psrlw $2, %xmm2
1136; SSE2-NEXT:    pand %xmm4, %xmm2
1137; SSE2-NEXT:    pand %xmm4, %xmm1
1138; SSE2-NEXT:    psllw $2, %xmm1
1139; SSE2-NEXT:    por %xmm2, %xmm1
1140; SSE2-NEXT:    movdqa %xmm1, %xmm2
1141; SSE2-NEXT:    psrlw $1, %xmm2
1142; SSE2-NEXT:    pand %xmm5, %xmm2
1143; SSE2-NEXT:    pand %xmm5, %xmm1
1144; SSE2-NEXT:    paddb %xmm1, %xmm1
1145; SSE2-NEXT:    por %xmm2, %xmm1
1146; SSE2-NEXT:    retq
1147;
1148; SSSE3-LABEL: test_bitreverse_v8i32:
1149; SSSE3:       # %bb.0:
1150; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1151; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1152; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1153; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1154; SSSE3-NEXT:    pand %xmm5, %xmm2
1155; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1156; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1157; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1158; SSSE3-NEXT:    psrlw $4, %xmm0
1159; SSSE3-NEXT:    pand %xmm5, %xmm0
1160; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1161; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1162; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1163; SSSE3-NEXT:    por %xmm7, %xmm3
1164; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1165; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1166; SSSE3-NEXT:    pand %xmm5, %xmm0
1167; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1168; SSSE3-NEXT:    psrlw $4, %xmm1
1169; SSSE3-NEXT:    pand %xmm5, %xmm1
1170; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1171; SSSE3-NEXT:    por %xmm6, %xmm2
1172; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1173; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1174; SSSE3-NEXT:    retq
1175;
1176; AVX1-LABEL: test_bitreverse_v8i32:
1177; AVX1:       # %bb.0:
1178; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1179; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1180; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1181; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1182; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1183; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1184; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1185; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1186; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1187; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1188; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1189; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1190; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1191; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1192; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1193; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1194; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1195; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1196; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1197; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1198; AVX1-NEXT:    retq
1199;
1200; AVX2-LABEL: test_bitreverse_v8i32:
1201; AVX2:       # %bb.0:
1202; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1203; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1204; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1205; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1206; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1207; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1208; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1209; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1210; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1211; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1212; AVX2-NEXT:    retq
1213;
1214; AVX512-LABEL: test_bitreverse_v8i32:
1215; AVX512:       # %bb.0:
1216; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1217; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1218; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1219; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1220; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1221; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1222; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1223; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1224; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1225; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1226; AVX512-NEXT:    retq
1227;
1228; XOPAVX1-LABEL: test_bitreverse_v8i32:
1229; XOPAVX1:       # %bb.0:
1230; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1231; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1232; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1233; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1234; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1235; XOPAVX1-NEXT:    retq
1236;
1237; XOPAVX2-LABEL: test_bitreverse_v8i32:
1238; XOPAVX2:       # %bb.0:
1239; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1240; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1241; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1242; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1243; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1244; XOPAVX2-NEXT:    retq
1245;
1246; GFNISSE-LABEL: test_bitreverse_v8i32:
1247; GFNISSE:       # %bb.0:
1248; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1249; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
1250; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1251; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
1252; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
1253; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
1254; GFNISSE-NEXT:    retq
1255;
1256; GFNIAVX1-LABEL: test_bitreverse_v8i32:
1257; GFNIAVX1:       # %bb.0:
1258; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1259; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1260; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1261; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1262; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
1263; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1264; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
1265; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1266; GFNIAVX1-NEXT:    retq
1267;
1268; GFNIAVX2-LABEL: test_bitreverse_v8i32:
1269; GFNIAVX2:       # %bb.0:
1270; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1271; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1272; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1273; GFNIAVX2-NEXT:    retq
1274;
1275; GFNIAVX512-LABEL: test_bitreverse_v8i32:
1276; GFNIAVX512:       # %bb.0:
1277; GFNIAVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1278; GFNIAVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1279; GFNIAVX512-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1280; GFNIAVX512-NEXT:    retq
1281  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
1282  ret <8 x i32> %b
1283}
1284
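; Note (summary of the checks below): for the i64-element test each prefix first
; reverses the byte order within every 8-byte lane (pshufb mask
; [7,6,5,4,3,2,1,0,15,...] on SSSE3/AVX/GFNI, a punpck/pshuf/pack sequence on
; SSE2, VPPERM selectors on XOP) and then reuses the same per-byte bit-reversal
; sequence as the narrower tests.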
1285define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
1286; SSE2-LABEL: test_bitreverse_v4i64:
1287; SSE2:       # %bb.0:
1288; SSE2-NEXT:    pxor %xmm2, %xmm2
1289; SSE2-NEXT:    movdqa %xmm0, %xmm3
1290; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1291; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
1292; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1293; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
1294; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1295; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1296; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1297; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1298; SSE2-NEXT:    packuswb %xmm3, %xmm0
1299; SSE2-NEXT:    movdqa %xmm0, %xmm4
1300; SSE2-NEXT:    psrlw $4, %xmm4
1301; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1302; SSE2-NEXT:    pand %xmm3, %xmm4
1303; SSE2-NEXT:    pand %xmm3, %xmm0
1304; SSE2-NEXT:    psllw $4, %xmm0
1305; SSE2-NEXT:    por %xmm4, %xmm0
1306; SSE2-NEXT:    movdqa %xmm0, %xmm5
1307; SSE2-NEXT:    psrlw $2, %xmm5
1308; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1309; SSE2-NEXT:    pand %xmm4, %xmm5
1310; SSE2-NEXT:    pand %xmm4, %xmm0
1311; SSE2-NEXT:    psllw $2, %xmm0
1312; SSE2-NEXT:    por %xmm5, %xmm0
1313; SSE2-NEXT:    movdqa %xmm0, %xmm6
1314; SSE2-NEXT:    psrlw $1, %xmm6
1315; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1316; SSE2-NEXT:    pand %xmm5, %xmm6
1317; SSE2-NEXT:    pand %xmm5, %xmm0
1318; SSE2-NEXT:    paddb %xmm0, %xmm0
1319; SSE2-NEXT:    por %xmm6, %xmm0
1320; SSE2-NEXT:    movdqa %xmm1, %xmm6
1321; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
1322; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
1323; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1324; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1325; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1326; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1327; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1328; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1329; SSE2-NEXT:    packuswb %xmm6, %xmm1
1330; SSE2-NEXT:    movdqa %xmm1, %xmm2
1331; SSE2-NEXT:    psrlw $4, %xmm2
1332; SSE2-NEXT:    pand %xmm3, %xmm2
1333; SSE2-NEXT:    pand %xmm3, %xmm1
1334; SSE2-NEXT:    psllw $4, %xmm1
1335; SSE2-NEXT:    por %xmm2, %xmm1
1336; SSE2-NEXT:    movdqa %xmm1, %xmm2
1337; SSE2-NEXT:    psrlw $2, %xmm2
1338; SSE2-NEXT:    pand %xmm4, %xmm2
1339; SSE2-NEXT:    pand %xmm4, %xmm1
1340; SSE2-NEXT:    psllw $2, %xmm1
1341; SSE2-NEXT:    por %xmm2, %xmm1
1342; SSE2-NEXT:    movdqa %xmm1, %xmm2
1343; SSE2-NEXT:    psrlw $1, %xmm2
1344; SSE2-NEXT:    pand %xmm5, %xmm2
1345; SSE2-NEXT:    pand %xmm5, %xmm1
1346; SSE2-NEXT:    paddb %xmm1, %xmm1
1347; SSE2-NEXT:    por %xmm2, %xmm1
1348; SSE2-NEXT:    retq
1349;
1350; SSSE3-LABEL: test_bitreverse_v4i64:
1351; SSSE3:       # %bb.0:
1352; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1353; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1354; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1355; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1356; SSSE3-NEXT:    pand %xmm5, %xmm2
1357; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1358; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1359; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1360; SSSE3-NEXT:    psrlw $4, %xmm0
1361; SSSE3-NEXT:    pand %xmm5, %xmm0
1362; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1363; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1364; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1365; SSSE3-NEXT:    por %xmm7, %xmm3
1366; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1367; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1368; SSSE3-NEXT:    pand %xmm5, %xmm0
1369; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1370; SSSE3-NEXT:    psrlw $4, %xmm1
1371; SSSE3-NEXT:    pand %xmm5, %xmm1
1372; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1373; SSSE3-NEXT:    por %xmm6, %xmm2
1374; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1375; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1376; SSSE3-NEXT:    retq
1377;
1378; AVX1-LABEL: test_bitreverse_v4i64:
1379; AVX1:       # %bb.0:
1380; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1381; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1382; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1383; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1384; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1385; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1386; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1387; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1388; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1389; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1390; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1391; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1392; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1393; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1394; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1395; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1396; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1397; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1398; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1399; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1400; AVX1-NEXT:    retq
1401;
1402; AVX2-LABEL: test_bitreverse_v4i64:
1403; AVX2:       # %bb.0:
1404; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1405; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1406; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1407; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1408; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1409; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1410; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1411; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1412; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1413; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1414; AVX2-NEXT:    retq
1415;
1416; AVX512-LABEL: test_bitreverse_v4i64:
1417; AVX512:       # %bb.0:
1418; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1419; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1420; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1421; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1422; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1423; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1424; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1425; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1426; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1427; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1428; AVX512-NEXT:    retq
1429;
1430; XOPAVX1-LABEL: test_bitreverse_v4i64:
1431; XOPAVX1:       # %bb.0:
1432; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1433; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1434; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1435; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1436; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1437; XOPAVX1-NEXT:    retq
1438;
1439; XOPAVX2-LABEL: test_bitreverse_v4i64:
1440; XOPAVX2:       # %bb.0:
1441; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1442; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1443; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1444; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1445; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1446; XOPAVX2-NEXT:    retq
1447;
1448; GFNISSE-LABEL: test_bitreverse_v4i64:
1449; GFNISSE:       # %bb.0:
1450; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1451; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
1452; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1453; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
1454; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
1455; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
1456; GFNISSE-NEXT:    retq
1457;
1458; GFNIAVX1-LABEL: test_bitreverse_v4i64:
1459; GFNIAVX1:       # %bb.0:
1460; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1461; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1462; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1463; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1464; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
1465; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1466; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
1467; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1468; GFNIAVX1-NEXT:    retq
1469;
1470; GFNIAVX2-LABEL: test_bitreverse_v4i64:
1471; GFNIAVX2:       # %bb.0:
1472; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1473; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1474; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1475; GFNIAVX2-NEXT:    retq
1476;
1477; GFNIAVX512-LABEL: test_bitreverse_v4i64:
1478; GFNIAVX512:       # %bb.0:
1479; GFNIAVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1480; GFNIAVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1481; GFNIAVX512-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1482; GFNIAVX512-NEXT:    retq
1483  %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
1484  ret <4 x i64> %b
1485}
1486
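; Note (summary of the checks below): v64i8 needs no byte-order fixup, so the
; GFNI prefixes reduce to one gf2p8affineqb per vector register and AVX512BW
; covers the whole vector with a single 512-bit nibble-LUT pshufb sequence.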
1487define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
1488; SSE2-LABEL: test_bitreverse_v64i8:
1489; SSE2:       # %bb.0:
1490; SSE2-NEXT:    movdqa %xmm0, %xmm5
1491; SSE2-NEXT:    psrlw $4, %xmm5
1492; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1493; SSE2-NEXT:    pand %xmm4, %xmm5
1494; SSE2-NEXT:    pand %xmm4, %xmm0
1495; SSE2-NEXT:    psllw $4, %xmm0
1496; SSE2-NEXT:    por %xmm5, %xmm0
1497; SSE2-NEXT:    movdqa %xmm0, %xmm6
1498; SSE2-NEXT:    psrlw $2, %xmm6
1499; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1500; SSE2-NEXT:    pand %xmm5, %xmm6
1501; SSE2-NEXT:    pand %xmm5, %xmm0
1502; SSE2-NEXT:    psllw $2, %xmm0
1503; SSE2-NEXT:    por %xmm6, %xmm0
1504; SSE2-NEXT:    movdqa %xmm0, %xmm7
1505; SSE2-NEXT:    psrlw $1, %xmm7
1506; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1507; SSE2-NEXT:    pand %xmm6, %xmm7
1508; SSE2-NEXT:    pand %xmm6, %xmm0
1509; SSE2-NEXT:    paddb %xmm0, %xmm0
1510; SSE2-NEXT:    por %xmm7, %xmm0
1511; SSE2-NEXT:    movdqa %xmm1, %xmm7
1512; SSE2-NEXT:    psrlw $4, %xmm7
1513; SSE2-NEXT:    pand %xmm4, %xmm7
1514; SSE2-NEXT:    pand %xmm4, %xmm1
1515; SSE2-NEXT:    psllw $4, %xmm1
1516; SSE2-NEXT:    por %xmm7, %xmm1
1517; SSE2-NEXT:    movdqa %xmm1, %xmm7
1518; SSE2-NEXT:    psrlw $2, %xmm7
1519; SSE2-NEXT:    pand %xmm5, %xmm7
1520; SSE2-NEXT:    pand %xmm5, %xmm1
1521; SSE2-NEXT:    psllw $2, %xmm1
1522; SSE2-NEXT:    por %xmm7, %xmm1
1523; SSE2-NEXT:    movdqa %xmm1, %xmm7
1524; SSE2-NEXT:    psrlw $1, %xmm7
1525; SSE2-NEXT:    pand %xmm6, %xmm7
1526; SSE2-NEXT:    pand %xmm6, %xmm1
1527; SSE2-NEXT:    paddb %xmm1, %xmm1
1528; SSE2-NEXT:    por %xmm7, %xmm1
1529; SSE2-NEXT:    movdqa %xmm2, %xmm7
1530; SSE2-NEXT:    psrlw $4, %xmm7
1531; SSE2-NEXT:    pand %xmm4, %xmm7
1532; SSE2-NEXT:    pand %xmm4, %xmm2
1533; SSE2-NEXT:    psllw $4, %xmm2
1534; SSE2-NEXT:    por %xmm7, %xmm2
1535; SSE2-NEXT:    movdqa %xmm2, %xmm7
1536; SSE2-NEXT:    psrlw $2, %xmm7
1537; SSE2-NEXT:    pand %xmm5, %xmm7
1538; SSE2-NEXT:    pand %xmm5, %xmm2
1539; SSE2-NEXT:    psllw $2, %xmm2
1540; SSE2-NEXT:    por %xmm7, %xmm2
1541; SSE2-NEXT:    movdqa %xmm2, %xmm7
1542; SSE2-NEXT:    psrlw $1, %xmm7
1543; SSE2-NEXT:    pand %xmm6, %xmm7
1544; SSE2-NEXT:    pand %xmm6, %xmm2
1545; SSE2-NEXT:    paddb %xmm2, %xmm2
1546; SSE2-NEXT:    por %xmm7, %xmm2
1547; SSE2-NEXT:    movdqa %xmm3, %xmm7
1548; SSE2-NEXT:    psrlw $4, %xmm7
1549; SSE2-NEXT:    pand %xmm4, %xmm7
1550; SSE2-NEXT:    pand %xmm4, %xmm3
1551; SSE2-NEXT:    psllw $4, %xmm3
1552; SSE2-NEXT:    por %xmm7, %xmm3
1553; SSE2-NEXT:    movdqa %xmm3, %xmm4
1554; SSE2-NEXT:    psrlw $2, %xmm4
1555; SSE2-NEXT:    pand %xmm5, %xmm4
1556; SSE2-NEXT:    pand %xmm5, %xmm3
1557; SSE2-NEXT:    psllw $2, %xmm3
1558; SSE2-NEXT:    por %xmm4, %xmm3
1559; SSE2-NEXT:    movdqa %xmm3, %xmm4
1560; SSE2-NEXT:    psrlw $1, %xmm4
1561; SSE2-NEXT:    pand %xmm6, %xmm4
1562; SSE2-NEXT:    pand %xmm6, %xmm3
1563; SSE2-NEXT:    paddb %xmm3, %xmm3
1564; SSE2-NEXT:    por %xmm4, %xmm3
1565; SSE2-NEXT:    retq
1566;
1567; SSSE3-LABEL: test_bitreverse_v64i8:
1568; SSSE3:       # %bb.0:
1569; SSSE3-NEXT:    movdqa %xmm0, %xmm5
1570; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1571; SSSE3-NEXT:    pand %xmm8, %xmm0
1572; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1573; SSSE3-NEXT:    movdqa %xmm9, %xmm6
1574; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1575; SSSE3-NEXT:    psrlw $4, %xmm5
1576; SSSE3-NEXT:    pand %xmm8, %xmm5
1577; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1578; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1579; SSSE3-NEXT:    pshufb %xmm5, %xmm0
1580; SSSE3-NEXT:    por %xmm6, %xmm0
1581; SSSE3-NEXT:    movdqa %xmm1, %xmm5
1582; SSSE3-NEXT:    pand %xmm8, %xmm5
1583; SSSE3-NEXT:    movdqa %xmm9, %xmm6
1584; SSSE3-NEXT:    pshufb %xmm5, %xmm6
1585; SSSE3-NEXT:    psrlw $4, %xmm1
1586; SSSE3-NEXT:    pand %xmm8, %xmm1
1587; SSSE3-NEXT:    movdqa %xmm4, %xmm5
1588; SSSE3-NEXT:    pshufb %xmm1, %xmm5
1589; SSSE3-NEXT:    por %xmm6, %xmm5
1590; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1591; SSSE3-NEXT:    pand %xmm8, %xmm1
1592; SSSE3-NEXT:    movdqa %xmm9, %xmm7
1593; SSSE3-NEXT:    pshufb %xmm1, %xmm7
1594; SSSE3-NEXT:    psrlw $4, %xmm2
1595; SSSE3-NEXT:    pand %xmm8, %xmm2
1596; SSSE3-NEXT:    movdqa %xmm4, %xmm6
1597; SSSE3-NEXT:    pshufb %xmm2, %xmm6
1598; SSSE3-NEXT:    por %xmm7, %xmm6
1599; SSSE3-NEXT:    movdqa %xmm3, %xmm1
1600; SSSE3-NEXT:    pand %xmm8, %xmm1
1601; SSSE3-NEXT:    pshufb %xmm1, %xmm9
1602; SSSE3-NEXT:    psrlw $4, %xmm3
1603; SSSE3-NEXT:    pand %xmm8, %xmm3
1604; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1605; SSSE3-NEXT:    por %xmm9, %xmm4
1606; SSSE3-NEXT:    movdqa %xmm5, %xmm1
1607; SSSE3-NEXT:    movdqa %xmm6, %xmm2
1608; SSSE3-NEXT:    movdqa %xmm4, %xmm3
1609; SSSE3-NEXT:    retq
1610;
1611; AVX1-LABEL: test_bitreverse_v64i8:
1612; AVX1:       # %bb.0:
1613; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1614; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1615; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
1616; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1617; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1618; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1619; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1620; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1621; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
1622; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
1623; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm4
1624; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1625; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1626; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1627; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1628; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
1629; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1630; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1631; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
1632; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1633; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1634; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1635; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
1636; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
1637; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1638; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1639; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1640; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1641; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1642; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1643; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1644; AVX1-NEXT:    retq
1645;
1646; AVX2-LABEL: test_bitreverse_v64i8:
1647; AVX2:       # %bb.0:
1648; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1649; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
1650; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1651; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
1652; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1653; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
1654; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1655; AVX2-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
1656; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
1657; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
1658; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
1659; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
1660; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
1661; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
1662; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
1663; AVX2-NEXT:    retq
1664;
1665; AVX512F-LABEL: test_bitreverse_v64i8:
1666; AVX512F:       # %bb.0:
1667; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1668; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1669; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
1670; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1671; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
1672; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm5
1673; AVX512F-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
1674; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
1675; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
1676; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
1677; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1678; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
1679; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
1680; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
1681; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
1682; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1683; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm0
1684; AVX512F-NEXT:    retq
1685;
1686; AVX512BW-LABEL: test_bitreverse_v64i8:
1687; AVX512BW:       # %bb.0:
1688; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1689; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
1690; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1691; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
1692; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
1693; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
1694; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1695; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
1696; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
1697; AVX512BW-NEXT:    retq
1698;
1699; XOPAVX1-LABEL: test_bitreverse_v64i8:
1700; XOPAVX1:       # %bb.0:
1701; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1702; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1703; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1704; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
1705; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1706; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1707; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1708; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
1709; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1710; XOPAVX1-NEXT:    retq
1711;
1712; XOPAVX2-LABEL: test_bitreverse_v64i8:
1713; XOPAVX2:       # %bb.0:
1714; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
1715; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1716; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1717; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
1718; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
1719; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
1720; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1721; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
1722; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
1723; XOPAVX2-NEXT:    retq
1724;
1725; GFNISSE-LABEL: test_bitreverse_v64i8:
1726; GFNISSE:       # %bb.0:
1727; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
1728; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
1729; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
1730; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
1731; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm3
1732; GFNISSE-NEXT:    retq
1733;
1734; GFNIAVX1-LABEL: test_bitreverse_v64i8:
1735; GFNIAVX1:       # %bb.0:
1736; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1737; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1738; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2
1739; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
1740; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1741; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1742; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2
1743; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
1744; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1745; GFNIAVX1-NEXT:    retq
1746;
1747; GFNIAVX2-LABEL: test_bitreverse_v64i8:
1748; GFNIAVX2:       # %bb.0:
1749; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1750; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
1751; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
1752; GFNIAVX2-NEXT:    retq
1753;
1754; GFNIAVX512F-LABEL: test_bitreverse_v64i8:
1755; GFNIAVX512F:       # %bb.0:
1756; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1757; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1758; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
1759; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
1760; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1761; GFNIAVX512F-NEXT:    retq
1762;
1763; GFNIAVX512BW-LABEL: test_bitreverse_v64i8:
1764; GFNIAVX512BW:       # %bb.0:
1765; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
1766; GFNIAVX512BW-NEXT:    retq
1767  %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
1768  ret <64 x i8> %b
1769}
1770
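; Note (summary of the checks below): v32i16 swaps adjacent byte pairs first
; (pshufb mask [1,0,3,2,...], or psrlw/psllw $8 + por on SSE2) before applying
; the per-byte bit reversal.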
1771define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
1772; SSE2-LABEL: test_bitreverse_v32i16:
1773; SSE2:       # %bb.0:
1774; SSE2-NEXT:    movdqa %xmm0, %xmm4
1775; SSE2-NEXT:    psrlw $8, %xmm4
1776; SSE2-NEXT:    psllw $8, %xmm0
1777; SSE2-NEXT:    por %xmm4, %xmm0
1778; SSE2-NEXT:    movdqa %xmm0, %xmm5
1779; SSE2-NEXT:    psrlw $4, %xmm5
1780; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1781; SSE2-NEXT:    pand %xmm4, %xmm5
1782; SSE2-NEXT:    pand %xmm4, %xmm0
1783; SSE2-NEXT:    psllw $4, %xmm0
1784; SSE2-NEXT:    por %xmm5, %xmm0
1785; SSE2-NEXT:    movdqa %xmm0, %xmm6
1786; SSE2-NEXT:    psrlw $2, %xmm6
1787; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1788; SSE2-NEXT:    pand %xmm5, %xmm6
1789; SSE2-NEXT:    pand %xmm5, %xmm0
1790; SSE2-NEXT:    psllw $2, %xmm0
1791; SSE2-NEXT:    por %xmm6, %xmm0
1792; SSE2-NEXT:    movdqa %xmm0, %xmm7
1793; SSE2-NEXT:    psrlw $1, %xmm7
1794; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1795; SSE2-NEXT:    pand %xmm6, %xmm7
1796; SSE2-NEXT:    pand %xmm6, %xmm0
1797; SSE2-NEXT:    paddb %xmm0, %xmm0
1798; SSE2-NEXT:    por %xmm7, %xmm0
1799; SSE2-NEXT:    movdqa %xmm1, %xmm7
1800; SSE2-NEXT:    psrlw $8, %xmm7
1801; SSE2-NEXT:    psllw $8, %xmm1
1802; SSE2-NEXT:    por %xmm7, %xmm1
1803; SSE2-NEXT:    movdqa %xmm1, %xmm7
1804; SSE2-NEXT:    psrlw $4, %xmm7
1805; SSE2-NEXT:    pand %xmm4, %xmm7
1806; SSE2-NEXT:    pand %xmm4, %xmm1
1807; SSE2-NEXT:    psllw $4, %xmm1
1808; SSE2-NEXT:    por %xmm7, %xmm1
1809; SSE2-NEXT:    movdqa %xmm1, %xmm7
1810; SSE2-NEXT:    psrlw $2, %xmm7
1811; SSE2-NEXT:    pand %xmm5, %xmm7
1812; SSE2-NEXT:    pand %xmm5, %xmm1
1813; SSE2-NEXT:    psllw $2, %xmm1
1814; SSE2-NEXT:    por %xmm7, %xmm1
1815; SSE2-NEXT:    movdqa %xmm1, %xmm7
1816; SSE2-NEXT:    psrlw $1, %xmm7
1817; SSE2-NEXT:    pand %xmm6, %xmm7
1818; SSE2-NEXT:    pand %xmm6, %xmm1
1819; SSE2-NEXT:    paddb %xmm1, %xmm1
1820; SSE2-NEXT:    por %xmm7, %xmm1
1821; SSE2-NEXT:    movdqa %xmm2, %xmm7
1822; SSE2-NEXT:    psrlw $8, %xmm7
1823; SSE2-NEXT:    psllw $8, %xmm2
1824; SSE2-NEXT:    por %xmm7, %xmm2
1825; SSE2-NEXT:    movdqa %xmm2, %xmm7
1826; SSE2-NEXT:    psrlw $4, %xmm7
1827; SSE2-NEXT:    pand %xmm4, %xmm7
1828; SSE2-NEXT:    pand %xmm4, %xmm2
1829; SSE2-NEXT:    psllw $4, %xmm2
1830; SSE2-NEXT:    por %xmm7, %xmm2
1831; SSE2-NEXT:    movdqa %xmm2, %xmm7
1832; SSE2-NEXT:    psrlw $2, %xmm7
1833; SSE2-NEXT:    pand %xmm5, %xmm7
1834; SSE2-NEXT:    pand %xmm5, %xmm2
1835; SSE2-NEXT:    psllw $2, %xmm2
1836; SSE2-NEXT:    por %xmm7, %xmm2
1837; SSE2-NEXT:    movdqa %xmm2, %xmm7
1838; SSE2-NEXT:    psrlw $1, %xmm7
1839; SSE2-NEXT:    pand %xmm6, %xmm7
1840; SSE2-NEXT:    pand %xmm6, %xmm2
1841; SSE2-NEXT:    paddb %xmm2, %xmm2
1842; SSE2-NEXT:    por %xmm7, %xmm2
1843; SSE2-NEXT:    movdqa %xmm3, %xmm7
1844; SSE2-NEXT:    psrlw $8, %xmm7
1845; SSE2-NEXT:    psllw $8, %xmm3
1846; SSE2-NEXT:    por %xmm7, %xmm3
1847; SSE2-NEXT:    movdqa %xmm3, %xmm7
1848; SSE2-NEXT:    psrlw $4, %xmm7
1849; SSE2-NEXT:    pand %xmm4, %xmm7
1850; SSE2-NEXT:    pand %xmm4, %xmm3
1851; SSE2-NEXT:    psllw $4, %xmm3
1852; SSE2-NEXT:    por %xmm7, %xmm3
1853; SSE2-NEXT:    movdqa %xmm3, %xmm4
1854; SSE2-NEXT:    psrlw $2, %xmm4
1855; SSE2-NEXT:    pand %xmm5, %xmm4
1856; SSE2-NEXT:    pand %xmm5, %xmm3
1857; SSE2-NEXT:    psllw $2, %xmm3
1858; SSE2-NEXT:    por %xmm4, %xmm3
1859; SSE2-NEXT:    movdqa %xmm3, %xmm4
1860; SSE2-NEXT:    psrlw $1, %xmm4
1861; SSE2-NEXT:    pand %xmm6, %xmm4
1862; SSE2-NEXT:    pand %xmm6, %xmm3
1863; SSE2-NEXT:    paddb %xmm3, %xmm3
1864; SSE2-NEXT:    por %xmm4, %xmm3
1865; SSE2-NEXT:    retq
1866;
1867; SSSE3-LABEL: test_bitreverse_v32i16:
1868; SSSE3:       # %bb.0:
1869; SSSE3-NEXT:    movdqa %xmm1, %xmm5
1870; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1871; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1872; SSSE3-NEXT:    pshufb %xmm8, %xmm1
1873; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1874; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1875; SSSE3-NEXT:    pand %xmm9, %xmm0
1876; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1877; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1878; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1879; SSSE3-NEXT:    psrlw $4, %xmm1
1880; SSSE3-NEXT:    pand %xmm9, %xmm1
1881; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1882; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1883; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1884; SSSE3-NEXT:    por %xmm6, %xmm0
1885; SSSE3-NEXT:    pshufb %xmm8, %xmm5
1886; SSSE3-NEXT:    movdqa %xmm5, %xmm1
1887; SSSE3-NEXT:    pand %xmm9, %xmm1
1888; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1889; SSSE3-NEXT:    pshufb %xmm1, %xmm6
1890; SSSE3-NEXT:    psrlw $4, %xmm5
1891; SSSE3-NEXT:    pand %xmm9, %xmm5
1892; SSSE3-NEXT:    movdqa %xmm4, %xmm1
1893; SSSE3-NEXT:    pshufb %xmm5, %xmm1
1894; SSSE3-NEXT:    por %xmm6, %xmm1
1895; SSSE3-NEXT:    pshufb %xmm8, %xmm2
1896; SSSE3-NEXT:    movdqa %xmm2, %xmm5
1897; SSSE3-NEXT:    pand %xmm9, %xmm5
1898; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1899; SSSE3-NEXT:    pshufb %xmm5, %xmm6
1900; SSSE3-NEXT:    psrlw $4, %xmm2
1901; SSSE3-NEXT:    pand %xmm9, %xmm2
1902; SSSE3-NEXT:    movdqa %xmm4, %xmm5
1903; SSSE3-NEXT:    pshufb %xmm2, %xmm5
1904; SSSE3-NEXT:    por %xmm6, %xmm5
1905; SSSE3-NEXT:    pshufb %xmm8, %xmm3
1906; SSSE3-NEXT:    movdqa %xmm3, %xmm2
1907; SSSE3-NEXT:    pand %xmm9, %xmm2
1908; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1909; SSSE3-NEXT:    psrlw $4, %xmm3
1910; SSSE3-NEXT:    pand %xmm9, %xmm3
1911; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1912; SSSE3-NEXT:    por %xmm7, %xmm4
1913; SSSE3-NEXT:    movdqa %xmm5, %xmm2
1914; SSSE3-NEXT:    movdqa %xmm4, %xmm3
1915; SSSE3-NEXT:    retq
1916;
1917; AVX1-LABEL: test_bitreverse_v32i16:
1918; AVX1:       # %bb.0:
1919; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1920; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1921; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1922; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1923; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
1924; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1925; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1926; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1927; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
1928; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1929; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
1930; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
1931; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1932; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
1933; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1934; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1935; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
1936; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
1937; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
1938; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1939; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1940; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1941; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
1942; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1943; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1944; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
1945; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
1946; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
1947; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1948; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
1949; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
1950; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1951; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
1952; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
1953; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
1954; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1955; AVX1-NEXT:    retq
1956;
1957; AVX2-LABEL: test_bitreverse_v32i16:
1958; AVX2:       # %bb.0:
1959; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1960; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1961; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1962; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
1963; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1964; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
1965; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1966; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
1967; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1968; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
1969; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
1970; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1971; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
1972; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
1973; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
1974; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
1975; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
1976; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
1977; AVX2-NEXT:    retq
1978;
1979; AVX512F-LABEL: test_bitreverse_v32i16:
1980; AVX512F:       # %bb.0:
1981; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1982; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1983; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1984; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1985; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
1986; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1987; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
1988; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1989; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
1990; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
1991; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
1992; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
1993; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
1994; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1995; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
1996; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
1997; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
1998; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
1999; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2000; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
2001; AVX512F-NEXT:    retq
2002;
2003; AVX512BW-LABEL: test_bitreverse_v32i16:
2004; AVX512BW:       # %bb.0:
2005; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
2006; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2007; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2008; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2009; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2010; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2011; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2012; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2013; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2014; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2015; AVX512BW-NEXT:    retq
2016;
2017; XOPAVX1-LABEL: test_bitreverse_v32i16:
2018; XOPAVX1:       # %bb.0:
2019; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2020; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
2021; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2022; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2023; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2024; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2025; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2026; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2027; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2028; XOPAVX1-NEXT:    retq
2029;
2030; XOPAVX2-LABEL: test_bitreverse_v32i16:
2031; XOPAVX2:       # %bb.0:
2032; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2033; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
2034; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2035; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2036; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2037; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2038; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2039; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2040; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2041; XOPAVX2-NEXT:    retq
2042;
2043; GFNISSE-LABEL: test_bitreverse_v32i16:
2044; GFNISSE:       # %bb.0:
2045; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2046; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
2047; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
2048; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
2049; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
2050; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
2051; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
2052; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
2053; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
2054; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
2055; GFNISSE-NEXT:    retq
2056;
2057; GFNIAVX1-LABEL: test_bitreverse_v32i16:
2058; GFNIAVX1:       # %bb.0:
2059; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2060; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2061; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2062; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
2063; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2064; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2065; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
2066; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2067; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2068; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2069; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2070; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2071; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
2072; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2073; GFNIAVX1-NEXT:    retq
2074;
2075; GFNIAVX2-LABEL: test_bitreverse_v32i16:
2076; GFNIAVX2:       # %bb.0:
2077; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2078; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2079; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2080; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2081; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2082; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2083; GFNIAVX2-NEXT:    retq
2084;
2085; GFNIAVX512F-LABEL: test_bitreverse_v32i16:
2086; GFNIAVX512F:       # %bb.0:
2087; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2088; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2089; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2090; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2091; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2092; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2093; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2094; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2095; GFNIAVX512F-NEXT:    retq
2096;
2097; GFNIAVX512BW-LABEL: test_bitreverse_v32i16:
2098; GFNIAVX512BW:       # %bb.0:
2099; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
2100; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2101; GFNIAVX512BW-NEXT:    retq
2102  %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
2103  ret <32 x i16> %b
2104}
2105
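; Note (summary of the checks below): v16i32 reverses the bytes within each
; 4-byte element (pshufb mask [3,2,1,0,7,...]) before the per-byte bit reversal;
; on GFNI targets that reversal is a single gf2p8affineqb whose constant
; 9241421688590303745 (0x8040201008040201) serves as the GF(2) affine matrix
; that reverses the bits of each byte.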
2106define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
2107; SSE2-LABEL: test_bitreverse_v16i32:
2108; SSE2:       # %bb.0:
2109; SSE2-NEXT:    pxor %xmm8, %xmm8
2110; SSE2-NEXT:    movdqa %xmm0, %xmm5
2111; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
2112; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
2113; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
2114; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
2115; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2116; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2117; SSE2-NEXT:    packuswb %xmm5, %xmm0
2118; SSE2-NEXT:    movdqa %xmm0, %xmm6
2119; SSE2-NEXT:    psrlw $4, %xmm6
2120; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2121; SSE2-NEXT:    pand %xmm5, %xmm6
2122; SSE2-NEXT:    pand %xmm5, %xmm0
2123; SSE2-NEXT:    psllw $4, %xmm0
2124; SSE2-NEXT:    por %xmm6, %xmm0
2125; SSE2-NEXT:    movdqa %xmm0, %xmm7
2126; SSE2-NEXT:    psrlw $2, %xmm7
2127; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2128; SSE2-NEXT:    pand %xmm6, %xmm7
2129; SSE2-NEXT:    pand %xmm6, %xmm0
2130; SSE2-NEXT:    psllw $2, %xmm0
2131; SSE2-NEXT:    por %xmm7, %xmm0
2132; SSE2-NEXT:    movdqa %xmm0, %xmm4
2133; SSE2-NEXT:    psrlw $1, %xmm4
2134; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2135; SSE2-NEXT:    pand %xmm7, %xmm4
2136; SSE2-NEXT:    pand %xmm7, %xmm0
2137; SSE2-NEXT:    paddb %xmm0, %xmm0
2138; SSE2-NEXT:    por %xmm4, %xmm0
2139; SSE2-NEXT:    movdqa %xmm1, %xmm4
2140; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
2141; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2142; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2143; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
2144; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2145; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2146; SSE2-NEXT:    packuswb %xmm4, %xmm1
2147; SSE2-NEXT:    movdqa %xmm1, %xmm4
2148; SSE2-NEXT:    psrlw $4, %xmm4
2149; SSE2-NEXT:    pand %xmm5, %xmm4
2150; SSE2-NEXT:    pand %xmm5, %xmm1
2151; SSE2-NEXT:    psllw $4, %xmm1
2152; SSE2-NEXT:    por %xmm4, %xmm1
2153; SSE2-NEXT:    movdqa %xmm1, %xmm4
2154; SSE2-NEXT:    psrlw $2, %xmm4
2155; SSE2-NEXT:    pand %xmm6, %xmm4
2156; SSE2-NEXT:    pand %xmm6, %xmm1
2157; SSE2-NEXT:    psllw $2, %xmm1
2158; SSE2-NEXT:    por %xmm4, %xmm1
2159; SSE2-NEXT:    movdqa %xmm1, %xmm4
2160; SSE2-NEXT:    psrlw $1, %xmm4
2161; SSE2-NEXT:    pand %xmm7, %xmm4
2162; SSE2-NEXT:    pand %xmm7, %xmm1
2163; SSE2-NEXT:    paddb %xmm1, %xmm1
2164; SSE2-NEXT:    por %xmm4, %xmm1
2165; SSE2-NEXT:    movdqa %xmm2, %xmm4
2166; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
2167; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2168; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2169; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
2170; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
2171; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
2172; SSE2-NEXT:    packuswb %xmm4, %xmm2
2173; SSE2-NEXT:    movdqa %xmm2, %xmm4
2174; SSE2-NEXT:    psrlw $4, %xmm4
2175; SSE2-NEXT:    pand %xmm5, %xmm4
2176; SSE2-NEXT:    pand %xmm5, %xmm2
2177; SSE2-NEXT:    psllw $4, %xmm2
2178; SSE2-NEXT:    por %xmm4, %xmm2
2179; SSE2-NEXT:    movdqa %xmm2, %xmm4
2180; SSE2-NEXT:    psrlw $2, %xmm4
2181; SSE2-NEXT:    pand %xmm6, %xmm4
2182; SSE2-NEXT:    pand %xmm6, %xmm2
2183; SSE2-NEXT:    psllw $2, %xmm2
2184; SSE2-NEXT:    por %xmm4, %xmm2
2185; SSE2-NEXT:    movdqa %xmm2, %xmm4
2186; SSE2-NEXT:    psrlw $1, %xmm4
2187; SSE2-NEXT:    pand %xmm7, %xmm4
2188; SSE2-NEXT:    pand %xmm7, %xmm2
2189; SSE2-NEXT:    paddb %xmm2, %xmm2
2190; SSE2-NEXT:    por %xmm4, %xmm2
2191; SSE2-NEXT:    movdqa %xmm3, %xmm4
2192; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
2193; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2194; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2195; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
2196; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
2197; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
2198; SSE2-NEXT:    packuswb %xmm4, %xmm3
2199; SSE2-NEXT:    movdqa %xmm3, %xmm4
2200; SSE2-NEXT:    psrlw $4, %xmm4
2201; SSE2-NEXT:    pand %xmm5, %xmm4
2202; SSE2-NEXT:    pand %xmm5, %xmm3
2203; SSE2-NEXT:    psllw $4, %xmm3
2204; SSE2-NEXT:    por %xmm4, %xmm3
2205; SSE2-NEXT:    movdqa %xmm3, %xmm4
2206; SSE2-NEXT:    psrlw $2, %xmm4
2207; SSE2-NEXT:    pand %xmm6, %xmm4
2208; SSE2-NEXT:    pand %xmm6, %xmm3
2209; SSE2-NEXT:    psllw $2, %xmm3
2210; SSE2-NEXT:    por %xmm4, %xmm3
2211; SSE2-NEXT:    movdqa %xmm3, %xmm4
2212; SSE2-NEXT:    psrlw $1, %xmm4
2213; SSE2-NEXT:    pand %xmm7, %xmm4
2214; SSE2-NEXT:    pand %xmm7, %xmm3
2215; SSE2-NEXT:    paddb %xmm3, %xmm3
2216; SSE2-NEXT:    por %xmm4, %xmm3
2217; SSE2-NEXT:    retq
2218;
2219; SSSE3-LABEL: test_bitreverse_v16i32:
2220; SSSE3:       # %bb.0:
2221; SSSE3-NEXT:    movdqa %xmm1, %xmm5
2222; SSSE3-NEXT:    movdqa %xmm0, %xmm1
2223; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2224; SSSE3-NEXT:    pshufb %xmm8, %xmm1
2225; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2226; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2227; SSSE3-NEXT:    pand %xmm9, %xmm0
2228; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2229; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2230; SSSE3-NEXT:    pshufb %xmm0, %xmm6
2231; SSSE3-NEXT:    psrlw $4, %xmm1
2232; SSSE3-NEXT:    pand %xmm9, %xmm1
2233; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2234; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2235; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2236; SSSE3-NEXT:    por %xmm6, %xmm0
2237; SSSE3-NEXT:    pshufb %xmm8, %xmm5
2238; SSSE3-NEXT:    movdqa %xmm5, %xmm1
2239; SSSE3-NEXT:    pand %xmm9, %xmm1
2240; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2241; SSSE3-NEXT:    pshufb %xmm1, %xmm6
2242; SSSE3-NEXT:    psrlw $4, %xmm5
2243; SSSE3-NEXT:    pand %xmm9, %xmm5
2244; SSSE3-NEXT:    movdqa %xmm4, %xmm1
2245; SSSE3-NEXT:    pshufb %xmm5, %xmm1
2246; SSSE3-NEXT:    por %xmm6, %xmm1
2247; SSSE3-NEXT:    pshufb %xmm8, %xmm2
2248; SSSE3-NEXT:    movdqa %xmm2, %xmm5
2249; SSSE3-NEXT:    pand %xmm9, %xmm5
2250; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2251; SSSE3-NEXT:    pshufb %xmm5, %xmm6
2252; SSSE3-NEXT:    psrlw $4, %xmm2
2253; SSSE3-NEXT:    pand %xmm9, %xmm2
2254; SSSE3-NEXT:    movdqa %xmm4, %xmm5
2255; SSSE3-NEXT:    pshufb %xmm2, %xmm5
2256; SSSE3-NEXT:    por %xmm6, %xmm5
2257; SSSE3-NEXT:    pshufb %xmm8, %xmm3
2258; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2259; SSSE3-NEXT:    pand %xmm9, %xmm2
2260; SSSE3-NEXT:    pshufb %xmm2, %xmm7
2261; SSSE3-NEXT:    psrlw $4, %xmm3
2262; SSSE3-NEXT:    pand %xmm9, %xmm3
2263; SSSE3-NEXT:    pshufb %xmm3, %xmm4
2264; SSSE3-NEXT:    por %xmm7, %xmm4
2265; SSSE3-NEXT:    movdqa %xmm5, %xmm2
2266; SSSE3-NEXT:    movdqa %xmm4, %xmm3
2267; SSSE3-NEXT:    retq
2268;
2269; AVX1-LABEL: test_bitreverse_v16i32:
2270; AVX1:       # %bb.0:
2271; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2272; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2273; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2274; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2275; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2276; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2277; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2278; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2279; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2280; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2281; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2282; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2283; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2284; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
2285; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2286; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
2287; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2288; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
2289; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
2290; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2291; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2292; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2293; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2294; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2295; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2296; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2297; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2298; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2299; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2300; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
2301; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
2302; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
2303; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2304; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
2305; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
2306; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2307; AVX1-NEXT:    retq
2308;
2309; AVX2-LABEL: test_bitreverse_v16i32:
2310; AVX2:       # %bb.0:
2311; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2312; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2313; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2314; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
2315; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2316; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2317; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2318; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
2319; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2320; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
2321; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
2322; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2323; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
2324; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2325; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
2326; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2327; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
2328; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
2329; AVX2-NEXT:    retq
2330;
2331; AVX512F-LABEL: test_bitreverse_v16i32:
2332; AVX512F:       # %bb.0:
2333; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2334; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2335; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2336; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2337; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
2338; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2339; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2340; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2341; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
2342; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2343; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
2344; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
2345; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
2346; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2347; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
2348; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
2349; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
2350; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
2351; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2352; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
2353; AVX512F-NEXT:    retq
2354;
2355; AVX512BW-LABEL: test_bitreverse_v16i32:
2356; AVX512BW:       # %bb.0:
2357; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
2358; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2359; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2360; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2361; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2362; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2363; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2364; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2365; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2366; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2367; AVX512BW-NEXT:    retq
2368;
2369; XOPAVX1-LABEL: test_bitreverse_v16i32:
2370; XOPAVX1:       # %bb.0:
2371; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2372; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
2373; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2374; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2375; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2376; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2377; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2378; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2379; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2380; XOPAVX1-NEXT:    retq
2381;
2382; XOPAVX2-LABEL: test_bitreverse_v16i32:
2383; XOPAVX2:       # %bb.0:
2384; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2385; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
2386; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2387; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2388; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2389; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2390; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2391; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2392; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2393; XOPAVX2-NEXT:    retq
2394;
2395; GFNISSE-LABEL: test_bitreverse_v16i32:
2396; GFNISSE:       # %bb.0:
2397; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2398; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
2399; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
2400; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
2401; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
2402; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
2403; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
2404; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
2405; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
2406; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
2407; GFNISSE-NEXT:    retq
2408;
2409; GFNIAVX1-LABEL: test_bitreverse_v16i32:
2410; GFNIAVX1:       # %bb.0:
2411; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2412; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2413; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2414; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
2415; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2416; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2417; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
2418; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2419; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2420; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2421; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2422; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2423; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
2424; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2425; GFNIAVX1-NEXT:    retq
2426;
2427; GFNIAVX2-LABEL: test_bitreverse_v16i32:
2428; GFNIAVX2:       # %bb.0:
2429; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2430; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2431; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2432; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2433; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2434; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2435; GFNIAVX2-NEXT:    retq
2436;
2437; GFNIAVX512F-LABEL: test_bitreverse_v16i32:
2438; GFNIAVX512F:       # %bb.0:
2439; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2440; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2441; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2442; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2443; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2444; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2445; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2446; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2447; GFNIAVX512F-NEXT:    retq
2448;
2449; GFNIAVX512BW-LABEL: test_bitreverse_v16i32:
2450; GFNIAVX512BW:       # %bb.0:
2451; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
2452; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2453; GFNIAVX512BW-NEXT:    retq
2454  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
2455  ret <16 x i32> %b
2456}
2457
2458define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
2459; SSE2-LABEL: test_bitreverse_v8i64:
2460; SSE2:       # %bb.0:
2461; SSE2-NEXT:    pxor %xmm8, %xmm8
2462; SSE2-NEXT:    movdqa %xmm0, %xmm5
2463; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
2464; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
2465; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
2466; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
2467; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
2468; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2469; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2470; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2471; SSE2-NEXT:    packuswb %xmm5, %xmm0
2472; SSE2-NEXT:    movdqa %xmm0, %xmm6
2473; SSE2-NEXT:    psrlw $4, %xmm6
2474; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2475; SSE2-NEXT:    pand %xmm5, %xmm6
2476; SSE2-NEXT:    pand %xmm5, %xmm0
2477; SSE2-NEXT:    psllw $4, %xmm0
2478; SSE2-NEXT:    por %xmm6, %xmm0
2479; SSE2-NEXT:    movdqa %xmm0, %xmm7
2480; SSE2-NEXT:    psrlw $2, %xmm7
2481; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2482; SSE2-NEXT:    pand %xmm6, %xmm7
2483; SSE2-NEXT:    pand %xmm6, %xmm0
2484; SSE2-NEXT:    psllw $2, %xmm0
2485; SSE2-NEXT:    por %xmm7, %xmm0
2486; SSE2-NEXT:    movdqa %xmm0, %xmm4
2487; SSE2-NEXT:    psrlw $1, %xmm4
2488; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2489; SSE2-NEXT:    pand %xmm7, %xmm4
2490; SSE2-NEXT:    pand %xmm7, %xmm0
2491; SSE2-NEXT:    paddb %xmm0, %xmm0
2492; SSE2-NEXT:    por %xmm4, %xmm0
2493; SSE2-NEXT:    movdqa %xmm1, %xmm4
2494; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
2495; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2496; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2497; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2498; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
2499; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2500; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2501; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2502; SSE2-NEXT:    packuswb %xmm4, %xmm1
2503; SSE2-NEXT:    movdqa %xmm1, %xmm4
2504; SSE2-NEXT:    psrlw $4, %xmm4
2505; SSE2-NEXT:    pand %xmm5, %xmm4
2506; SSE2-NEXT:    pand %xmm5, %xmm1
2507; SSE2-NEXT:    psllw $4, %xmm1
2508; SSE2-NEXT:    por %xmm4, %xmm1
2509; SSE2-NEXT:    movdqa %xmm1, %xmm4
2510; SSE2-NEXT:    psrlw $2, %xmm4
2511; SSE2-NEXT:    pand %xmm6, %xmm4
2512; SSE2-NEXT:    pand %xmm6, %xmm1
2513; SSE2-NEXT:    psllw $2, %xmm1
2514; SSE2-NEXT:    por %xmm4, %xmm1
2515; SSE2-NEXT:    movdqa %xmm1, %xmm4
2516; SSE2-NEXT:    psrlw $1, %xmm4
2517; SSE2-NEXT:    pand %xmm7, %xmm4
2518; SSE2-NEXT:    pand %xmm7, %xmm1
2519; SSE2-NEXT:    paddb %xmm1, %xmm1
2520; SSE2-NEXT:    por %xmm4, %xmm1
2521; SSE2-NEXT:    movdqa %xmm2, %xmm4
2522; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
2523; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2524; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2525; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2526; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
2527; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
2528; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
2529; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
2530; SSE2-NEXT:    packuswb %xmm4, %xmm2
2531; SSE2-NEXT:    movdqa %xmm2, %xmm4
2532; SSE2-NEXT:    psrlw $4, %xmm4
2533; SSE2-NEXT:    pand %xmm5, %xmm4
2534; SSE2-NEXT:    pand %xmm5, %xmm2
2535; SSE2-NEXT:    psllw $4, %xmm2
2536; SSE2-NEXT:    por %xmm4, %xmm2
2537; SSE2-NEXT:    movdqa %xmm2, %xmm4
2538; SSE2-NEXT:    psrlw $2, %xmm4
2539; SSE2-NEXT:    pand %xmm6, %xmm4
2540; SSE2-NEXT:    pand %xmm6, %xmm2
2541; SSE2-NEXT:    psllw $2, %xmm2
2542; SSE2-NEXT:    por %xmm4, %xmm2
2543; SSE2-NEXT:    movdqa %xmm2, %xmm4
2544; SSE2-NEXT:    psrlw $1, %xmm4
2545; SSE2-NEXT:    pand %xmm7, %xmm4
2546; SSE2-NEXT:    pand %xmm7, %xmm2
2547; SSE2-NEXT:    paddb %xmm2, %xmm2
2548; SSE2-NEXT:    por %xmm4, %xmm2
2549; SSE2-NEXT:    movdqa %xmm3, %xmm4
2550; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
2551; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2552; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2553; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2554; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
2555; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
2556; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
2557; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
2558; SSE2-NEXT:    packuswb %xmm4, %xmm3
2559; SSE2-NEXT:    movdqa %xmm3, %xmm4
2560; SSE2-NEXT:    psrlw $4, %xmm4
2561; SSE2-NEXT:    pand %xmm5, %xmm4
2562; SSE2-NEXT:    pand %xmm5, %xmm3
2563; SSE2-NEXT:    psllw $4, %xmm3
2564; SSE2-NEXT:    por %xmm4, %xmm3
2565; SSE2-NEXT:    movdqa %xmm3, %xmm4
2566; SSE2-NEXT:    psrlw $2, %xmm4
2567; SSE2-NEXT:    pand %xmm6, %xmm4
2568; SSE2-NEXT:    pand %xmm6, %xmm3
2569; SSE2-NEXT:    psllw $2, %xmm3
2570; SSE2-NEXT:    por %xmm4, %xmm3
2571; SSE2-NEXT:    movdqa %xmm3, %xmm4
2572; SSE2-NEXT:    psrlw $1, %xmm4
2573; SSE2-NEXT:    pand %xmm7, %xmm4
2574; SSE2-NEXT:    pand %xmm7, %xmm3
2575; SSE2-NEXT:    paddb %xmm3, %xmm3
2576; SSE2-NEXT:    por %xmm4, %xmm3
2577; SSE2-NEXT:    retq
2578;
2579; SSSE3-LABEL: test_bitreverse_v8i64:
2580; SSSE3:       # %bb.0:
2581; SSSE3-NEXT:    movdqa %xmm1, %xmm5
2582; SSSE3-NEXT:    movdqa %xmm0, %xmm1
2583; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2584; SSSE3-NEXT:    pshufb %xmm8, %xmm1
2585; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2586; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2587; SSSE3-NEXT:    pand %xmm9, %xmm0
2588; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2589; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2590; SSSE3-NEXT:    pshufb %xmm0, %xmm6
2591; SSSE3-NEXT:    psrlw $4, %xmm1
2592; SSSE3-NEXT:    pand %xmm9, %xmm1
2593; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2594; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2595; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2596; SSSE3-NEXT:    por %xmm6, %xmm0
2597; SSSE3-NEXT:    pshufb %xmm8, %xmm5
2598; SSSE3-NEXT:    movdqa %xmm5, %xmm1
2599; SSSE3-NEXT:    pand %xmm9, %xmm1
2600; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2601; SSSE3-NEXT:    pshufb %xmm1, %xmm6
2602; SSSE3-NEXT:    psrlw $4, %xmm5
2603; SSSE3-NEXT:    pand %xmm9, %xmm5
2604; SSSE3-NEXT:    movdqa %xmm4, %xmm1
2605; SSSE3-NEXT:    pshufb %xmm5, %xmm1
2606; SSSE3-NEXT:    por %xmm6, %xmm1
2607; SSSE3-NEXT:    pshufb %xmm8, %xmm2
2608; SSSE3-NEXT:    movdqa %xmm2, %xmm5
2609; SSSE3-NEXT:    pand %xmm9, %xmm5
2610; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2611; SSSE3-NEXT:    pshufb %xmm5, %xmm6
2612; SSSE3-NEXT:    psrlw $4, %xmm2
2613; SSSE3-NEXT:    pand %xmm9, %xmm2
2614; SSSE3-NEXT:    movdqa %xmm4, %xmm5
2615; SSSE3-NEXT:    pshufb %xmm2, %xmm5
2616; SSSE3-NEXT:    por %xmm6, %xmm5
2617; SSSE3-NEXT:    pshufb %xmm8, %xmm3
2618; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2619; SSSE3-NEXT:    pand %xmm9, %xmm2
2620; SSSE3-NEXT:    pshufb %xmm2, %xmm7
2621; SSSE3-NEXT:    psrlw $4, %xmm3
2622; SSSE3-NEXT:    pand %xmm9, %xmm3
2623; SSSE3-NEXT:    pshufb %xmm3, %xmm4
2624; SSSE3-NEXT:    por %xmm7, %xmm4
2625; SSSE3-NEXT:    movdqa %xmm5, %xmm2
2626; SSSE3-NEXT:    movdqa %xmm4, %xmm3
2627; SSSE3-NEXT:    retq
2628;
2629; AVX1-LABEL: test_bitreverse_v8i64:
2630; AVX1:       # %bb.0:
2631; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2632; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2633; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2634; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2635; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2636; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2637; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2638; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2639; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2640; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2641; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2642; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2643; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2644; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
2645; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2646; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
2647; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2648; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
2649; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
2650; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2651; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2652; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2653; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2654; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2655; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2656; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2657; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2658; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2659; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2660; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
2661; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
2662; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
2663; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2664; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
2665; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
2666; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2667; AVX1-NEXT:    retq
2668;
2669; AVX2-LABEL: test_bitreverse_v8i64:
2670; AVX2:       # %bb.0:
2671; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2672; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2673; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2674; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
2675; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2676; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2677; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2678; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
2679; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2680; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
2681; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
2682; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2683; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
2684; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2685; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
2686; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2687; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
2688; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
2689; AVX2-NEXT:    retq
2690;
2691; AVX512F-LABEL: test_bitreverse_v8i64:
2692; AVX512F:       # %bb.0:
2693; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2694; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2695; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2696; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2697; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
2698; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2699; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2700; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2701; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
2702; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2703; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
2704; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
2705; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
2706; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2707; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
2708; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
2709; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
2710; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
2711; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2712; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
2713; AVX512F-NEXT:    retq
2714;
2715; AVX512BW-LABEL: test_bitreverse_v8i64:
2716; AVX512BW:       # %bb.0:
2717; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
2718; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2719; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2720; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2721; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2722; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2723; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2724; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2725; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2726; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2727; AVX512BW-NEXT:    retq
2728;
2729; XOPAVX1-LABEL: test_bitreverse_v8i64:
2730; XOPAVX1:       # %bb.0:
2731; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2732; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
2733; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2734; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2735; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2736; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2737; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2738; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2739; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2740; XOPAVX1-NEXT:    retq
2741;
2742; XOPAVX2-LABEL: test_bitreverse_v8i64:
2743; XOPAVX2:       # %bb.0:
2744; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2745; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
2746; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2747; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2748; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2749; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2750; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2751; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2752; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2753; XOPAVX2-NEXT:    retq
2754;
2755; GFNISSE-LABEL: test_bitreverse_v8i64:
2756; GFNISSE:       # %bb.0:
2757; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2758; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
2759; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
2760; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
2761; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
2762; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
2763; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
2764; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
2765; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
2766; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
2767; GFNISSE-NEXT:    retq
2768;
2769; GFNIAVX1-LABEL: test_bitreverse_v8i64:
2770; GFNIAVX1:       # %bb.0:
2771; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2772; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2773; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2774; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
2775; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2776; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2777; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
2778; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2779; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2780; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2781; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
2782; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2783; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
2784; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2785; GFNIAVX1-NEXT:    retq
2786;
2787; GFNIAVX2-LABEL: test_bitreverse_v8i64:
2788; GFNIAVX2:       # %bb.0:
2789; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2790; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2791; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2792; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2793; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2794; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2795; GFNIAVX2-NEXT:    retq
2796;
2797; GFNIAVX512F-LABEL: test_bitreverse_v8i64:
2798; GFNIAVX512F:       # %bb.0:
2799; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2800; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2801; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2802; GFNIAVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2803; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2804; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2805; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2806; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2807; GFNIAVX512F-NEXT:    retq
2808;
2809; GFNIAVX512BW-LABEL: test_bitreverse_v8i64:
2810; GFNIAVX512BW:       # %bb.0:
2811; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
2812; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2813; GFNIAVX512BW-NEXT:    retq
2814  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
2815  ret <8 x i64> %b
2816}
2817
2818;
2819; Constant Folding
2820;
2821
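; Worked example for the scalar case: bitreverse(0xFF00FF00) mirrors the bit
; pattern 11111111 00000000 11111111 00000000 into 00000000 11111111 00000000 11111111,
; i.e. 0x00FF00FF = 16711935, so a single immediate move is all that remains.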
2822define i32 @fold_bitreverse_i32() nounwind {
2823; ALL-LABEL: fold_bitreverse_i32:
2824; ALL:       # %bb.0:
2825; ALL-NEXT:    movl $16711935, %eax # imm = 0xFF00FF
2826; ALL-NEXT:    retq
2827  %b = call i32 @llvm.bitreverse.i32(i32 4278255360)
2828  ret i32 %b
2829}
2830
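; In the vector case each byte folds independently: bitreverse(0x00) = 0x00,
; bitreverse(0xFF) = 0xFF = 255, bitreverse(0x02) = 0x40 = 64, bitreverse(0xFD) = 0xBF = 191,
; and so on, matching the constant pool vector loaded below.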
2831define <16 x i8> @fold_bitreverse_v16i8() nounwind {
2832; SSE-LABEL: fold_bitreverse_v16i8:
2833; SSE:       # %bb.0:
2834; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2835; SSE-NEXT:    retq
2836;
2837; AVX-LABEL: fold_bitreverse_v16i8:
2838; AVX:       # %bb.0:
2839; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2840; AVX-NEXT:    retq
2841;
2842; XOP-LABEL: fold_bitreverse_v16i8:
2843; XOP:       # %bb.0:
2844; XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2845; XOP-NEXT:    retq
2846;
2847; GFNISSE-LABEL: fold_bitreverse_v16i8:
2848; GFNISSE:       # %bb.0:
2849; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2850; GFNISSE-NEXT:    retq
2851;
2852; GFNIAVX-LABEL: fold_bitreverse_v16i8:
2853; GFNIAVX:       # %bb.0:
2854; GFNIAVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2855; GFNIAVX-NEXT:    retq
2856  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
2857  ret <16 x i8> %b
2858}
2859
2860define <16 x i16> @fold_bitreverse_v16i16() nounwind {
2861; SSE-LABEL: fold_bitreverse_v16i16:
2862; SSE:       # %bb.0:
2863; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
2864; SSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
2865; SSE-NEXT:    retq
2866;
2867; AVX-LABEL: fold_bitreverse_v16i16:
2868; AVX:       # %bb.0:
2869; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
2870; AVX-NEXT:    retq
2871;
2872; XOP-LABEL: fold_bitreverse_v16i16:
2873; XOP:       # %bb.0:
2874; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
2875; XOP-NEXT:    retq
2876;
2877; GFNISSE-LABEL: fold_bitreverse_v16i16:
2878; GFNISSE:       # %bb.0:
2879; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
2880; GFNISSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
2881; GFNISSE-NEXT:    retq
2882;
2883; GFNIAVX-LABEL: fold_bitreverse_v16i16:
2884; GFNIAVX:       # %bb.0:
2885; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
2886; GFNIAVX-NEXT:    retq
2887  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
2888  ret <16 x i16> %b
2889}
2890
2891define <16 x i32> @fold_bitreverse_v16i32() nounwind {
2892; SSE-LABEL: fold_bitreverse_v16i32:
2893; SSE:       # %bb.0:
2894; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
2895; SSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
2896; SSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
2897; SSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
2898; SSE-NEXT:    retq
2899;
2900; AVX1-LABEL: fold_bitreverse_v16i32:
2901; AVX1:       # %bb.0:
2902; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
2903; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
2904; AVX1-NEXT:    retq
2905;
2906; AVX2-LABEL: fold_bitreverse_v16i32:
2907; AVX2:       # %bb.0:
2908; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
2909; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
2910; AVX2-NEXT:    retq
2911;
2912; AVX512-LABEL: fold_bitreverse_v16i32:
2913; AVX512:       # %bb.0:
2914; AVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
2915; AVX512-NEXT:    retq
2916;
2917; XOP-LABEL: fold_bitreverse_v16i32:
2918; XOP:       # %bb.0:
2919; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
2920; XOP-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
2921; XOP-NEXT:    retq
2922;
2923; GFNISSE-LABEL: fold_bitreverse_v16i32:
2924; GFNISSE:       # %bb.0:
2925; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
2926; GFNISSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
2927; GFNISSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
2928; GFNISSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
2929; GFNISSE-NEXT:    retq
2930;
2931; GFNIAVX1-LABEL: fold_bitreverse_v16i32:
2932; GFNIAVX1:       # %bb.0:
2933; GFNIAVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
2934; GFNIAVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
2935; GFNIAVX1-NEXT:    retq
2936;
2937; GFNIAVX2-LABEL: fold_bitreverse_v16i32:
2938; GFNIAVX2:       # %bb.0:
2939; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
2940; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
2941; GFNIAVX2-NEXT:    retq
2942;
2943; GFNIAVX512-LABEL: fold_bitreverse_v16i32:
2944; GFNIAVX512:       # %bb.0:
2945; GFNIAVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
2946; GFNIAVX512-NEXT:    retq
2947  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
2948  ret <16 x i32> %b
2949}
2950
2951declare i8 @llvm.bitreverse.i8(i8) readnone
2952declare i16 @llvm.bitreverse.i16(i16) readnone
2953declare i32 @llvm.bitreverse.i32(i32) readnone
2954declare i64 @llvm.bitreverse.i64(i64) readnone
2955
2956declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
2957declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
2958declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
2959declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone
2960
2961declare <32 x i8>  @llvm.bitreverse.v32i8(<32 x i8>) readnone
2962declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
2963declare <8 x i32>  @llvm.bitreverse.v8i32(<8 x i32>) readnone
2964declare <4 x i64>  @llvm.bitreverse.v4i64(<4 x i64>) readnone
2965
2966declare <64 x i8>  @llvm.bitreverse.v64i8(<64 x i8>) readnone
2967declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
2968declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
2969declare <8 x i64>  @llvm.bitreverse.v8i64(<8 x i64>) readnone
2970