1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 10 11; Make sure we don't crash with avx512bw and xop 12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw 13 14define i8 @test_bitreverse_i8(i8 %a) nounwind { 15; SSE-LABEL: test_bitreverse_i8: 16; SSE: # %bb.0: 17; SSE-NEXT: rolb $4, %dil 18; SSE-NEXT: movl %edi, %eax 19; SSE-NEXT: andb $51, %al 20; SSE-NEXT: shlb $2, %al 21; SSE-NEXT: andb $-52, %dil 22; SSE-NEXT: shrb $2, %dil 23; SSE-NEXT: orb %al, %dil 24; SSE-NEXT: movl %edi, %eax 25; SSE-NEXT: andb $85, %al 26; SSE-NEXT: addb %al, %al 27; SSE-NEXT: andb $-86, %dil 28; SSE-NEXT: shrb %dil 29; SSE-NEXT: orb %al, %dil 30; SSE-NEXT: movl %edi, %eax 31; SSE-NEXT: retq 32; 33; AVX-LABEL: test_bitreverse_i8: 34; AVX: # %bb.0: 35; AVX-NEXT: rolb $4, %dil 36; AVX-NEXT: movl %edi, %eax 37; AVX-NEXT: andb $51, %al 38; AVX-NEXT: shlb $2, %al 39; AVX-NEXT: andb $-52, %dil 40; AVX-NEXT: shrb $2, %dil 41; AVX-NEXT: orb %al, %dil 42; AVX-NEXT: movl %edi, %eax 43; AVX-NEXT: andb $85, %al 44; AVX-NEXT: addb %al, %al 45; AVX-NEXT: andb $-86, %dil 46; AVX-NEXT: shrb %dil 47; AVX-NEXT: orb %al, %dil 48; AVX-NEXT: movl %edi, %eax 49; AVX-NEXT: retq 50; 51; XOP-LABEL: test_bitreverse_i8: 52; XOP: # %bb.0: 53; XOP-NEXT: vmovd %edi, %xmm0 54; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 55; XOP-NEXT: vpextrb $0, %xmm0, %eax 56; XOP-NEXT: # kill: def $al killed $al killed $eax 57; XOP-NEXT: retq 58 %b = call i8 @llvm.bitreverse.i8(i8 %a) 59 ret i8 %b 60} 61 62define i16 @test_bitreverse_i16(i16 %a) nounwind { 63; SSE-LABEL: test_bitreverse_i16: 64; SSE: # %bb.0: 65; SSE-NEXT: # kill: def $edi killed $edi def $rdi 66; SSE-NEXT: rolw $8, %di 67; SSE-NEXT: movl %edi, %eax 68; SSE-NEXT: andl $3855, %eax # imm = 0xF0F 69; SSE-NEXT: shll $4, %eax 70; SSE-NEXT: andl $61680, %edi # imm = 0xF0F0 71; SSE-NEXT: shrl $4, %edi 72; SSE-NEXT: orl %eax, %edi 73; SSE-NEXT: movl %edi, %eax 74; SSE-NEXT: andl $13107, %eax # imm = 0x3333 75; SSE-NEXT: andl $52428, %edi # imm = 0xCCCC 76; SSE-NEXT: shrl $2, %edi 77; SSE-NEXT: leal (%rdi,%rax,4), %eax 78; SSE-NEXT: movl %eax, %ecx 79; SSE-NEXT: andl $21845, %ecx # imm = 0x5555 80; SSE-NEXT: andl $43690, %eax # imm = 0xAAAA 81; SSE-NEXT: shrl %eax 82; SSE-NEXT: leal (%rax,%rcx,2), %eax 83; SSE-NEXT: # kill: def $ax killed $ax killed $eax 84; SSE-NEXT: retq 85; 86; AVX-LABEL: test_bitreverse_i16: 87; AVX: # %bb.0: 88; AVX-NEXT: # kill: def $edi killed $edi def $rdi 89; AVX-NEXT: rolw $8, %di 90; AVX-NEXT: movl %edi, %eax 91; AVX-NEXT: andl $3855, %eax # imm = 0xF0F 92; AVX-NEXT: shll $4, %eax 93; AVX-NEXT: andl $61680, %edi # imm = 0xF0F0 94; AVX-NEXT: shrl $4, %edi 95; AVX-NEXT: orl %eax, %edi 96; AVX-NEXT: movl %edi, %eax 97; AVX-NEXT: andl $13107, %eax # imm = 0x3333 98; AVX-NEXT: andl $52428, %edi # imm = 0xCCCC 99; AVX-NEXT: shrl $2, %edi 100; AVX-NEXT: leal (%rdi,%rax,4), %eax 101; AVX-NEXT: movl %eax, %ecx 102; AVX-NEXT: andl $21845, %ecx # imm = 0x5555 103; AVX-NEXT: andl $43690, %eax # imm = 0xAAAA 104; AVX-NEXT: shrl %eax 105; AVX-NEXT: leal (%rax,%rcx,2), %eax 106; AVX-NEXT: # kill: def $ax killed $ax killed $eax 107; AVX-NEXT: retq 108; 109; XOP-LABEL: test_bitreverse_i16: 110; XOP: # %bb.0: 111; XOP-NEXT: vmovd %edi, %xmm0 112; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 113; XOP-NEXT: vmovd %xmm0, %eax 114; XOP-NEXT: # kill: def $ax killed $ax killed $eax 115; XOP-NEXT: retq 116 %b = call i16 @llvm.bitreverse.i16(i16 %a) 117 ret i16 %b 118} 119 120define i32 @test_bitreverse_i32(i32 %a) nounwind { 121; SSE-LABEL: test_bitreverse_i32: 122; SSE: # %bb.0: 123; SSE-NEXT: # kill: def $edi killed $edi def $rdi 124; SSE-NEXT: bswapl %edi 125; SSE-NEXT: movl %edi, %eax 126; SSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F 127; SSE-NEXT: shll $4, %eax 128; SSE-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 129; SSE-NEXT: shrl $4, %edi 130; SSE-NEXT: orl %eax, %edi 131; SSE-NEXT: movl %edi, %eax 132; SSE-NEXT: andl $858993459, %eax # imm = 0x33333333 133; SSE-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC 134; SSE-NEXT: shrl $2, %edi 135; SSE-NEXT: leal (%rdi,%rax,4), %eax 136; SSE-NEXT: movl %eax, %ecx 137; SSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555 138; SSE-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA 139; SSE-NEXT: shrl %eax 140; SSE-NEXT: leal (%rax,%rcx,2), %eax 141; SSE-NEXT: retq 142; 143; AVX-LABEL: test_bitreverse_i32: 144; AVX: # %bb.0: 145; AVX-NEXT: # kill: def $edi killed $edi def $rdi 146; AVX-NEXT: bswapl %edi 147; AVX-NEXT: movl %edi, %eax 148; AVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F 149; AVX-NEXT: shll $4, %eax 150; AVX-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 151; AVX-NEXT: shrl $4, %edi 152; AVX-NEXT: orl %eax, %edi 153; AVX-NEXT: movl %edi, %eax 154; AVX-NEXT: andl $858993459, %eax # imm = 0x33333333 155; AVX-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC 156; AVX-NEXT: shrl $2, %edi 157; AVX-NEXT: leal (%rdi,%rax,4), %eax 158; AVX-NEXT: movl %eax, %ecx 159; AVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555 160; AVX-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA 161; AVX-NEXT: shrl %eax 162; AVX-NEXT: leal (%rax,%rcx,2), %eax 163; AVX-NEXT: retq 164; 165; XOP-LABEL: test_bitreverse_i32: 166; XOP: # %bb.0: 167; XOP-NEXT: vmovd %edi, %xmm0 168; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 169; XOP-NEXT: vmovd %xmm0, %eax 170; XOP-NEXT: retq 171 %b = call i32 @llvm.bitreverse.i32(i32 %a) 172 ret i32 %b 173} 174 175define i64 @test_bitreverse_i64(i64 %a) nounwind { 176; SSE-LABEL: test_bitreverse_i64: 177; SSE: # %bb.0: 178; SSE-NEXT: bswapq %rdi 179; SSE-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F 180; SSE-NEXT: andq %rdi, %rax 181; SSE-NEXT: shlq $4, %rax 182; SSE-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 183; SSE-NEXT: andq %rdi, %rcx 184; SSE-NEXT: shrq $4, %rcx 185; SSE-NEXT: orq %rax, %rcx 186; SSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 187; SSE-NEXT: andq %rcx, %rax 188; SSE-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC 189; SSE-NEXT: andq %rcx, %rdx 190; SSE-NEXT: shrq $2, %rdx 191; SSE-NEXT: leaq (%rdx,%rax,4), %rax 192; SSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 193; SSE-NEXT: andq %rax, %rcx 194; SSE-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA 195; SSE-NEXT: andq %rax, %rdx 196; SSE-NEXT: shrq %rdx 197; SSE-NEXT: leaq (%rdx,%rcx,2), %rax 198; SSE-NEXT: retq 199; 200; AVX-LABEL: test_bitreverse_i64: 201; AVX: # %bb.0: 202; AVX-NEXT: bswapq %rdi 203; AVX-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F 204; AVX-NEXT: andq %rdi, %rax 205; AVX-NEXT: shlq $4, %rax 206; AVX-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 207; AVX-NEXT: andq %rdi, %rcx 208; AVX-NEXT: shrq $4, %rcx 209; AVX-NEXT: orq %rax, %rcx 210; AVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 211; AVX-NEXT: andq %rcx, %rax 212; AVX-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC 213; AVX-NEXT: andq %rcx, %rdx 214; AVX-NEXT: shrq $2, %rdx 215; AVX-NEXT: leaq (%rdx,%rax,4), %rax 216; AVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 217; AVX-NEXT: andq %rax, %rcx 218; AVX-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA 219; AVX-NEXT: andq %rax, %rdx 220; AVX-NEXT: shrq %rdx 221; AVX-NEXT: leaq (%rdx,%rcx,2), %rax 222; AVX-NEXT: retq 223; 224; XOP-LABEL: test_bitreverse_i64: 225; XOP: # %bb.0: 226; XOP-NEXT: vmovq %rdi, %xmm0 227; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 228; XOP-NEXT: vmovq %xmm0, %rax 229; XOP-NEXT: retq 230 %b = call i64 @llvm.bitreverse.i64(i64 %a) 231 ret i64 %b 232} 233 234define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind { 235; SSE2-LABEL: test_bitreverse_v16i8: 236; SSE2: # %bb.0: 237; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 238; SSE2-NEXT: movdqa %xmm0, %xmm2 239; SSE2-NEXT: pand %xmm1, %xmm2 240; SSE2-NEXT: psllw $4, %xmm2 241; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 242; SSE2-NEXT: pand %xmm3, %xmm2 243; SSE2-NEXT: pand %xmm3, %xmm0 244; SSE2-NEXT: psrlw $4, %xmm0 245; SSE2-NEXT: pand %xmm1, %xmm0 246; SSE2-NEXT: por %xmm2, %xmm0 247; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 248; SSE2-NEXT: pand %xmm0, %xmm1 249; SSE2-NEXT: psllw $2, %xmm1 250; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 251; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 252; SSE2-NEXT: psrlw $2, %xmm0 253; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 254; SSE2-NEXT: por %xmm1, %xmm0 255; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 256; SSE2-NEXT: pand %xmm0, %xmm1 257; SSE2-NEXT: paddb %xmm1, %xmm1 258; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 259; SSE2-NEXT: psrlw $1, %xmm0 260; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 261; SSE2-NEXT: por %xmm1, %xmm0 262; SSE2-NEXT: retq 263; 264; SSSE3-LABEL: test_bitreverse_v16i8: 265; SSSE3: # %bb.0: 266; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 267; SSSE3-NEXT: movdqa %xmm0, %xmm2 268; SSSE3-NEXT: pand %xmm1, %xmm2 269; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 270; SSSE3-NEXT: pshufb %xmm2, %xmm3 271; SSSE3-NEXT: psrlw $4, %xmm0 272; SSSE3-NEXT: pand %xmm1, %xmm0 273; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 274; SSSE3-NEXT: pshufb %xmm0, %xmm1 275; SSSE3-NEXT: por %xmm3, %xmm1 276; SSSE3-NEXT: movdqa %xmm1, %xmm0 277; SSSE3-NEXT: retq 278; 279; AVX-LABEL: test_bitreverse_v16i8: 280; AVX: # %bb.0: 281; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 282; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 283; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 284; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 285; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 286; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 287; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 288; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 289; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 290; AVX-NEXT: retq 291; 292; XOP-LABEL: test_bitreverse_v16i8: 293; XOP: # %bb.0: 294; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 295; XOP-NEXT: retq 296 %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) 297 ret <16 x i8> %b 298} 299 300define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind { 301; SSE2-LABEL: test_bitreverse_v8i16: 302; SSE2: # %bb.0: 303; SSE2-NEXT: pxor %xmm1, %xmm1 304; SSE2-NEXT: movdqa %xmm0, %xmm2 305; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 306; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] 307; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6] 308; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 309; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 310; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] 311; SSE2-NEXT: packuswb %xmm2, %xmm0 312; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 313; SSE2-NEXT: movdqa %xmm0, %xmm2 314; SSE2-NEXT: pand %xmm1, %xmm2 315; SSE2-NEXT: psllw $4, %xmm2 316; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 317; SSE2-NEXT: pand %xmm3, %xmm2 318; SSE2-NEXT: pand %xmm3, %xmm0 319; SSE2-NEXT: psrlw $4, %xmm0 320; SSE2-NEXT: pand %xmm1, %xmm0 321; SSE2-NEXT: por %xmm2, %xmm0 322; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 323; SSE2-NEXT: pand %xmm0, %xmm1 324; SSE2-NEXT: psllw $2, %xmm1 325; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 326; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 327; SSE2-NEXT: psrlw $2, %xmm0 328; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 329; SSE2-NEXT: por %xmm1, %xmm0 330; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 331; SSE2-NEXT: pand %xmm0, %xmm1 332; SSE2-NEXT: paddb %xmm1, %xmm1 333; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 334; SSE2-NEXT: psrlw $1, %xmm0 335; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 336; SSE2-NEXT: por %xmm1, %xmm0 337; SSE2-NEXT: retq 338; 339; SSSE3-LABEL: test_bitreverse_v8i16: 340; SSSE3: # %bb.0: 341; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 342; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 343; SSSE3-NEXT: movdqa %xmm0, %xmm2 344; SSSE3-NEXT: pand %xmm1, %xmm2 345; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 346; SSSE3-NEXT: pshufb %xmm2, %xmm3 347; SSSE3-NEXT: psrlw $4, %xmm0 348; SSSE3-NEXT: pand %xmm1, %xmm0 349; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 350; SSSE3-NEXT: pshufb %xmm0, %xmm1 351; SSSE3-NEXT: por %xmm3, %xmm1 352; SSSE3-NEXT: movdqa %xmm1, %xmm0 353; SSSE3-NEXT: retq 354; 355; AVX-LABEL: test_bitreverse_v8i16: 356; AVX: # %bb.0: 357; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 358; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 359; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 360; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 361; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 362; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 363; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 364; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 365; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 366; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 367; AVX-NEXT: retq 368; 369; XOP-LABEL: test_bitreverse_v8i16: 370; XOP: # %bb.0: 371; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 372; XOP-NEXT: retq 373 %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) 374 ret <8 x i16> %b 375} 376 377define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind { 378; SSE2-LABEL: test_bitreverse_v4i32: 379; SSE2: # %bb.0: 380; SSE2-NEXT: pxor %xmm1, %xmm1 381; SSE2-NEXT: movdqa %xmm0, %xmm2 382; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 383; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 384; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 385; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 386; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 387; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 388; SSE2-NEXT: packuswb %xmm2, %xmm0 389; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 390; SSE2-NEXT: movdqa %xmm0, %xmm2 391; SSE2-NEXT: pand %xmm1, %xmm2 392; SSE2-NEXT: psllw $4, %xmm2 393; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 394; SSE2-NEXT: pand %xmm3, %xmm2 395; SSE2-NEXT: pand %xmm3, %xmm0 396; SSE2-NEXT: psrlw $4, %xmm0 397; SSE2-NEXT: pand %xmm1, %xmm0 398; SSE2-NEXT: por %xmm2, %xmm0 399; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 400; SSE2-NEXT: pand %xmm0, %xmm1 401; SSE2-NEXT: psllw $2, %xmm1 402; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 403; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 404; SSE2-NEXT: psrlw $2, %xmm0 405; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 406; SSE2-NEXT: por %xmm1, %xmm0 407; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 408; SSE2-NEXT: pand %xmm0, %xmm1 409; SSE2-NEXT: paddb %xmm1, %xmm1 410; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 411; SSE2-NEXT: psrlw $1, %xmm0 412; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 413; SSE2-NEXT: por %xmm1, %xmm0 414; SSE2-NEXT: retq 415; 416; SSSE3-LABEL: test_bitreverse_v4i32: 417; SSSE3: # %bb.0: 418; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 419; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 420; SSSE3-NEXT: movdqa %xmm0, %xmm2 421; SSSE3-NEXT: pand %xmm1, %xmm2 422; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 423; SSSE3-NEXT: pshufb %xmm2, %xmm3 424; SSSE3-NEXT: psrlw $4, %xmm0 425; SSSE3-NEXT: pand %xmm1, %xmm0 426; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 427; SSSE3-NEXT: pshufb %xmm0, %xmm1 428; SSSE3-NEXT: por %xmm3, %xmm1 429; SSSE3-NEXT: movdqa %xmm1, %xmm0 430; SSSE3-NEXT: retq 431; 432; AVX-LABEL: test_bitreverse_v4i32: 433; AVX: # %bb.0: 434; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 435; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 436; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 437; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 438; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 439; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 440; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 441; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 442; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 443; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 444; AVX-NEXT: retq 445; 446; XOP-LABEL: test_bitreverse_v4i32: 447; XOP: # %bb.0: 448; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 449; XOP-NEXT: retq 450 %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) 451 ret <4 x i32> %b 452} 453 454define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { 455; SSE2-LABEL: test_bitreverse_v2i64: 456; SSE2: # %bb.0: 457; SSE2-NEXT: pxor %xmm1, %xmm1 458; SSE2-NEXT: movdqa %xmm0, %xmm2 459; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 460; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 461; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 462; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 463; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 464; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 465; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 466; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 467; SSE2-NEXT: packuswb %xmm2, %xmm0 468; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 469; SSE2-NEXT: movdqa %xmm0, %xmm2 470; SSE2-NEXT: pand %xmm1, %xmm2 471; SSE2-NEXT: psllw $4, %xmm2 472; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 473; SSE2-NEXT: pand %xmm3, %xmm2 474; SSE2-NEXT: pand %xmm3, %xmm0 475; SSE2-NEXT: psrlw $4, %xmm0 476; SSE2-NEXT: pand %xmm1, %xmm0 477; SSE2-NEXT: por %xmm2, %xmm0 478; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 479; SSE2-NEXT: pand %xmm0, %xmm1 480; SSE2-NEXT: psllw $2, %xmm1 481; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 482; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 483; SSE2-NEXT: psrlw $2, %xmm0 484; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 485; SSE2-NEXT: por %xmm1, %xmm0 486; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 487; SSE2-NEXT: pand %xmm0, %xmm1 488; SSE2-NEXT: paddb %xmm1, %xmm1 489; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 490; SSE2-NEXT: psrlw $1, %xmm0 491; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 492; SSE2-NEXT: por %xmm1, %xmm0 493; SSE2-NEXT: retq 494; 495; SSSE3-LABEL: test_bitreverse_v2i64: 496; SSSE3: # %bb.0: 497; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 498; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 499; SSSE3-NEXT: movdqa %xmm0, %xmm2 500; SSSE3-NEXT: pand %xmm1, %xmm2 501; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 502; SSSE3-NEXT: pshufb %xmm2, %xmm3 503; SSSE3-NEXT: psrlw $4, %xmm0 504; SSSE3-NEXT: pand %xmm1, %xmm0 505; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 506; SSSE3-NEXT: pshufb %xmm0, %xmm1 507; SSSE3-NEXT: por %xmm3, %xmm1 508; SSSE3-NEXT: movdqa %xmm1, %xmm0 509; SSSE3-NEXT: retq 510; 511; AVX-LABEL: test_bitreverse_v2i64: 512; AVX: # %bb.0: 513; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 514; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 515; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 516; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 517; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 518; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 519; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 520; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 521; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 522; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 523; AVX-NEXT: retq 524; 525; XOP-LABEL: test_bitreverse_v2i64: 526; XOP: # %bb.0: 527; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 528; XOP-NEXT: retq 529 %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) 530 ret <2 x i64> %b 531} 532 533define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind { 534; SSE2-LABEL: test_bitreverse_v32i8: 535; SSE2: # %bb.0: 536; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 537; SSE2-NEXT: movdqa %xmm0, %xmm3 538; SSE2-NEXT: pand %xmm2, %xmm3 539; SSE2-NEXT: psllw $4, %xmm3 540; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 541; SSE2-NEXT: pand %xmm5, %xmm3 542; SSE2-NEXT: pand %xmm5, %xmm0 543; SSE2-NEXT: psrlw $4, %xmm0 544; SSE2-NEXT: pand %xmm2, %xmm0 545; SSE2-NEXT: por %xmm3, %xmm0 546; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 547; SSE2-NEXT: movdqa %xmm0, %xmm4 548; SSE2-NEXT: pand %xmm3, %xmm4 549; SSE2-NEXT: psllw $2, %xmm4 550; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 551; SSE2-NEXT: pand %xmm8, %xmm4 552; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 553; SSE2-NEXT: pand %xmm9, %xmm0 554; SSE2-NEXT: psrlw $2, %xmm0 555; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 556; SSE2-NEXT: pand %xmm10, %xmm0 557; SSE2-NEXT: por %xmm4, %xmm0 558; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 559; SSE2-NEXT: movdqa %xmm0, %xmm7 560; SSE2-NEXT: pand %xmm4, %xmm7 561; SSE2-NEXT: psrlw $1, %xmm7 562; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 563; SSE2-NEXT: pand %xmm11, %xmm7 564; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 565; SSE2-NEXT: pand %xmm6, %xmm0 566; SSE2-NEXT: paddb %xmm0, %xmm0 567; SSE2-NEXT: por %xmm7, %xmm0 568; SSE2-NEXT: movdqa %xmm1, %xmm7 569; SSE2-NEXT: pand %xmm2, %xmm7 570; SSE2-NEXT: psllw $4, %xmm7 571; SSE2-NEXT: pand %xmm5, %xmm7 572; SSE2-NEXT: pand %xmm5, %xmm1 573; SSE2-NEXT: psrlw $4, %xmm1 574; SSE2-NEXT: pand %xmm2, %xmm1 575; SSE2-NEXT: por %xmm7, %xmm1 576; SSE2-NEXT: pand %xmm1, %xmm3 577; SSE2-NEXT: psllw $2, %xmm3 578; SSE2-NEXT: pand %xmm8, %xmm3 579; SSE2-NEXT: pand %xmm9, %xmm1 580; SSE2-NEXT: psrlw $2, %xmm1 581; SSE2-NEXT: pand %xmm10, %xmm1 582; SSE2-NEXT: por %xmm3, %xmm1 583; SSE2-NEXT: pand %xmm1, %xmm4 584; SSE2-NEXT: psrlw $1, %xmm4 585; SSE2-NEXT: pand %xmm11, %xmm4 586; SSE2-NEXT: pand %xmm6, %xmm1 587; SSE2-NEXT: paddb %xmm1, %xmm1 588; SSE2-NEXT: por %xmm4, %xmm1 589; SSE2-NEXT: retq 590; 591; SSSE3-LABEL: test_bitreverse_v32i8: 592; SSSE3: # %bb.0: 593; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 594; SSSE3-NEXT: movdqa %xmm0, %xmm2 595; SSSE3-NEXT: pand %xmm4, %xmm2 596; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 597; SSSE3-NEXT: movdqa %xmm5, %xmm6 598; SSSE3-NEXT: pshufb %xmm2, %xmm6 599; SSSE3-NEXT: psrlw $4, %xmm0 600; SSSE3-NEXT: pand %xmm4, %xmm0 601; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 602; SSSE3-NEXT: movdqa %xmm2, %xmm3 603; SSSE3-NEXT: pshufb %xmm0, %xmm3 604; SSSE3-NEXT: por %xmm6, %xmm3 605; SSSE3-NEXT: movdqa %xmm1, %xmm0 606; SSSE3-NEXT: pand %xmm4, %xmm0 607; SSSE3-NEXT: pshufb %xmm0, %xmm5 608; SSSE3-NEXT: psrlw $4, %xmm1 609; SSSE3-NEXT: pand %xmm4, %xmm1 610; SSSE3-NEXT: pshufb %xmm1, %xmm2 611; SSSE3-NEXT: por %xmm5, %xmm2 612; SSSE3-NEXT: movdqa %xmm3, %xmm0 613; SSSE3-NEXT: movdqa %xmm2, %xmm1 614; SSSE3-NEXT: retq 615; 616; AVX1-LABEL: test_bitreverse_v32i8: 617; AVX1: # %bb.0: 618; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 619; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 620; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 621; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 622; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 623; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 624; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 625; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 626; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 627; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 628; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 629; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 630; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 631; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 632; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 633; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 634; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 635; AVX1-NEXT: retq 636; 637; AVX2-LABEL: test_bitreverse_v32i8: 638; AVX2: # %bb.0: 639; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 640; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 641; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 642; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 643; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 644; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 645; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 646; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 647; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 648; AVX2-NEXT: retq 649; 650; AVX512-LABEL: test_bitreverse_v32i8: 651; AVX512: # %bb.0: 652; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 653; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 654; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 655; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 656; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 657; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 658; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 659; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 660; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 661; AVX512-NEXT: retq 662; 663; XOPAVX1-LABEL: test_bitreverse_v32i8: 664; XOPAVX1: # %bb.0: 665; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 666; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 667; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 668; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 669; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 670; XOPAVX1-NEXT: retq 671; 672; XOPAVX2-LABEL: test_bitreverse_v32i8: 673; XOPAVX2: # %bb.0: 674; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 675; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 676; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 677; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 678; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 679; XOPAVX2-NEXT: retq 680 %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) 681 ret <32 x i8> %b 682} 683 684define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { 685; SSE2-LABEL: test_bitreverse_v16i16: 686; SSE2: # %bb.0: 687; SSE2-NEXT: pxor %xmm4, %xmm4 688; SSE2-NEXT: movdqa %xmm0, %xmm2 689; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 690; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] 691; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6] 692; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 693; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 694; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] 695; SSE2-NEXT: packuswb %xmm2, %xmm0 696; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 697; SSE2-NEXT: movdqa %xmm0, %xmm3 698; SSE2-NEXT: pand %xmm2, %xmm3 699; SSE2-NEXT: psllw $4, %xmm3 700; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 701; SSE2-NEXT: pand %xmm6, %xmm3 702; SSE2-NEXT: pand %xmm6, %xmm0 703; SSE2-NEXT: psrlw $4, %xmm0 704; SSE2-NEXT: pand %xmm2, %xmm0 705; SSE2-NEXT: por %xmm3, %xmm0 706; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 707; SSE2-NEXT: movdqa %xmm0, %xmm5 708; SSE2-NEXT: pand %xmm3, %xmm5 709; SSE2-NEXT: psllw $2, %xmm5 710; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 711; SSE2-NEXT: pand %xmm8, %xmm5 712; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 713; SSE2-NEXT: pand %xmm9, %xmm0 714; SSE2-NEXT: psrlw $2, %xmm0 715; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 716; SSE2-NEXT: pand %xmm10, %xmm0 717; SSE2-NEXT: por %xmm5, %xmm0 718; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 719; SSE2-NEXT: movdqa %xmm0, %xmm7 720; SSE2-NEXT: pand %xmm5, %xmm7 721; SSE2-NEXT: psrlw $1, %xmm7 722; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 723; SSE2-NEXT: pand %xmm11, %xmm7 724; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 725; SSE2-NEXT: pand %xmm12, %xmm0 726; SSE2-NEXT: paddb %xmm0, %xmm0 727; SSE2-NEXT: por %xmm7, %xmm0 728; SSE2-NEXT: movdqa %xmm1, %xmm7 729; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] 730; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7] 731; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,7,6] 732; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 733; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 734; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] 735; SSE2-NEXT: packuswb %xmm7, %xmm1 736; SSE2-NEXT: movdqa %xmm1, %xmm4 737; SSE2-NEXT: pand %xmm2, %xmm4 738; SSE2-NEXT: psllw $4, %xmm4 739; SSE2-NEXT: pand %xmm6, %xmm4 740; SSE2-NEXT: pand %xmm6, %xmm1 741; SSE2-NEXT: psrlw $4, %xmm1 742; SSE2-NEXT: pand %xmm2, %xmm1 743; SSE2-NEXT: por %xmm4, %xmm1 744; SSE2-NEXT: pand %xmm1, %xmm3 745; SSE2-NEXT: psllw $2, %xmm3 746; SSE2-NEXT: pand %xmm8, %xmm3 747; SSE2-NEXT: pand %xmm9, %xmm1 748; SSE2-NEXT: psrlw $2, %xmm1 749; SSE2-NEXT: pand %xmm10, %xmm1 750; SSE2-NEXT: por %xmm3, %xmm1 751; SSE2-NEXT: pand %xmm1, %xmm5 752; SSE2-NEXT: psrlw $1, %xmm5 753; SSE2-NEXT: pand %xmm11, %xmm5 754; SSE2-NEXT: pand %xmm12, %xmm1 755; SSE2-NEXT: paddb %xmm1, %xmm1 756; SSE2-NEXT: por %xmm5, %xmm1 757; SSE2-NEXT: retq 758; 759; SSSE3-LABEL: test_bitreverse_v16i16: 760; SSSE3: # %bb.0: 761; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 762; SSSE3-NEXT: pshufb %xmm4, %xmm0 763; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 764; SSSE3-NEXT: movdqa %xmm0, %xmm2 765; SSSE3-NEXT: pand %xmm5, %xmm2 766; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 767; SSSE3-NEXT: movdqa %xmm6, %xmm7 768; SSSE3-NEXT: pshufb %xmm2, %xmm7 769; SSSE3-NEXT: psrlw $4, %xmm0 770; SSSE3-NEXT: pand %xmm5, %xmm0 771; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 772; SSSE3-NEXT: movdqa %xmm2, %xmm3 773; SSSE3-NEXT: pshufb %xmm0, %xmm3 774; SSSE3-NEXT: por %xmm7, %xmm3 775; SSSE3-NEXT: pshufb %xmm4, %xmm1 776; SSSE3-NEXT: movdqa %xmm1, %xmm0 777; SSSE3-NEXT: pand %xmm5, %xmm0 778; SSSE3-NEXT: pshufb %xmm0, %xmm6 779; SSSE3-NEXT: psrlw $4, %xmm1 780; SSSE3-NEXT: pand %xmm5, %xmm1 781; SSSE3-NEXT: pshufb %xmm1, %xmm2 782; SSSE3-NEXT: por %xmm6, %xmm2 783; SSSE3-NEXT: movdqa %xmm3, %xmm0 784; SSSE3-NEXT: movdqa %xmm2, %xmm1 785; SSSE3-NEXT: retq 786; 787; AVX1-LABEL: test_bitreverse_v16i16: 788; AVX1: # %bb.0: 789; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 790; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 791; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 792; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 793; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 794; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 795; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 796; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 797; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 798; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 799; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 800; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 801; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 802; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 803; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 804; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 805; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 806; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 807; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 808; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 809; AVX1-NEXT: retq 810; 811; AVX2-LABEL: test_bitreverse_v16i16: 812; AVX2: # %bb.0: 813; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 814; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 815; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 816; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 817; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 818; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 819; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 820; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 821; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 822; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 823; AVX2-NEXT: retq 824; 825; AVX512-LABEL: test_bitreverse_v16i16: 826; AVX512: # %bb.0: 827; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 828; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 829; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 830; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 831; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 832; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 833; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 834; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 835; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 836; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 837; AVX512-NEXT: retq 838; 839; XOPAVX1-LABEL: test_bitreverse_v16i16: 840; XOPAVX1: # %bb.0: 841; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 842; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 843; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 844; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 845; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 846; XOPAVX1-NEXT: retq 847; 848; XOPAVX2-LABEL: test_bitreverse_v16i16: 849; XOPAVX2: # %bb.0: 850; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 851; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 852; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 853; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 854; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 855; XOPAVX2-NEXT: retq 856 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) 857 ret <16 x i16> %b 858} 859 860define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { 861; SSE2-LABEL: test_bitreverse_v8i32: 862; SSE2: # %bb.0: 863; SSE2-NEXT: pxor %xmm4, %xmm4 864; SSE2-NEXT: movdqa %xmm0, %xmm2 865; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 866; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 867; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 868; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 869; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 870; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 871; SSE2-NEXT: packuswb %xmm2, %xmm0 872; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 873; SSE2-NEXT: movdqa %xmm0, %xmm3 874; SSE2-NEXT: pand %xmm2, %xmm3 875; SSE2-NEXT: psllw $4, %xmm3 876; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 877; SSE2-NEXT: pand %xmm6, %xmm3 878; SSE2-NEXT: pand %xmm6, %xmm0 879; SSE2-NEXT: psrlw $4, %xmm0 880; SSE2-NEXT: pand %xmm2, %xmm0 881; SSE2-NEXT: por %xmm3, %xmm0 882; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 883; SSE2-NEXT: movdqa %xmm0, %xmm5 884; SSE2-NEXT: pand %xmm3, %xmm5 885; SSE2-NEXT: psllw $2, %xmm5 886; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 887; SSE2-NEXT: pand %xmm8, %xmm5 888; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 889; SSE2-NEXT: pand %xmm9, %xmm0 890; SSE2-NEXT: psrlw $2, %xmm0 891; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 892; SSE2-NEXT: pand %xmm10, %xmm0 893; SSE2-NEXT: por %xmm5, %xmm0 894; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 895; SSE2-NEXT: movdqa %xmm0, %xmm7 896; SSE2-NEXT: pand %xmm5, %xmm7 897; SSE2-NEXT: psrlw $1, %xmm7 898; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 899; SSE2-NEXT: pand %xmm11, %xmm7 900; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 901; SSE2-NEXT: pand %xmm12, %xmm0 902; SSE2-NEXT: paddb %xmm0, %xmm0 903; SSE2-NEXT: por %xmm7, %xmm0 904; SSE2-NEXT: movdqa %xmm1, %xmm7 905; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] 906; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7] 907; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4] 908; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 909; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 910; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 911; SSE2-NEXT: packuswb %xmm7, %xmm1 912; SSE2-NEXT: movdqa %xmm1, %xmm4 913; SSE2-NEXT: pand %xmm2, %xmm4 914; SSE2-NEXT: psllw $4, %xmm4 915; SSE2-NEXT: pand %xmm6, %xmm4 916; SSE2-NEXT: pand %xmm6, %xmm1 917; SSE2-NEXT: psrlw $4, %xmm1 918; SSE2-NEXT: pand %xmm2, %xmm1 919; SSE2-NEXT: por %xmm4, %xmm1 920; SSE2-NEXT: pand %xmm1, %xmm3 921; SSE2-NEXT: psllw $2, %xmm3 922; SSE2-NEXT: pand %xmm8, %xmm3 923; SSE2-NEXT: pand %xmm9, %xmm1 924; SSE2-NEXT: psrlw $2, %xmm1 925; SSE2-NEXT: pand %xmm10, %xmm1 926; SSE2-NEXT: por %xmm3, %xmm1 927; SSE2-NEXT: pand %xmm1, %xmm5 928; SSE2-NEXT: psrlw $1, %xmm5 929; SSE2-NEXT: pand %xmm11, %xmm5 930; SSE2-NEXT: pand %xmm12, %xmm1 931; SSE2-NEXT: paddb %xmm1, %xmm1 932; SSE2-NEXT: por %xmm5, %xmm1 933; SSE2-NEXT: retq 934; 935; SSSE3-LABEL: test_bitreverse_v8i32: 936; SSSE3: # %bb.0: 937; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 938; SSSE3-NEXT: pshufb %xmm4, %xmm0 939; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 940; SSSE3-NEXT: movdqa %xmm0, %xmm2 941; SSSE3-NEXT: pand %xmm5, %xmm2 942; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 943; SSSE3-NEXT: movdqa %xmm6, %xmm7 944; SSSE3-NEXT: pshufb %xmm2, %xmm7 945; SSSE3-NEXT: psrlw $4, %xmm0 946; SSSE3-NEXT: pand %xmm5, %xmm0 947; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 948; SSSE3-NEXT: movdqa %xmm2, %xmm3 949; SSSE3-NEXT: pshufb %xmm0, %xmm3 950; SSSE3-NEXT: por %xmm7, %xmm3 951; SSSE3-NEXT: pshufb %xmm4, %xmm1 952; SSSE3-NEXT: movdqa %xmm1, %xmm0 953; SSSE3-NEXT: pand %xmm5, %xmm0 954; SSSE3-NEXT: pshufb %xmm0, %xmm6 955; SSSE3-NEXT: psrlw $4, %xmm1 956; SSSE3-NEXT: pand %xmm5, %xmm1 957; SSSE3-NEXT: pshufb %xmm1, %xmm2 958; SSSE3-NEXT: por %xmm6, %xmm2 959; SSSE3-NEXT: movdqa %xmm3, %xmm0 960; SSSE3-NEXT: movdqa %xmm2, %xmm1 961; SSSE3-NEXT: retq 962; 963; AVX1-LABEL: test_bitreverse_v8i32: 964; AVX1: # %bb.0: 965; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 966; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 967; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 968; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 969; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 970; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 971; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 972; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 973; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 974; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 975; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 976; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 977; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 978; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 979; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 980; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 981; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 982; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 983; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 984; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 985; AVX1-NEXT: retq 986; 987; AVX2-LABEL: test_bitreverse_v8i32: 988; AVX2: # %bb.0: 989; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 990; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 991; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 992; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 993; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 994; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 995; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 996; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 997; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 998; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 999; AVX2-NEXT: retq 1000; 1001; AVX512-LABEL: test_bitreverse_v8i32: 1002; AVX512: # %bb.0: 1003; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1004; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1005; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1006; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1007; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1008; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1009; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1010; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1011; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1012; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1013; AVX512-NEXT: retq 1014; 1015; XOPAVX1-LABEL: test_bitreverse_v8i32: 1016; XOPAVX1: # %bb.0: 1017; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1018; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1019; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1020; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1021; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1022; XOPAVX1-NEXT: retq 1023; 1024; XOPAVX2-LABEL: test_bitreverse_v8i32: 1025; XOPAVX2: # %bb.0: 1026; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1027; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1028; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1029; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1030; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1031; XOPAVX2-NEXT: retq 1032 %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) 1033 ret <8 x i32> %b 1034} 1035 1036define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { 1037; SSE2-LABEL: test_bitreverse_v4i64: 1038; SSE2: # %bb.0: 1039; SSE2-NEXT: pxor %xmm4, %xmm4 1040; SSE2-NEXT: movdqa %xmm0, %xmm2 1041; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 1042; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 1043; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1044; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1045; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1046; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1047; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1048; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1049; SSE2-NEXT: packuswb %xmm2, %xmm0 1050; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1051; SSE2-NEXT: movdqa %xmm0, %xmm3 1052; SSE2-NEXT: pand %xmm2, %xmm3 1053; SSE2-NEXT: psllw $4, %xmm3 1054; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1055; SSE2-NEXT: pand %xmm6, %xmm3 1056; SSE2-NEXT: pand %xmm6, %xmm0 1057; SSE2-NEXT: psrlw $4, %xmm0 1058; SSE2-NEXT: pand %xmm2, %xmm0 1059; SSE2-NEXT: por %xmm3, %xmm0 1060; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1061; SSE2-NEXT: movdqa %xmm0, %xmm5 1062; SSE2-NEXT: pand %xmm3, %xmm5 1063; SSE2-NEXT: psllw $2, %xmm5 1064; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 1065; SSE2-NEXT: pand %xmm8, %xmm5 1066; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1067; SSE2-NEXT: pand %xmm9, %xmm0 1068; SSE2-NEXT: psrlw $2, %xmm0 1069; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 1070; SSE2-NEXT: pand %xmm10, %xmm0 1071; SSE2-NEXT: por %xmm5, %xmm0 1072; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1073; SSE2-NEXT: movdqa %xmm0, %xmm7 1074; SSE2-NEXT: pand %xmm5, %xmm7 1075; SSE2-NEXT: psrlw $1, %xmm7 1076; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1077; SSE2-NEXT: pand %xmm11, %xmm7 1078; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1079; SSE2-NEXT: pand %xmm12, %xmm0 1080; SSE2-NEXT: paddb %xmm0, %xmm0 1081; SSE2-NEXT: por %xmm7, %xmm0 1082; SSE2-NEXT: movdqa %xmm1, %xmm7 1083; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] 1084; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,0,1] 1085; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7] 1086; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4] 1087; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 1088; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1089; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1090; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1091; SSE2-NEXT: packuswb %xmm7, %xmm1 1092; SSE2-NEXT: movdqa %xmm1, %xmm4 1093; SSE2-NEXT: pand %xmm2, %xmm4 1094; SSE2-NEXT: psllw $4, %xmm4 1095; SSE2-NEXT: pand %xmm6, %xmm4 1096; SSE2-NEXT: pand %xmm6, %xmm1 1097; SSE2-NEXT: psrlw $4, %xmm1 1098; SSE2-NEXT: pand %xmm2, %xmm1 1099; SSE2-NEXT: por %xmm4, %xmm1 1100; SSE2-NEXT: pand %xmm1, %xmm3 1101; SSE2-NEXT: psllw $2, %xmm3 1102; SSE2-NEXT: pand %xmm8, %xmm3 1103; SSE2-NEXT: pand %xmm9, %xmm1 1104; SSE2-NEXT: psrlw $2, %xmm1 1105; SSE2-NEXT: pand %xmm10, %xmm1 1106; SSE2-NEXT: por %xmm3, %xmm1 1107; SSE2-NEXT: pand %xmm1, %xmm5 1108; SSE2-NEXT: psrlw $1, %xmm5 1109; SSE2-NEXT: pand %xmm11, %xmm5 1110; SSE2-NEXT: pand %xmm12, %xmm1 1111; SSE2-NEXT: paddb %xmm1, %xmm1 1112; SSE2-NEXT: por %xmm5, %xmm1 1113; SSE2-NEXT: retq 1114; 1115; SSSE3-LABEL: test_bitreverse_v4i64: 1116; SSSE3: # %bb.0: 1117; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1118; SSSE3-NEXT: pshufb %xmm4, %xmm0 1119; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1120; SSSE3-NEXT: movdqa %xmm0, %xmm2 1121; SSSE3-NEXT: pand %xmm5, %xmm2 1122; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1123; SSSE3-NEXT: movdqa %xmm6, %xmm7 1124; SSSE3-NEXT: pshufb %xmm2, %xmm7 1125; SSSE3-NEXT: psrlw $4, %xmm0 1126; SSSE3-NEXT: pand %xmm5, %xmm0 1127; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1128; SSSE3-NEXT: movdqa %xmm2, %xmm3 1129; SSSE3-NEXT: pshufb %xmm0, %xmm3 1130; SSSE3-NEXT: por %xmm7, %xmm3 1131; SSSE3-NEXT: pshufb %xmm4, %xmm1 1132; SSSE3-NEXT: movdqa %xmm1, %xmm0 1133; SSSE3-NEXT: pand %xmm5, %xmm0 1134; SSSE3-NEXT: pshufb %xmm0, %xmm6 1135; SSSE3-NEXT: psrlw $4, %xmm1 1136; SSSE3-NEXT: pand %xmm5, %xmm1 1137; SSSE3-NEXT: pshufb %xmm1, %xmm2 1138; SSSE3-NEXT: por %xmm6, %xmm2 1139; SSSE3-NEXT: movdqa %xmm3, %xmm0 1140; SSSE3-NEXT: movdqa %xmm2, %xmm1 1141; SSSE3-NEXT: retq 1142; 1143; AVX1-LABEL: test_bitreverse_v4i64: 1144; AVX1: # %bb.0: 1145; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1146; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1147; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1148; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1149; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1150; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1151; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1152; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1153; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1154; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1155; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1156; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1157; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1158; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1159; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1160; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1161; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1162; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1163; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1164; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1165; AVX1-NEXT: retq 1166; 1167; AVX2-LABEL: test_bitreverse_v4i64: 1168; AVX2: # %bb.0: 1169; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1170; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1171; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1172; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1173; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1174; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1175; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1176; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1177; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1178; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1179; AVX2-NEXT: retq 1180; 1181; AVX512-LABEL: test_bitreverse_v4i64: 1182; AVX512: # %bb.0: 1183; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1184; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1185; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1186; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1187; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1188; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1189; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1190; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1191; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1192; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1193; AVX512-NEXT: retq 1194; 1195; XOPAVX1-LABEL: test_bitreverse_v4i64: 1196; XOPAVX1: # %bb.0: 1197; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1198; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1199; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1200; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1201; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1202; XOPAVX1-NEXT: retq 1203; 1204; XOPAVX2-LABEL: test_bitreverse_v4i64: 1205; XOPAVX2: # %bb.0: 1206; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1207; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1208; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1209; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1210; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1211; XOPAVX2-NEXT: retq 1212 %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) 1213 ret <4 x i64> %b 1214} 1215 1216define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { 1217; SSE2-LABEL: test_bitreverse_v64i8: 1218; SSE2: # %bb.0: 1219; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1220; SSE2-NEXT: movdqa %xmm0, %xmm5 1221; SSE2-NEXT: pand %xmm13, %xmm5 1222; SSE2-NEXT: psllw $4, %xmm5 1223; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1224; SSE2-NEXT: pand %xmm7, %xmm5 1225; SSE2-NEXT: pand %xmm7, %xmm0 1226; SSE2-NEXT: psrlw $4, %xmm0 1227; SSE2-NEXT: pand %xmm13, %xmm0 1228; SSE2-NEXT: por %xmm5, %xmm0 1229; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1230; SSE2-NEXT: movdqa %xmm0, %xmm6 1231; SSE2-NEXT: pand %xmm5, %xmm6 1232; SSE2-NEXT: psllw $2, %xmm6 1233; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 1234; SSE2-NEXT: pand %xmm8, %xmm6 1235; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1236; SSE2-NEXT: pand %xmm9, %xmm0 1237; SSE2-NEXT: psrlw $2, %xmm0 1238; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 1239; SSE2-NEXT: pand %xmm10, %xmm0 1240; SSE2-NEXT: por %xmm6, %xmm0 1241; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1242; SSE2-NEXT: movdqa %xmm0, %xmm4 1243; SSE2-NEXT: pand %xmm6, %xmm4 1244; SSE2-NEXT: psrlw $1, %xmm4 1245; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1246; SSE2-NEXT: pand %xmm11, %xmm4 1247; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1248; SSE2-NEXT: pand %xmm12, %xmm0 1249; SSE2-NEXT: paddb %xmm0, %xmm0 1250; SSE2-NEXT: por %xmm4, %xmm0 1251; SSE2-NEXT: movdqa %xmm1, %xmm4 1252; SSE2-NEXT: pand %xmm13, %xmm4 1253; SSE2-NEXT: psllw $4, %xmm4 1254; SSE2-NEXT: pand %xmm7, %xmm4 1255; SSE2-NEXT: pand %xmm7, %xmm1 1256; SSE2-NEXT: psrlw $4, %xmm1 1257; SSE2-NEXT: pand %xmm13, %xmm1 1258; SSE2-NEXT: por %xmm4, %xmm1 1259; SSE2-NEXT: movdqa %xmm1, %xmm4 1260; SSE2-NEXT: pand %xmm5, %xmm4 1261; SSE2-NEXT: psllw $2, %xmm4 1262; SSE2-NEXT: pand %xmm8, %xmm4 1263; SSE2-NEXT: pand %xmm9, %xmm1 1264; SSE2-NEXT: psrlw $2, %xmm1 1265; SSE2-NEXT: pand %xmm10, %xmm1 1266; SSE2-NEXT: por %xmm4, %xmm1 1267; SSE2-NEXT: movdqa %xmm1, %xmm4 1268; SSE2-NEXT: pand %xmm6, %xmm4 1269; SSE2-NEXT: psrlw $1, %xmm4 1270; SSE2-NEXT: pand %xmm11, %xmm4 1271; SSE2-NEXT: pand %xmm12, %xmm1 1272; SSE2-NEXT: paddb %xmm1, %xmm1 1273; SSE2-NEXT: por %xmm4, %xmm1 1274; SSE2-NEXT: movdqa %xmm2, %xmm4 1275; SSE2-NEXT: pand %xmm13, %xmm4 1276; SSE2-NEXT: psllw $4, %xmm4 1277; SSE2-NEXT: pand %xmm7, %xmm4 1278; SSE2-NEXT: pand %xmm7, %xmm2 1279; SSE2-NEXT: psrlw $4, %xmm2 1280; SSE2-NEXT: pand %xmm13, %xmm2 1281; SSE2-NEXT: por %xmm4, %xmm2 1282; SSE2-NEXT: movdqa %xmm2, %xmm4 1283; SSE2-NEXT: pand %xmm5, %xmm4 1284; SSE2-NEXT: psllw $2, %xmm4 1285; SSE2-NEXT: pand %xmm8, %xmm4 1286; SSE2-NEXT: pand %xmm9, %xmm2 1287; SSE2-NEXT: psrlw $2, %xmm2 1288; SSE2-NEXT: pand %xmm10, %xmm2 1289; SSE2-NEXT: por %xmm4, %xmm2 1290; SSE2-NEXT: movdqa %xmm2, %xmm4 1291; SSE2-NEXT: pand %xmm6, %xmm4 1292; SSE2-NEXT: psrlw $1, %xmm4 1293; SSE2-NEXT: pand %xmm11, %xmm4 1294; SSE2-NEXT: pand %xmm12, %xmm2 1295; SSE2-NEXT: paddb %xmm2, %xmm2 1296; SSE2-NEXT: por %xmm4, %xmm2 1297; SSE2-NEXT: movdqa %xmm3, %xmm4 1298; SSE2-NEXT: pand %xmm13, %xmm4 1299; SSE2-NEXT: psllw $4, %xmm4 1300; SSE2-NEXT: pand %xmm7, %xmm4 1301; SSE2-NEXT: pand %xmm7, %xmm3 1302; SSE2-NEXT: psrlw $4, %xmm3 1303; SSE2-NEXT: pand %xmm13, %xmm3 1304; SSE2-NEXT: por %xmm4, %xmm3 1305; SSE2-NEXT: pand %xmm3, %xmm5 1306; SSE2-NEXT: psllw $2, %xmm5 1307; SSE2-NEXT: pand %xmm8, %xmm5 1308; SSE2-NEXT: pand %xmm9, %xmm3 1309; SSE2-NEXT: psrlw $2, %xmm3 1310; SSE2-NEXT: pand %xmm10, %xmm3 1311; SSE2-NEXT: por %xmm5, %xmm3 1312; SSE2-NEXT: pand %xmm3, %xmm6 1313; SSE2-NEXT: psrlw $1, %xmm6 1314; SSE2-NEXT: pand %xmm11, %xmm6 1315; SSE2-NEXT: pand %xmm12, %xmm3 1316; SSE2-NEXT: paddb %xmm3, %xmm3 1317; SSE2-NEXT: por %xmm6, %xmm3 1318; SSE2-NEXT: retq 1319; 1320; SSSE3-LABEL: test_bitreverse_v64i8: 1321; SSSE3: # %bb.0: 1322; SSSE3-NEXT: movdqa %xmm0, %xmm5 1323; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1324; SSSE3-NEXT: pand %xmm8, %xmm0 1325; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1326; SSSE3-NEXT: movdqa %xmm9, %xmm6 1327; SSSE3-NEXT: pshufb %xmm0, %xmm6 1328; SSSE3-NEXT: psrlw $4, %xmm5 1329; SSSE3-NEXT: pand %xmm8, %xmm5 1330; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1331; SSSE3-NEXT: movdqa %xmm4, %xmm0 1332; SSSE3-NEXT: pshufb %xmm5, %xmm0 1333; SSSE3-NEXT: por %xmm6, %xmm0 1334; SSSE3-NEXT: movdqa %xmm1, %xmm5 1335; SSSE3-NEXT: pand %xmm8, %xmm5 1336; SSSE3-NEXT: movdqa %xmm9, %xmm6 1337; SSSE3-NEXT: pshufb %xmm5, %xmm6 1338; SSSE3-NEXT: psrlw $4, %xmm1 1339; SSSE3-NEXT: pand %xmm8, %xmm1 1340; SSSE3-NEXT: movdqa %xmm4, %xmm5 1341; SSSE3-NEXT: pshufb %xmm1, %xmm5 1342; SSSE3-NEXT: por %xmm6, %xmm5 1343; SSSE3-NEXT: movdqa %xmm2, %xmm1 1344; SSSE3-NEXT: pand %xmm8, %xmm1 1345; SSSE3-NEXT: movdqa %xmm9, %xmm7 1346; SSSE3-NEXT: pshufb %xmm1, %xmm7 1347; SSSE3-NEXT: psrlw $4, %xmm2 1348; SSSE3-NEXT: pand %xmm8, %xmm2 1349; SSSE3-NEXT: movdqa %xmm4, %xmm6 1350; SSSE3-NEXT: pshufb %xmm2, %xmm6 1351; SSSE3-NEXT: por %xmm7, %xmm6 1352; SSSE3-NEXT: movdqa %xmm3, %xmm1 1353; SSSE3-NEXT: pand %xmm8, %xmm1 1354; SSSE3-NEXT: pshufb %xmm1, %xmm9 1355; SSSE3-NEXT: psrlw $4, %xmm3 1356; SSSE3-NEXT: pand %xmm8, %xmm3 1357; SSSE3-NEXT: pshufb %xmm3, %xmm4 1358; SSSE3-NEXT: por %xmm9, %xmm4 1359; SSSE3-NEXT: movdqa %xmm5, %xmm1 1360; SSSE3-NEXT: movdqa %xmm6, %xmm2 1361; SSSE3-NEXT: movdqa %xmm4, %xmm3 1362; SSSE3-NEXT: retq 1363; 1364; AVX1-LABEL: test_bitreverse_v64i8: 1365; AVX1: # %bb.0: 1366; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1367; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1368; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 1369; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1370; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1371; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1372; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1373; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1374; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1375; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1376; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm4 1377; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1378; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1379; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1380; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1381; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 1382; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1383; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1384; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 1385; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1386; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1387; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1388; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1389; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1390; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1391; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1392; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1393; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1394; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1395; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1396; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1397; AVX1-NEXT: retq 1398; 1399; AVX2-LABEL: test_bitreverse_v64i8: 1400; AVX2: # %bb.0: 1401; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1402; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 1403; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1404; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1405; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1406; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1407; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1408; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 1409; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 1410; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 1411; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1412; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 1413; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1414; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1 1415; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 1416; AVX2-NEXT: retq 1417; 1418; AVX512F-LABEL: test_bitreverse_v64i8: 1419; AVX512F: # %bb.0: 1420; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1421; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3 1422; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1423; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1424; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 1425; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 1426; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1427; AVX512F-NEXT: vpshufb %ymm0, %ymm5, %ymm0 1428; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 1429; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 1430; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1431; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 1432; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 1433; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1 1434; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 1435; AVX512F-NEXT: retq 1436; 1437; AVX512BW-LABEL: test_bitreverse_v64i8: 1438; AVX512BW: # %bb.0: 1439; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1440; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 1441; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1442; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 1443; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 1444; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 1445; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1446; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 1447; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 1448; AVX512BW-NEXT: retq 1449; 1450; XOPAVX1-LABEL: test_bitreverse_v64i8: 1451; XOPAVX1: # %bb.0: 1452; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1453; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1454; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1455; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1456; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1457; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1458; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1459; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1460; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1461; XOPAVX1-NEXT: retq 1462; 1463; XOPAVX2-LABEL: test_bitreverse_v64i8: 1464; XOPAVX2: # %bb.0: 1465; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1466; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1467; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1468; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1469; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1470; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1471; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1472; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1473; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 1474; XOPAVX2-NEXT: retq 1475 %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) 1476 ret <64 x i8> %b 1477} 1478 1479define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { 1480; SSE2-LABEL: test_bitreverse_v32i16: 1481; SSE2: # %bb.0: 1482; SSE2-NEXT: pxor %xmm14, %xmm14 1483; SSE2-NEXT: movdqa %xmm0, %xmm4 1484; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] 1485; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] 1486; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6] 1487; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] 1488; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 1489; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] 1490; SSE2-NEXT: packuswb %xmm4, %xmm0 1491; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1492; SSE2-NEXT: movdqa %xmm0, %xmm5 1493; SSE2-NEXT: pand %xmm8, %xmm5 1494; SSE2-NEXT: psllw $4, %xmm5 1495; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1496; SSE2-NEXT: pand %xmm4, %xmm5 1497; SSE2-NEXT: pand %xmm4, %xmm0 1498; SSE2-NEXT: psrlw $4, %xmm0 1499; SSE2-NEXT: pand %xmm8, %xmm0 1500; SSE2-NEXT: por %xmm5, %xmm0 1501; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1502; SSE2-NEXT: movdqa %xmm0, %xmm7 1503; SSE2-NEXT: pand %xmm5, %xmm7 1504; SSE2-NEXT: psllw $2, %xmm7 1505; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 1506; SSE2-NEXT: pand %xmm9, %xmm7 1507; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1508; SSE2-NEXT: pand %xmm10, %xmm0 1509; SSE2-NEXT: psrlw $2, %xmm0 1510; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 1511; SSE2-NEXT: pand %xmm11, %xmm0 1512; SSE2-NEXT: por %xmm7, %xmm0 1513; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1514; SSE2-NEXT: movdqa %xmm0, %xmm6 1515; SSE2-NEXT: pand %xmm7, %xmm6 1516; SSE2-NEXT: psrlw $1, %xmm6 1517; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1518; SSE2-NEXT: pand %xmm12, %xmm6 1519; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1520; SSE2-NEXT: pand %xmm13, %xmm0 1521; SSE2-NEXT: paddb %xmm0, %xmm0 1522; SSE2-NEXT: por %xmm6, %xmm0 1523; SSE2-NEXT: movdqa %xmm1, %xmm6 1524; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1525; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] 1526; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6] 1527; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] 1528; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 1529; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] 1530; SSE2-NEXT: packuswb %xmm6, %xmm1 1531; SSE2-NEXT: movdqa %xmm1, %xmm6 1532; SSE2-NEXT: pand %xmm8, %xmm6 1533; SSE2-NEXT: psllw $4, %xmm6 1534; SSE2-NEXT: pand %xmm4, %xmm6 1535; SSE2-NEXT: pand %xmm4, %xmm1 1536; SSE2-NEXT: psrlw $4, %xmm1 1537; SSE2-NEXT: pand %xmm8, %xmm1 1538; SSE2-NEXT: por %xmm6, %xmm1 1539; SSE2-NEXT: movdqa %xmm1, %xmm6 1540; SSE2-NEXT: pand %xmm5, %xmm6 1541; SSE2-NEXT: psllw $2, %xmm6 1542; SSE2-NEXT: pand %xmm9, %xmm6 1543; SSE2-NEXT: pand %xmm10, %xmm1 1544; SSE2-NEXT: psrlw $2, %xmm1 1545; SSE2-NEXT: pand %xmm11, %xmm1 1546; SSE2-NEXT: por %xmm6, %xmm1 1547; SSE2-NEXT: movdqa %xmm1, %xmm6 1548; SSE2-NEXT: pand %xmm7, %xmm6 1549; SSE2-NEXT: psrlw $1, %xmm6 1550; SSE2-NEXT: pand %xmm12, %xmm6 1551; SSE2-NEXT: pand %xmm13, %xmm1 1552; SSE2-NEXT: paddb %xmm1, %xmm1 1553; SSE2-NEXT: por %xmm6, %xmm1 1554; SSE2-NEXT: movdqa %xmm2, %xmm6 1555; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1556; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] 1557; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6] 1558; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] 1559; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] 1560; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6] 1561; SSE2-NEXT: packuswb %xmm6, %xmm2 1562; SSE2-NEXT: movdqa %xmm2, %xmm6 1563; SSE2-NEXT: pand %xmm8, %xmm6 1564; SSE2-NEXT: psllw $4, %xmm6 1565; SSE2-NEXT: pand %xmm4, %xmm6 1566; SSE2-NEXT: pand %xmm4, %xmm2 1567; SSE2-NEXT: psrlw $4, %xmm2 1568; SSE2-NEXT: pand %xmm8, %xmm2 1569; SSE2-NEXT: por %xmm6, %xmm2 1570; SSE2-NEXT: movdqa %xmm2, %xmm6 1571; SSE2-NEXT: pand %xmm5, %xmm6 1572; SSE2-NEXT: psllw $2, %xmm6 1573; SSE2-NEXT: pand %xmm9, %xmm6 1574; SSE2-NEXT: pand %xmm10, %xmm2 1575; SSE2-NEXT: psrlw $2, %xmm2 1576; SSE2-NEXT: pand %xmm11, %xmm2 1577; SSE2-NEXT: por %xmm6, %xmm2 1578; SSE2-NEXT: movdqa %xmm2, %xmm6 1579; SSE2-NEXT: pand %xmm7, %xmm6 1580; SSE2-NEXT: psrlw $1, %xmm6 1581; SSE2-NEXT: pand %xmm12, %xmm6 1582; SSE2-NEXT: pand %xmm13, %xmm2 1583; SSE2-NEXT: paddb %xmm2, %xmm2 1584; SSE2-NEXT: por %xmm6, %xmm2 1585; SSE2-NEXT: movdqa %xmm3, %xmm6 1586; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1587; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] 1588; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6] 1589; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] 1590; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] 1591; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6] 1592; SSE2-NEXT: packuswb %xmm6, %xmm3 1593; SSE2-NEXT: movdqa %xmm3, %xmm6 1594; SSE2-NEXT: pand %xmm8, %xmm6 1595; SSE2-NEXT: psllw $4, %xmm6 1596; SSE2-NEXT: pand %xmm4, %xmm6 1597; SSE2-NEXT: pand %xmm4, %xmm3 1598; SSE2-NEXT: psrlw $4, %xmm3 1599; SSE2-NEXT: pand %xmm8, %xmm3 1600; SSE2-NEXT: por %xmm6, %xmm3 1601; SSE2-NEXT: pand %xmm3, %xmm5 1602; SSE2-NEXT: psllw $2, %xmm5 1603; SSE2-NEXT: pand %xmm9, %xmm5 1604; SSE2-NEXT: pand %xmm10, %xmm3 1605; SSE2-NEXT: psrlw $2, %xmm3 1606; SSE2-NEXT: pand %xmm11, %xmm3 1607; SSE2-NEXT: por %xmm5, %xmm3 1608; SSE2-NEXT: pand %xmm3, %xmm7 1609; SSE2-NEXT: psrlw $1, %xmm7 1610; SSE2-NEXT: pand %xmm12, %xmm7 1611; SSE2-NEXT: pand %xmm13, %xmm3 1612; SSE2-NEXT: paddb %xmm3, %xmm3 1613; SSE2-NEXT: por %xmm7, %xmm3 1614; SSE2-NEXT: retq 1615; 1616; SSSE3-LABEL: test_bitreverse_v32i16: 1617; SSSE3: # %bb.0: 1618; SSSE3-NEXT: movdqa %xmm1, %xmm5 1619; SSSE3-NEXT: movdqa %xmm0, %xmm1 1620; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1621; SSSE3-NEXT: pshufb %xmm8, %xmm1 1622; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1623; SSSE3-NEXT: movdqa %xmm1, %xmm0 1624; SSSE3-NEXT: pand %xmm9, %xmm0 1625; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1626; SSSE3-NEXT: movdqa %xmm7, %xmm6 1627; SSSE3-NEXT: pshufb %xmm0, %xmm6 1628; SSSE3-NEXT: psrlw $4, %xmm1 1629; SSSE3-NEXT: pand %xmm9, %xmm1 1630; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1631; SSSE3-NEXT: movdqa %xmm4, %xmm0 1632; SSSE3-NEXT: pshufb %xmm1, %xmm0 1633; SSSE3-NEXT: por %xmm6, %xmm0 1634; SSSE3-NEXT: pshufb %xmm8, %xmm5 1635; SSSE3-NEXT: movdqa %xmm5, %xmm1 1636; SSSE3-NEXT: pand %xmm9, %xmm1 1637; SSSE3-NEXT: movdqa %xmm7, %xmm6 1638; SSSE3-NEXT: pshufb %xmm1, %xmm6 1639; SSSE3-NEXT: psrlw $4, %xmm5 1640; SSSE3-NEXT: pand %xmm9, %xmm5 1641; SSSE3-NEXT: movdqa %xmm4, %xmm1 1642; SSSE3-NEXT: pshufb %xmm5, %xmm1 1643; SSSE3-NEXT: por %xmm6, %xmm1 1644; SSSE3-NEXT: pshufb %xmm8, %xmm2 1645; SSSE3-NEXT: movdqa %xmm2, %xmm5 1646; SSSE3-NEXT: pand %xmm9, %xmm5 1647; SSSE3-NEXT: movdqa %xmm7, %xmm6 1648; SSSE3-NEXT: pshufb %xmm5, %xmm6 1649; SSSE3-NEXT: psrlw $4, %xmm2 1650; SSSE3-NEXT: pand %xmm9, %xmm2 1651; SSSE3-NEXT: movdqa %xmm4, %xmm5 1652; SSSE3-NEXT: pshufb %xmm2, %xmm5 1653; SSSE3-NEXT: por %xmm6, %xmm5 1654; SSSE3-NEXT: pshufb %xmm8, %xmm3 1655; SSSE3-NEXT: movdqa %xmm3, %xmm2 1656; SSSE3-NEXT: pand %xmm9, %xmm2 1657; SSSE3-NEXT: pshufb %xmm2, %xmm7 1658; SSSE3-NEXT: psrlw $4, %xmm3 1659; SSSE3-NEXT: pand %xmm9, %xmm3 1660; SSSE3-NEXT: pshufb %xmm3, %xmm4 1661; SSSE3-NEXT: por %xmm7, %xmm4 1662; SSSE3-NEXT: movdqa %xmm5, %xmm2 1663; SSSE3-NEXT: movdqa %xmm4, %xmm3 1664; SSSE3-NEXT: retq 1665; 1666; AVX1-LABEL: test_bitreverse_v32i16: 1667; AVX1: # %bb.0: 1668; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1669; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1670; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1671; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1672; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1673; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1674; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1675; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1676; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1677; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1678; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1679; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1680; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1681; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 1682; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1683; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1684; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1685; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 1686; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 1687; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1688; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1689; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1690; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1691; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1692; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1693; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1694; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1695; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1696; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1697; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 1698; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 1699; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1700; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 1701; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 1702; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 1703; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1704; AVX1-NEXT: retq 1705; 1706; AVX2-LABEL: test_bitreverse_v32i16: 1707; AVX2: # %bb.0: 1708; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1709; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1710; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1711; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 1712; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1713; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1714; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1715; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 1716; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1717; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 1718; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 1719; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1720; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 1721; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 1722; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 1723; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 1724; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 1725; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 1726; AVX2-NEXT: retq 1727; 1728; AVX512F-LABEL: test_bitreverse_v32i16: 1729; AVX512F: # %bb.0: 1730; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1731; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1732; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1733; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm4 1734; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1735; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1736; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 1737; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 1738; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1739; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0 1740; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 1741; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1742; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm2 1743; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 1744; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 1745; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 1746; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1 1747; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 1748; AVX512F-NEXT: retq 1749; 1750; AVX512BW-LABEL: test_bitreverse_v32i16: 1751; AVX512BW: # %bb.0: 1752; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 1753; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1754; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 1755; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1756; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 1757; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 1758; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 1759; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1760; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 1761; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 1762; AVX512BW-NEXT: retq 1763; 1764; XOPAVX1-LABEL: test_bitreverse_v32i16: 1765; XOPAVX1: # %bb.0: 1766; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1767; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1768; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1769; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1770; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1771; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1772; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1773; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1774; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1775; XOPAVX1-NEXT: retq 1776; 1777; XOPAVX2-LABEL: test_bitreverse_v32i16: 1778; XOPAVX2: # %bb.0: 1779; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1780; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1781; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1782; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1783; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1784; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1785; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1786; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1787; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 1788; XOPAVX2-NEXT: retq 1789 %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) 1790 ret <32 x i16> %b 1791} 1792 1793define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { 1794; SSE2-LABEL: test_bitreverse_v16i32: 1795; SSE2: # %bb.0: 1796; SSE2-NEXT: pxor %xmm14, %xmm14 1797; SSE2-NEXT: movdqa %xmm0, %xmm4 1798; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] 1799; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 1800; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 1801; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] 1802; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1803; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1804; SSE2-NEXT: packuswb %xmm4, %xmm0 1805; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1806; SSE2-NEXT: movdqa %xmm0, %xmm5 1807; SSE2-NEXT: pand %xmm8, %xmm5 1808; SSE2-NEXT: psllw $4, %xmm5 1809; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1810; SSE2-NEXT: pand %xmm4, %xmm5 1811; SSE2-NEXT: pand %xmm4, %xmm0 1812; SSE2-NEXT: psrlw $4, %xmm0 1813; SSE2-NEXT: pand %xmm8, %xmm0 1814; SSE2-NEXT: por %xmm5, %xmm0 1815; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1816; SSE2-NEXT: movdqa %xmm0, %xmm7 1817; SSE2-NEXT: pand %xmm5, %xmm7 1818; SSE2-NEXT: psllw $2, %xmm7 1819; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 1820; SSE2-NEXT: pand %xmm9, %xmm7 1821; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1822; SSE2-NEXT: pand %xmm10, %xmm0 1823; SSE2-NEXT: psrlw $2, %xmm0 1824; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 1825; SSE2-NEXT: pand %xmm11, %xmm0 1826; SSE2-NEXT: por %xmm7, %xmm0 1827; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1828; SSE2-NEXT: movdqa %xmm0, %xmm6 1829; SSE2-NEXT: pand %xmm7, %xmm6 1830; SSE2-NEXT: psrlw $1, %xmm6 1831; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1832; SSE2-NEXT: pand %xmm12, %xmm6 1833; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1834; SSE2-NEXT: pand %xmm13, %xmm0 1835; SSE2-NEXT: paddb %xmm0, %xmm0 1836; SSE2-NEXT: por %xmm6, %xmm0 1837; SSE2-NEXT: movdqa %xmm1, %xmm6 1838; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1839; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1840; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1841; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] 1842; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1843; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1844; SSE2-NEXT: packuswb %xmm6, %xmm1 1845; SSE2-NEXT: movdqa %xmm1, %xmm6 1846; SSE2-NEXT: pand %xmm8, %xmm6 1847; SSE2-NEXT: psllw $4, %xmm6 1848; SSE2-NEXT: pand %xmm4, %xmm6 1849; SSE2-NEXT: pand %xmm4, %xmm1 1850; SSE2-NEXT: psrlw $4, %xmm1 1851; SSE2-NEXT: pand %xmm8, %xmm1 1852; SSE2-NEXT: por %xmm6, %xmm1 1853; SSE2-NEXT: movdqa %xmm1, %xmm6 1854; SSE2-NEXT: pand %xmm5, %xmm6 1855; SSE2-NEXT: psllw $2, %xmm6 1856; SSE2-NEXT: pand %xmm9, %xmm6 1857; SSE2-NEXT: pand %xmm10, %xmm1 1858; SSE2-NEXT: psrlw $2, %xmm1 1859; SSE2-NEXT: pand %xmm11, %xmm1 1860; SSE2-NEXT: por %xmm6, %xmm1 1861; SSE2-NEXT: movdqa %xmm1, %xmm6 1862; SSE2-NEXT: pand %xmm7, %xmm6 1863; SSE2-NEXT: psrlw $1, %xmm6 1864; SSE2-NEXT: pand %xmm12, %xmm6 1865; SSE2-NEXT: pand %xmm13, %xmm1 1866; SSE2-NEXT: paddb %xmm1, %xmm1 1867; SSE2-NEXT: por %xmm6, %xmm1 1868; SSE2-NEXT: movdqa %xmm2, %xmm6 1869; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1870; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1871; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1872; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] 1873; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1874; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1875; SSE2-NEXT: packuswb %xmm6, %xmm2 1876; SSE2-NEXT: movdqa %xmm2, %xmm6 1877; SSE2-NEXT: pand %xmm8, %xmm6 1878; SSE2-NEXT: psllw $4, %xmm6 1879; SSE2-NEXT: pand %xmm4, %xmm6 1880; SSE2-NEXT: pand %xmm4, %xmm2 1881; SSE2-NEXT: psrlw $4, %xmm2 1882; SSE2-NEXT: pand %xmm8, %xmm2 1883; SSE2-NEXT: por %xmm6, %xmm2 1884; SSE2-NEXT: movdqa %xmm2, %xmm6 1885; SSE2-NEXT: pand %xmm5, %xmm6 1886; SSE2-NEXT: psllw $2, %xmm6 1887; SSE2-NEXT: pand %xmm9, %xmm6 1888; SSE2-NEXT: pand %xmm10, %xmm2 1889; SSE2-NEXT: psrlw $2, %xmm2 1890; SSE2-NEXT: pand %xmm11, %xmm2 1891; SSE2-NEXT: por %xmm6, %xmm2 1892; SSE2-NEXT: movdqa %xmm2, %xmm6 1893; SSE2-NEXT: pand %xmm7, %xmm6 1894; SSE2-NEXT: psrlw $1, %xmm6 1895; SSE2-NEXT: pand %xmm12, %xmm6 1896; SSE2-NEXT: pand %xmm13, %xmm2 1897; SSE2-NEXT: paddb %xmm2, %xmm2 1898; SSE2-NEXT: por %xmm6, %xmm2 1899; SSE2-NEXT: movdqa %xmm3, %xmm6 1900; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1901; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1902; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1903; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] 1904; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 1905; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 1906; SSE2-NEXT: packuswb %xmm6, %xmm3 1907; SSE2-NEXT: movdqa %xmm3, %xmm6 1908; SSE2-NEXT: pand %xmm8, %xmm6 1909; SSE2-NEXT: psllw $4, %xmm6 1910; SSE2-NEXT: pand %xmm4, %xmm6 1911; SSE2-NEXT: pand %xmm4, %xmm3 1912; SSE2-NEXT: psrlw $4, %xmm3 1913; SSE2-NEXT: pand %xmm8, %xmm3 1914; SSE2-NEXT: por %xmm6, %xmm3 1915; SSE2-NEXT: pand %xmm3, %xmm5 1916; SSE2-NEXT: psllw $2, %xmm5 1917; SSE2-NEXT: pand %xmm9, %xmm5 1918; SSE2-NEXT: pand %xmm10, %xmm3 1919; SSE2-NEXT: psrlw $2, %xmm3 1920; SSE2-NEXT: pand %xmm11, %xmm3 1921; SSE2-NEXT: por %xmm5, %xmm3 1922; SSE2-NEXT: pand %xmm3, %xmm7 1923; SSE2-NEXT: psrlw $1, %xmm7 1924; SSE2-NEXT: pand %xmm12, %xmm7 1925; SSE2-NEXT: pand %xmm13, %xmm3 1926; SSE2-NEXT: paddb %xmm3, %xmm3 1927; SSE2-NEXT: por %xmm7, %xmm3 1928; SSE2-NEXT: retq 1929; 1930; SSSE3-LABEL: test_bitreverse_v16i32: 1931; SSSE3: # %bb.0: 1932; SSSE3-NEXT: movdqa %xmm1, %xmm5 1933; SSSE3-NEXT: movdqa %xmm0, %xmm1 1934; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1935; SSSE3-NEXT: pshufb %xmm8, %xmm1 1936; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1937; SSSE3-NEXT: movdqa %xmm1, %xmm0 1938; SSSE3-NEXT: pand %xmm9, %xmm0 1939; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1940; SSSE3-NEXT: movdqa %xmm7, %xmm6 1941; SSSE3-NEXT: pshufb %xmm0, %xmm6 1942; SSSE3-NEXT: psrlw $4, %xmm1 1943; SSSE3-NEXT: pand %xmm9, %xmm1 1944; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1945; SSSE3-NEXT: movdqa %xmm4, %xmm0 1946; SSSE3-NEXT: pshufb %xmm1, %xmm0 1947; SSSE3-NEXT: por %xmm6, %xmm0 1948; SSSE3-NEXT: pshufb %xmm8, %xmm5 1949; SSSE3-NEXT: movdqa %xmm5, %xmm1 1950; SSSE3-NEXT: pand %xmm9, %xmm1 1951; SSSE3-NEXT: movdqa %xmm7, %xmm6 1952; SSSE3-NEXT: pshufb %xmm1, %xmm6 1953; SSSE3-NEXT: psrlw $4, %xmm5 1954; SSSE3-NEXT: pand %xmm9, %xmm5 1955; SSSE3-NEXT: movdqa %xmm4, %xmm1 1956; SSSE3-NEXT: pshufb %xmm5, %xmm1 1957; SSSE3-NEXT: por %xmm6, %xmm1 1958; SSSE3-NEXT: pshufb %xmm8, %xmm2 1959; SSSE3-NEXT: movdqa %xmm2, %xmm5 1960; SSSE3-NEXT: pand %xmm9, %xmm5 1961; SSSE3-NEXT: movdqa %xmm7, %xmm6 1962; SSSE3-NEXT: pshufb %xmm5, %xmm6 1963; SSSE3-NEXT: psrlw $4, %xmm2 1964; SSSE3-NEXT: pand %xmm9, %xmm2 1965; SSSE3-NEXT: movdqa %xmm4, %xmm5 1966; SSSE3-NEXT: pshufb %xmm2, %xmm5 1967; SSSE3-NEXT: por %xmm6, %xmm5 1968; SSSE3-NEXT: pshufb %xmm8, %xmm3 1969; SSSE3-NEXT: movdqa %xmm3, %xmm2 1970; SSSE3-NEXT: pand %xmm9, %xmm2 1971; SSSE3-NEXT: pshufb %xmm2, %xmm7 1972; SSSE3-NEXT: psrlw $4, %xmm3 1973; SSSE3-NEXT: pand %xmm9, %xmm3 1974; SSSE3-NEXT: pshufb %xmm3, %xmm4 1975; SSSE3-NEXT: por %xmm7, %xmm4 1976; SSSE3-NEXT: movdqa %xmm5, %xmm2 1977; SSSE3-NEXT: movdqa %xmm4, %xmm3 1978; SSSE3-NEXT: retq 1979; 1980; AVX1-LABEL: test_bitreverse_v16i32: 1981; AVX1: # %bb.0: 1982; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1983; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1984; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1985; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1986; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1987; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1988; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1989; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1990; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1991; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1992; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1993; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1994; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1995; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 1996; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1997; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1998; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1999; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2000; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2001; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2002; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2003; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2004; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2005; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2006; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2007; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2008; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2009; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2010; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2011; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2012; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2013; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2014; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2015; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2016; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2017; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2018; AVX1-NEXT: retq 2019; 2020; AVX2-LABEL: test_bitreverse_v16i32: 2021; AVX2: # %bb.0: 2022; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2023; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2024; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2025; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2026; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2027; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2028; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2029; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2030; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2031; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2032; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2033; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2034; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2035; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2036; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2037; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2038; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2039; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2040; AVX2-NEXT: retq 2041; 2042; AVX512F-LABEL: test_bitreverse_v16i32: 2043; AVX512F: # %bb.0: 2044; AVX512F-NEXT: vpsrld $24, %zmm0, %zmm1 2045; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm2 2046; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 2047; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 2048; AVX512F-NEXT: vpslld $24, %zmm0, %zmm2 2049; AVX512F-NEXT: vpslld $8, %zmm0, %zmm0 2050; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2051; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2052; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 2053; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 2054; AVX512F-NEXT: vpslld $4, %zmm1, %zmm1 2055; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2056; AVX512F-NEXT: vpsrld $4, %zmm0, %zmm0 2057; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2058; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 2059; AVX512F-NEXT: vpslld $2, %zmm1, %zmm1 2060; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2061; AVX512F-NEXT: vpsrld $2, %zmm0, %zmm0 2062; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2063; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 2064; AVX512F-NEXT: vpslld $1, %zmm1, %zmm1 2065; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2066; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 2067; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2068; AVX512F-NEXT: retq 2069; 2070; AVX512BW-LABEL: test_bitreverse_v16i32: 2071; AVX512BW: # %bb.0: 2072; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 2073; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2074; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2075; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2076; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2077; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2078; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2079; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2080; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2081; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2082; AVX512BW-NEXT: retq 2083; 2084; XOPAVX1-LABEL: test_bitreverse_v16i32: 2085; XOPAVX1: # %bb.0: 2086; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2087; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2088; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2089; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2090; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2091; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2092; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2093; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2094; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2095; XOPAVX1-NEXT: retq 2096; 2097; XOPAVX2-LABEL: test_bitreverse_v16i32: 2098; XOPAVX2: # %bb.0: 2099; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2100; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2101; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2102; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2103; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2104; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2105; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2106; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2107; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2108; XOPAVX2-NEXT: retq 2109 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) 2110 ret <16 x i32> %b 2111} 2112 2113define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { 2114; SSE2-LABEL: test_bitreverse_v8i64: 2115; SSE2: # %bb.0: 2116; SSE2-NEXT: pxor %xmm14, %xmm14 2117; SSE2-NEXT: movdqa %xmm0, %xmm4 2118; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] 2119; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 2120; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2121; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2122; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] 2123; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2124; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 2125; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 2126; SSE2-NEXT: packuswb %xmm4, %xmm0 2127; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2128; SSE2-NEXT: movdqa %xmm0, %xmm5 2129; SSE2-NEXT: pand %xmm8, %xmm5 2130; SSE2-NEXT: psllw $4, %xmm5 2131; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 2132; SSE2-NEXT: pand %xmm4, %xmm5 2133; SSE2-NEXT: pand %xmm4, %xmm0 2134; SSE2-NEXT: psrlw $4, %xmm0 2135; SSE2-NEXT: pand %xmm8, %xmm0 2136; SSE2-NEXT: por %xmm5, %xmm0 2137; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2138; SSE2-NEXT: movdqa %xmm0, %xmm7 2139; SSE2-NEXT: pand %xmm5, %xmm7 2140; SSE2-NEXT: psllw $2, %xmm7 2141; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 2142; SSE2-NEXT: pand %xmm9, %xmm7 2143; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 2144; SSE2-NEXT: pand %xmm10, %xmm0 2145; SSE2-NEXT: psrlw $2, %xmm0 2146; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 2147; SSE2-NEXT: pand %xmm11, %xmm0 2148; SSE2-NEXT: por %xmm7, %xmm0 2149; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 2150; SSE2-NEXT: movdqa %xmm0, %xmm6 2151; SSE2-NEXT: pand %xmm7, %xmm6 2152; SSE2-NEXT: psrlw $1, %xmm6 2153; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 2154; SSE2-NEXT: pand %xmm12, %xmm6 2155; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2156; SSE2-NEXT: pand %xmm13, %xmm0 2157; SSE2-NEXT: paddb %xmm0, %xmm0 2158; SSE2-NEXT: por %xmm6, %xmm0 2159; SSE2-NEXT: movdqa %xmm1, %xmm6 2160; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 2161; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] 2162; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 2163; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 2164; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] 2165; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2166; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2167; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2168; SSE2-NEXT: packuswb %xmm6, %xmm1 2169; SSE2-NEXT: movdqa %xmm1, %xmm6 2170; SSE2-NEXT: pand %xmm8, %xmm6 2171; SSE2-NEXT: psllw $4, %xmm6 2172; SSE2-NEXT: pand %xmm4, %xmm6 2173; SSE2-NEXT: pand %xmm4, %xmm1 2174; SSE2-NEXT: psrlw $4, %xmm1 2175; SSE2-NEXT: pand %xmm8, %xmm1 2176; SSE2-NEXT: por %xmm6, %xmm1 2177; SSE2-NEXT: movdqa %xmm1, %xmm6 2178; SSE2-NEXT: pand %xmm5, %xmm6 2179; SSE2-NEXT: psllw $2, %xmm6 2180; SSE2-NEXT: pand %xmm9, %xmm6 2181; SSE2-NEXT: pand %xmm10, %xmm1 2182; SSE2-NEXT: psrlw $2, %xmm1 2183; SSE2-NEXT: pand %xmm11, %xmm1 2184; SSE2-NEXT: por %xmm6, %xmm1 2185; SSE2-NEXT: movdqa %xmm1, %xmm6 2186; SSE2-NEXT: pand %xmm7, %xmm6 2187; SSE2-NEXT: psrlw $1, %xmm6 2188; SSE2-NEXT: pand %xmm12, %xmm6 2189; SSE2-NEXT: pand %xmm13, %xmm1 2190; SSE2-NEXT: paddb %xmm1, %xmm1 2191; SSE2-NEXT: por %xmm6, %xmm1 2192; SSE2-NEXT: movdqa %xmm2, %xmm6 2193; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 2194; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] 2195; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 2196; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 2197; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] 2198; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 2199; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 2200; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 2201; SSE2-NEXT: packuswb %xmm6, %xmm2 2202; SSE2-NEXT: movdqa %xmm2, %xmm6 2203; SSE2-NEXT: pand %xmm8, %xmm6 2204; SSE2-NEXT: psllw $4, %xmm6 2205; SSE2-NEXT: pand %xmm4, %xmm6 2206; SSE2-NEXT: pand %xmm4, %xmm2 2207; SSE2-NEXT: psrlw $4, %xmm2 2208; SSE2-NEXT: pand %xmm8, %xmm2 2209; SSE2-NEXT: por %xmm6, %xmm2 2210; SSE2-NEXT: movdqa %xmm2, %xmm6 2211; SSE2-NEXT: pand %xmm5, %xmm6 2212; SSE2-NEXT: psllw $2, %xmm6 2213; SSE2-NEXT: pand %xmm9, %xmm6 2214; SSE2-NEXT: pand %xmm10, %xmm2 2215; SSE2-NEXT: psrlw $2, %xmm2 2216; SSE2-NEXT: pand %xmm11, %xmm2 2217; SSE2-NEXT: por %xmm6, %xmm2 2218; SSE2-NEXT: movdqa %xmm2, %xmm6 2219; SSE2-NEXT: pand %xmm7, %xmm6 2220; SSE2-NEXT: psrlw $1, %xmm6 2221; SSE2-NEXT: pand %xmm12, %xmm6 2222; SSE2-NEXT: pand %xmm13, %xmm2 2223; SSE2-NEXT: paddb %xmm2, %xmm2 2224; SSE2-NEXT: por %xmm6, %xmm2 2225; SSE2-NEXT: movdqa %xmm3, %xmm6 2226; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 2227; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] 2228; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 2229; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 2230; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] 2231; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 2232; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 2233; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 2234; SSE2-NEXT: packuswb %xmm6, %xmm3 2235; SSE2-NEXT: movdqa %xmm3, %xmm6 2236; SSE2-NEXT: pand %xmm8, %xmm6 2237; SSE2-NEXT: psllw $4, %xmm6 2238; SSE2-NEXT: pand %xmm4, %xmm6 2239; SSE2-NEXT: pand %xmm4, %xmm3 2240; SSE2-NEXT: psrlw $4, %xmm3 2241; SSE2-NEXT: pand %xmm8, %xmm3 2242; SSE2-NEXT: por %xmm6, %xmm3 2243; SSE2-NEXT: pand %xmm3, %xmm5 2244; SSE2-NEXT: psllw $2, %xmm5 2245; SSE2-NEXT: pand %xmm9, %xmm5 2246; SSE2-NEXT: pand %xmm10, %xmm3 2247; SSE2-NEXT: psrlw $2, %xmm3 2248; SSE2-NEXT: pand %xmm11, %xmm3 2249; SSE2-NEXT: por %xmm5, %xmm3 2250; SSE2-NEXT: pand %xmm3, %xmm7 2251; SSE2-NEXT: psrlw $1, %xmm7 2252; SSE2-NEXT: pand %xmm12, %xmm7 2253; SSE2-NEXT: pand %xmm13, %xmm3 2254; SSE2-NEXT: paddb %xmm3, %xmm3 2255; SSE2-NEXT: por %xmm7, %xmm3 2256; SSE2-NEXT: retq 2257; 2258; SSSE3-LABEL: test_bitreverse_v8i64: 2259; SSSE3: # %bb.0: 2260; SSSE3-NEXT: movdqa %xmm1, %xmm5 2261; SSSE3-NEXT: movdqa %xmm0, %xmm1 2262; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2263; SSSE3-NEXT: pshufb %xmm8, %xmm1 2264; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2265; SSSE3-NEXT: movdqa %xmm1, %xmm0 2266; SSSE3-NEXT: pand %xmm9, %xmm0 2267; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2268; SSSE3-NEXT: movdqa %xmm7, %xmm6 2269; SSSE3-NEXT: pshufb %xmm0, %xmm6 2270; SSSE3-NEXT: psrlw $4, %xmm1 2271; SSSE3-NEXT: pand %xmm9, %xmm1 2272; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2273; SSSE3-NEXT: movdqa %xmm4, %xmm0 2274; SSSE3-NEXT: pshufb %xmm1, %xmm0 2275; SSSE3-NEXT: por %xmm6, %xmm0 2276; SSSE3-NEXT: pshufb %xmm8, %xmm5 2277; SSSE3-NEXT: movdqa %xmm5, %xmm1 2278; SSSE3-NEXT: pand %xmm9, %xmm1 2279; SSSE3-NEXT: movdqa %xmm7, %xmm6 2280; SSSE3-NEXT: pshufb %xmm1, %xmm6 2281; SSSE3-NEXT: psrlw $4, %xmm5 2282; SSSE3-NEXT: pand %xmm9, %xmm5 2283; SSSE3-NEXT: movdqa %xmm4, %xmm1 2284; SSSE3-NEXT: pshufb %xmm5, %xmm1 2285; SSSE3-NEXT: por %xmm6, %xmm1 2286; SSSE3-NEXT: pshufb %xmm8, %xmm2 2287; SSSE3-NEXT: movdqa %xmm2, %xmm5 2288; SSSE3-NEXT: pand %xmm9, %xmm5 2289; SSSE3-NEXT: movdqa %xmm7, %xmm6 2290; SSSE3-NEXT: pshufb %xmm5, %xmm6 2291; SSSE3-NEXT: psrlw $4, %xmm2 2292; SSSE3-NEXT: pand %xmm9, %xmm2 2293; SSSE3-NEXT: movdqa %xmm4, %xmm5 2294; SSSE3-NEXT: pshufb %xmm2, %xmm5 2295; SSSE3-NEXT: por %xmm6, %xmm5 2296; SSSE3-NEXT: pshufb %xmm8, %xmm3 2297; SSSE3-NEXT: movdqa %xmm3, %xmm2 2298; SSSE3-NEXT: pand %xmm9, %xmm2 2299; SSSE3-NEXT: pshufb %xmm2, %xmm7 2300; SSSE3-NEXT: psrlw $4, %xmm3 2301; SSSE3-NEXT: pand %xmm9, %xmm3 2302; SSSE3-NEXT: pshufb %xmm3, %xmm4 2303; SSSE3-NEXT: por %xmm7, %xmm4 2304; SSSE3-NEXT: movdqa %xmm5, %xmm2 2305; SSSE3-NEXT: movdqa %xmm4, %xmm3 2306; SSSE3-NEXT: retq 2307; 2308; AVX1-LABEL: test_bitreverse_v8i64: 2309; AVX1: # %bb.0: 2310; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2311; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2312; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2313; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2314; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2315; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2316; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2317; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2318; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2319; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2320; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2321; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2322; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2323; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2324; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2325; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2326; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2327; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2328; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2329; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2330; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2331; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2332; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2333; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2334; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2335; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2336; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2337; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2338; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2339; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2340; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2341; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2342; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2343; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2344; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2345; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2346; AVX1-NEXT: retq 2347; 2348; AVX2-LABEL: test_bitreverse_v8i64: 2349; AVX2: # %bb.0: 2350; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2351; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2352; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2353; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2354; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2355; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2356; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2357; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2358; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2359; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2360; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2361; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2362; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2363; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2364; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2365; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2366; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2367; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2368; AVX2-NEXT: retq 2369; 2370; AVX512F-LABEL: test_bitreverse_v8i64: 2371; AVX512F: # %bb.0: 2372; AVX512F-NEXT: vpsrlq $56, %zmm0, %zmm1 2373; AVX512F-NEXT: vpsrlq $40, %zmm0, %zmm2 2374; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 2375; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 2376; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2 2377; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 2378; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 2379; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2 2380; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 2381; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 2382; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2 2383; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 2384; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3 2385; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3 2386; AVX512F-NEXT: vporq %zmm2, %zmm3, %zmm2 2387; AVX512F-NEXT: vpsllq $56, %zmm0, %zmm3 2388; AVX512F-NEXT: vpsllq $40, %zmm0, %zmm0 2389; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 2390; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0 2391; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2392; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0 2393; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 2394; AVX512F-NEXT: vpsllq $4, %zmm1, %zmm1 2395; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 2396; AVX512F-NEXT: vpsrlq $4, %zmm0, %zmm0 2397; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2398; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 2399; AVX512F-NEXT: vpsllq $2, %zmm1, %zmm1 2400; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 2401; AVX512F-NEXT: vpsrlq $2, %zmm0, %zmm0 2402; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2403; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 2404; AVX512F-NEXT: vpsllq $1, %zmm1, %zmm1 2405; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 2406; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm0 2407; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2408; AVX512F-NEXT: retq 2409; 2410; AVX512BW-LABEL: test_bitreverse_v8i64: 2411; AVX512BW: # %bb.0: 2412; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] 2413; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2414; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2415; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2416; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2417; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2418; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2419; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2420; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2421; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2422; AVX512BW-NEXT: retq 2423; 2424; XOPAVX1-LABEL: test_bitreverse_v8i64: 2425; XOPAVX1: # %bb.0: 2426; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2427; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 2428; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2429; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2430; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2431; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2432; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2433; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2434; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2435; XOPAVX1-NEXT: retq 2436; 2437; XOPAVX2-LABEL: test_bitreverse_v8i64: 2438; XOPAVX2: # %bb.0: 2439; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2440; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 2441; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2442; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2443; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2444; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2445; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2446; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2447; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2448; XOPAVX2-NEXT: retq 2449 %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) 2450 ret <8 x i64> %b 2451} 2452 2453; 2454; Constant Folding 2455; 2456 2457define i32 @fold_bitreverse_i32() nounwind { 2458; ALL-LABEL: fold_bitreverse_i32: 2459; ALL: # %bb.0: 2460; ALL-NEXT: movl $16711935, %eax # imm = 0xFF00FF 2461; ALL-NEXT: retq 2462 %b = call i32 @llvm.bitreverse.i32(i32 4278255360) 2463 ret i32 %b 2464} 2465 2466define <16 x i8> @fold_bitreverse_v16i8() nounwind { 2467; SSE-LABEL: fold_bitreverse_v16i8: 2468; SSE: # %bb.0: 2469; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 2470; SSE-NEXT: retq 2471; 2472; AVX-LABEL: fold_bitreverse_v16i8: 2473; AVX: # %bb.0: 2474; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 2475; AVX-NEXT: retq 2476; 2477; XOP-LABEL: fold_bitreverse_v16i8: 2478; XOP: # %bb.0: 2479; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 2480; XOP-NEXT: retq 2481 %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>) 2482 ret <16 x i8> %b 2483} 2484 2485define <16 x i16> @fold_bitreverse_v16i16() nounwind { 2486; SSE-LABEL: fold_bitreverse_v16i16: 2487; SSE: # %bb.0: 2488; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959] 2489; SSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863] 2490; SSE-NEXT: retq 2491; 2492; AVX-LABEL: fold_bitreverse_v16i16: 2493; AVX: # %bb.0: 2494; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 2495; AVX-NEXT: retq 2496; 2497; XOP-LABEL: fold_bitreverse_v16i16: 2498; XOP: # %bb.0: 2499; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 2500; XOP-NEXT: retq 2501 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>) 2502 ret <16 x i16> %b 2503} 2504 2505define <16 x i32> @fold_bitreverse_v16i32() nounwind { 2506; SSE-LABEL: fold_bitreverse_v16i32: 2507; SSE: # %bb.0: 2508; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471] 2509; SSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559] 2510; SSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015] 2511; SSE-NEXT: movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103] 2512; SSE-NEXT: retq 2513; 2514; AVX1-LABEL: fold_bitreverse_v16i32: 2515; AVX1: # %bb.0: 2516; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 2517; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2518; AVX1-NEXT: retq 2519; 2520; AVX2-LABEL: fold_bitreverse_v16i32: 2521; AVX2: # %bb.0: 2522; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 2523; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2524; AVX2-NEXT: retq 2525; 2526; AVX512-LABEL: fold_bitreverse_v16i32: 2527; AVX512: # %bb.0: 2528; AVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2529; AVX512-NEXT: retq 2530; 2531; XOP-LABEL: fold_bitreverse_v16i32: 2532; XOP: # %bb.0: 2533; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 2534; XOP-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2535; XOP-NEXT: retq 2536 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>) 2537 ret <16 x i32> %b 2538} 2539 2540declare i8 @llvm.bitreverse.i8(i8) readnone 2541declare i16 @llvm.bitreverse.i16(i16) readnone 2542declare i32 @llvm.bitreverse.i32(i32) readnone 2543declare i64 @llvm.bitreverse.i64(i64) readnone 2544 2545declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone 2546declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone 2547declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone 2548declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone 2549 2550declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone 2551declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone 2552declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone 2553declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone 2554 2555declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone 2556declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone 2557declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone 2558declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone 2559