1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 10 11; Make sure we don't crash with avx512bw and xop 12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw 13 14define i8 @test_bitreverse_i8(i8 %a) nounwind { 15; SSE-LABEL: test_bitreverse_i8: 16; SSE: # %bb.0: 17; SSE-NEXT: movl %edi, %eax 18; SSE-NEXT: rolb $4, %al 19; SSE-NEXT: movl %eax, %ecx 20; SSE-NEXT: andb $51, %cl 21; SSE-NEXT: shlb $2, %cl 22; SSE-NEXT: andb $-52, %al 23; SSE-NEXT: shrb $2, %al 24; SSE-NEXT: orb %cl, %al 25; SSE-NEXT: movl %eax, %ecx 26; SSE-NEXT: andb $85, %cl 27; SSE-NEXT: addb %cl, %cl 28; SSE-NEXT: andb $-86, %al 29; SSE-NEXT: shrb %al 30; SSE-NEXT: orb %cl, %al 31; SSE-NEXT: # kill: def $al killed $al killed $eax 32; SSE-NEXT: retq 33; 34; AVX-LABEL: test_bitreverse_i8: 35; AVX: # %bb.0: 36; AVX-NEXT: movl %edi, %eax 37; AVX-NEXT: rolb $4, %al 38; AVX-NEXT: movl %eax, %ecx 39; AVX-NEXT: andb $51, %cl 40; AVX-NEXT: shlb $2, %cl 41; AVX-NEXT: andb $-52, %al 42; AVX-NEXT: shrb $2, %al 43; AVX-NEXT: orb %cl, %al 44; AVX-NEXT: movl %eax, %ecx 45; AVX-NEXT: andb $85, %cl 46; AVX-NEXT: addb %cl, %cl 47; AVX-NEXT: andb $-86, %al 48; AVX-NEXT: shrb %al 49; AVX-NEXT: orb %cl, %al 50; AVX-NEXT: # kill: def $al killed $al killed $eax 51; AVX-NEXT: retq 52; 53; XOP-LABEL: test_bitreverse_i8: 54; XOP: # %bb.0: 55; XOP-NEXT: vmovd %edi, %xmm0 56; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 57; XOP-NEXT: vpextrb $0, %xmm0, %eax 58; XOP-NEXT: # kill: def $al killed $al killed $eax 59; XOP-NEXT: retq 60 %b = call i8 @llvm.bitreverse.i8(i8 %a) 61 ret i8 %b 62} 63 64define i16 @test_bitreverse_i16(i16 %a) nounwind { 65; SSE-LABEL: test_bitreverse_i16: 66; SSE: # %bb.0: 67; SSE-NEXT: # kill: def $edi killed $edi def $rdi 68; SSE-NEXT: rolw $8, %di 69; SSE-NEXT: movl %edi, %eax 70; SSE-NEXT: andl $3855, %eax # imm = 0xF0F 71; SSE-NEXT: shll $4, %eax 72; SSE-NEXT: andl $61680, %edi # imm = 0xF0F0 73; SSE-NEXT: shrl $4, %edi 74; SSE-NEXT: orl %eax, %edi 75; SSE-NEXT: movl %edi, %eax 76; SSE-NEXT: andl $13107, %eax # imm = 0x3333 77; SSE-NEXT: andl $52428, %edi # imm = 0xCCCC 78; SSE-NEXT: shrl $2, %edi 79; SSE-NEXT: leal (%rdi,%rax,4), %eax 80; SSE-NEXT: movl %eax, %ecx 81; SSE-NEXT: andl $21845, %ecx # imm = 0x5555 82; SSE-NEXT: andl $43690, %eax # imm = 0xAAAA 83; SSE-NEXT: shrl %eax 84; SSE-NEXT: leal (%rax,%rcx,2), %eax 85; SSE-NEXT: # kill: def $ax killed $ax killed $eax 86; SSE-NEXT: retq 87; 88; AVX-LABEL: test_bitreverse_i16: 89; AVX: # %bb.0: 90; AVX-NEXT: # kill: def $edi killed $edi def $rdi 91; AVX-NEXT: rolw $8, %di 92; AVX-NEXT: movl %edi, %eax 93; AVX-NEXT: andl $3855, %eax # imm = 0xF0F 94; AVX-NEXT: shll $4, %eax 95; AVX-NEXT: andl $61680, %edi # imm = 0xF0F0 96; AVX-NEXT: shrl $4, %edi 97; AVX-NEXT: orl %eax, %edi 98; AVX-NEXT: movl %edi, %eax 99; AVX-NEXT: andl $13107, %eax # imm = 0x3333 100; AVX-NEXT: andl $52428, %edi # imm = 0xCCCC 101; AVX-NEXT: shrl $2, %edi 102; AVX-NEXT: leal (%rdi,%rax,4), %eax 103; AVX-NEXT: movl %eax, %ecx 104; AVX-NEXT: andl $21845, %ecx # imm = 0x5555 105; AVX-NEXT: andl $43690, %eax # imm = 0xAAAA 106; AVX-NEXT: shrl %eax 107; AVX-NEXT: leal (%rax,%rcx,2), %eax 108; AVX-NEXT: # kill: def $ax killed $ax killed $eax 109; AVX-NEXT: retq 110; 111; XOP-LABEL: test_bitreverse_i16: 112; XOP: # %bb.0: 113; XOP-NEXT: vmovd %edi, %xmm0 114; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 115; XOP-NEXT: vmovd %xmm0, %eax 116; XOP-NEXT: # kill: def $ax killed $ax killed $eax 117; XOP-NEXT: retq 118 %b = call i16 @llvm.bitreverse.i16(i16 %a) 119 ret i16 %b 120} 121 122define i32 @test_bitreverse_i32(i32 %a) nounwind { 123; SSE-LABEL: test_bitreverse_i32: 124; SSE: # %bb.0: 125; SSE-NEXT: # kill: def $edi killed $edi def $rdi 126; SSE-NEXT: bswapl %edi 127; SSE-NEXT: movl %edi, %eax 128; SSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F 129; SSE-NEXT: shll $4, %eax 130; SSE-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 131; SSE-NEXT: shrl $4, %edi 132; SSE-NEXT: orl %eax, %edi 133; SSE-NEXT: movl %edi, %eax 134; SSE-NEXT: andl $858993459, %eax # imm = 0x33333333 135; SSE-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC 136; SSE-NEXT: shrl $2, %edi 137; SSE-NEXT: leal (%rdi,%rax,4), %eax 138; SSE-NEXT: movl %eax, %ecx 139; SSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555 140; SSE-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA 141; SSE-NEXT: shrl %eax 142; SSE-NEXT: leal (%rax,%rcx,2), %eax 143; SSE-NEXT: retq 144; 145; AVX-LABEL: test_bitreverse_i32: 146; AVX: # %bb.0: 147; AVX-NEXT: # kill: def $edi killed $edi def $rdi 148; AVX-NEXT: bswapl %edi 149; AVX-NEXT: movl %edi, %eax 150; AVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F 151; AVX-NEXT: shll $4, %eax 152; AVX-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 153; AVX-NEXT: shrl $4, %edi 154; AVX-NEXT: orl %eax, %edi 155; AVX-NEXT: movl %edi, %eax 156; AVX-NEXT: andl $858993459, %eax # imm = 0x33333333 157; AVX-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC 158; AVX-NEXT: shrl $2, %edi 159; AVX-NEXT: leal (%rdi,%rax,4), %eax 160; AVX-NEXT: movl %eax, %ecx 161; AVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555 162; AVX-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA 163; AVX-NEXT: shrl %eax 164; AVX-NEXT: leal (%rax,%rcx,2), %eax 165; AVX-NEXT: retq 166; 167; XOP-LABEL: test_bitreverse_i32: 168; XOP: # %bb.0: 169; XOP-NEXT: vmovd %edi, %xmm0 170; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 171; XOP-NEXT: vmovd %xmm0, %eax 172; XOP-NEXT: retq 173 %b = call i32 @llvm.bitreverse.i32(i32 %a) 174 ret i32 %b 175} 176 177define i64 @test_bitreverse_i64(i64 %a) nounwind { 178; SSE-LABEL: test_bitreverse_i64: 179; SSE: # %bb.0: 180; SSE-NEXT: bswapq %rdi 181; SSE-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F 182; SSE-NEXT: andq %rdi, %rax 183; SSE-NEXT: shlq $4, %rax 184; SSE-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 185; SSE-NEXT: andq %rdi, %rcx 186; SSE-NEXT: shrq $4, %rcx 187; SSE-NEXT: orq %rax, %rcx 188; SSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 189; SSE-NEXT: andq %rcx, %rax 190; SSE-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC 191; SSE-NEXT: andq %rcx, %rdx 192; SSE-NEXT: shrq $2, %rdx 193; SSE-NEXT: leaq (%rdx,%rax,4), %rax 194; SSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 195; SSE-NEXT: andq %rax, %rcx 196; SSE-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA 197; SSE-NEXT: andq %rax, %rdx 198; SSE-NEXT: shrq %rdx 199; SSE-NEXT: leaq (%rdx,%rcx,2), %rax 200; SSE-NEXT: retq 201; 202; AVX-LABEL: test_bitreverse_i64: 203; AVX: # %bb.0: 204; AVX-NEXT: bswapq %rdi 205; AVX-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F 206; AVX-NEXT: andq %rdi, %rax 207; AVX-NEXT: shlq $4, %rax 208; AVX-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 209; AVX-NEXT: andq %rdi, %rcx 210; AVX-NEXT: shrq $4, %rcx 211; AVX-NEXT: orq %rax, %rcx 212; AVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 213; AVX-NEXT: andq %rcx, %rax 214; AVX-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC 215; AVX-NEXT: andq %rcx, %rdx 216; AVX-NEXT: shrq $2, %rdx 217; AVX-NEXT: leaq (%rdx,%rax,4), %rax 218; AVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 219; AVX-NEXT: andq %rax, %rcx 220; AVX-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA 221; AVX-NEXT: andq %rax, %rdx 222; AVX-NEXT: shrq %rdx 223; AVX-NEXT: leaq (%rdx,%rcx,2), %rax 224; AVX-NEXT: retq 225; 226; XOP-LABEL: test_bitreverse_i64: 227; XOP: # %bb.0: 228; XOP-NEXT: vmovq %rdi, %xmm0 229; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 230; XOP-NEXT: vmovq %xmm0, %rax 231; XOP-NEXT: retq 232 %b = call i64 @llvm.bitreverse.i64(i64 %a) 233 ret i64 %b 234} 235 236define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind { 237; SSE2-LABEL: test_bitreverse_v16i8: 238; SSE2: # %bb.0: 239; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 240; SSE2-NEXT: movdqa %xmm0, %xmm2 241; SSE2-NEXT: pand %xmm1, %xmm2 242; SSE2-NEXT: psllw $4, %xmm2 243; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 244; SSE2-NEXT: pand %xmm3, %xmm2 245; SSE2-NEXT: pand %xmm3, %xmm0 246; SSE2-NEXT: psrlw $4, %xmm0 247; SSE2-NEXT: pand %xmm1, %xmm0 248; SSE2-NEXT: por %xmm2, %xmm0 249; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 250; SSE2-NEXT: pand %xmm0, %xmm1 251; SSE2-NEXT: psllw $2, %xmm1 252; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 253; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 254; SSE2-NEXT: psrlw $2, %xmm0 255; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 256; SSE2-NEXT: por %xmm1, %xmm0 257; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 258; SSE2-NEXT: pand %xmm0, %xmm1 259; SSE2-NEXT: paddb %xmm1, %xmm1 260; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 261; SSE2-NEXT: psrlw $1, %xmm0 262; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 263; SSE2-NEXT: por %xmm1, %xmm0 264; SSE2-NEXT: retq 265; 266; SSSE3-LABEL: test_bitreverse_v16i8: 267; SSSE3: # %bb.0: 268; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 269; SSSE3-NEXT: movdqa %xmm0, %xmm2 270; SSSE3-NEXT: pand %xmm1, %xmm2 271; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 272; SSSE3-NEXT: pshufb %xmm2, %xmm3 273; SSSE3-NEXT: psrlw $4, %xmm0 274; SSSE3-NEXT: pand %xmm1, %xmm0 275; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 276; SSSE3-NEXT: pshufb %xmm0, %xmm1 277; SSSE3-NEXT: por %xmm3, %xmm1 278; SSSE3-NEXT: movdqa %xmm1, %xmm0 279; SSSE3-NEXT: retq 280; 281; AVX-LABEL: test_bitreverse_v16i8: 282; AVX: # %bb.0: 283; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 284; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 285; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 286; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 287; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 288; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 289; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 290; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 291; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 292; AVX-NEXT: retq 293; 294; XOP-LABEL: test_bitreverse_v16i8: 295; XOP: # %bb.0: 296; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 297; XOP-NEXT: retq 298 %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) 299 ret <16 x i8> %b 300} 301 302define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind { 303; SSE2-LABEL: test_bitreverse_v8i16: 304; SSE2: # %bb.0: 305; SSE2-NEXT: pxor %xmm1, %xmm1 306; SSE2-NEXT: movdqa %xmm0, %xmm2 307; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 308; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] 309; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6] 310; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 311; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 312; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] 313; SSE2-NEXT: packuswb %xmm2, %xmm0 314; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 315; SSE2-NEXT: movdqa %xmm0, %xmm2 316; SSE2-NEXT: pand %xmm1, %xmm2 317; SSE2-NEXT: psllw $4, %xmm2 318; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 319; SSE2-NEXT: pand %xmm3, %xmm2 320; SSE2-NEXT: pand %xmm3, %xmm0 321; SSE2-NEXT: psrlw $4, %xmm0 322; SSE2-NEXT: pand %xmm1, %xmm0 323; SSE2-NEXT: por %xmm2, %xmm0 324; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 325; SSE2-NEXT: pand %xmm0, %xmm1 326; SSE2-NEXT: psllw $2, %xmm1 327; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 328; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 329; SSE2-NEXT: psrlw $2, %xmm0 330; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 331; SSE2-NEXT: por %xmm1, %xmm0 332; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 333; SSE2-NEXT: pand %xmm0, %xmm1 334; SSE2-NEXT: paddb %xmm1, %xmm1 335; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 336; SSE2-NEXT: psrlw $1, %xmm0 337; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 338; SSE2-NEXT: por %xmm1, %xmm0 339; SSE2-NEXT: retq 340; 341; SSSE3-LABEL: test_bitreverse_v8i16: 342; SSSE3: # %bb.0: 343; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 344; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 345; SSSE3-NEXT: movdqa %xmm0, %xmm2 346; SSSE3-NEXT: pand %xmm1, %xmm2 347; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 348; SSSE3-NEXT: pshufb %xmm2, %xmm3 349; SSSE3-NEXT: psrlw $4, %xmm0 350; SSSE3-NEXT: pand %xmm1, %xmm0 351; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 352; SSSE3-NEXT: pshufb %xmm0, %xmm1 353; SSSE3-NEXT: por %xmm3, %xmm1 354; SSSE3-NEXT: movdqa %xmm1, %xmm0 355; SSSE3-NEXT: retq 356; 357; AVX-LABEL: test_bitreverse_v8i16: 358; AVX: # %bb.0: 359; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 360; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 361; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 362; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 363; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 364; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 365; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 366; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 367; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 368; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 369; AVX-NEXT: retq 370; 371; XOP-LABEL: test_bitreverse_v8i16: 372; XOP: # %bb.0: 373; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 374; XOP-NEXT: retq 375 %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) 376 ret <8 x i16> %b 377} 378 379define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind { 380; SSE2-LABEL: test_bitreverse_v4i32: 381; SSE2: # %bb.0: 382; SSE2-NEXT: pxor %xmm1, %xmm1 383; SSE2-NEXT: movdqa %xmm0, %xmm2 384; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 385; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 386; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 387; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 388; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 389; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 390; SSE2-NEXT: packuswb %xmm2, %xmm0 391; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 392; SSE2-NEXT: movdqa %xmm0, %xmm2 393; SSE2-NEXT: pand %xmm1, %xmm2 394; SSE2-NEXT: psllw $4, %xmm2 395; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 396; SSE2-NEXT: pand %xmm3, %xmm2 397; SSE2-NEXT: pand %xmm3, %xmm0 398; SSE2-NEXT: psrlw $4, %xmm0 399; SSE2-NEXT: pand %xmm1, %xmm0 400; SSE2-NEXT: por %xmm2, %xmm0 401; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 402; SSE2-NEXT: pand %xmm0, %xmm1 403; SSE2-NEXT: psllw $2, %xmm1 404; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 405; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 406; SSE2-NEXT: psrlw $2, %xmm0 407; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 408; SSE2-NEXT: por %xmm1, %xmm0 409; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 410; SSE2-NEXT: pand %xmm0, %xmm1 411; SSE2-NEXT: paddb %xmm1, %xmm1 412; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 413; SSE2-NEXT: psrlw $1, %xmm0 414; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 415; SSE2-NEXT: por %xmm1, %xmm0 416; SSE2-NEXT: retq 417; 418; SSSE3-LABEL: test_bitreverse_v4i32: 419; SSSE3: # %bb.0: 420; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 421; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 422; SSSE3-NEXT: movdqa %xmm0, %xmm2 423; SSSE3-NEXT: pand %xmm1, %xmm2 424; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 425; SSSE3-NEXT: pshufb %xmm2, %xmm3 426; SSSE3-NEXT: psrlw $4, %xmm0 427; SSSE3-NEXT: pand %xmm1, %xmm0 428; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 429; SSSE3-NEXT: pshufb %xmm0, %xmm1 430; SSSE3-NEXT: por %xmm3, %xmm1 431; SSSE3-NEXT: movdqa %xmm1, %xmm0 432; SSSE3-NEXT: retq 433; 434; AVX-LABEL: test_bitreverse_v4i32: 435; AVX: # %bb.0: 436; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 437; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 438; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 439; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 440; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 441; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 442; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 443; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 444; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 445; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 446; AVX-NEXT: retq 447; 448; XOP-LABEL: test_bitreverse_v4i32: 449; XOP: # %bb.0: 450; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 451; XOP-NEXT: retq 452 %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) 453 ret <4 x i32> %b 454} 455 456define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { 457; SSE2-LABEL: test_bitreverse_v2i64: 458; SSE2: # %bb.0: 459; SSE2-NEXT: pxor %xmm1, %xmm1 460; SSE2-NEXT: movdqa %xmm0, %xmm2 461; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 462; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 463; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 464; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 465; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 466; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 467; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 468; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 469; SSE2-NEXT: packuswb %xmm2, %xmm0 470; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 471; SSE2-NEXT: movdqa %xmm0, %xmm2 472; SSE2-NEXT: pand %xmm1, %xmm2 473; SSE2-NEXT: psllw $4, %xmm2 474; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 475; SSE2-NEXT: pand %xmm3, %xmm2 476; SSE2-NEXT: pand %xmm3, %xmm0 477; SSE2-NEXT: psrlw $4, %xmm0 478; SSE2-NEXT: pand %xmm1, %xmm0 479; SSE2-NEXT: por %xmm2, %xmm0 480; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 481; SSE2-NEXT: pand %xmm0, %xmm1 482; SSE2-NEXT: psllw $2, %xmm1 483; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 484; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 485; SSE2-NEXT: psrlw $2, %xmm0 486; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 487; SSE2-NEXT: por %xmm1, %xmm0 488; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 489; SSE2-NEXT: pand %xmm0, %xmm1 490; SSE2-NEXT: paddb %xmm1, %xmm1 491; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 492; SSE2-NEXT: psrlw $1, %xmm0 493; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 494; SSE2-NEXT: por %xmm1, %xmm0 495; SSE2-NEXT: retq 496; 497; SSSE3-LABEL: test_bitreverse_v2i64: 498; SSSE3: # %bb.0: 499; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 500; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 501; SSSE3-NEXT: movdqa %xmm0, %xmm2 502; SSSE3-NEXT: pand %xmm1, %xmm2 503; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 504; SSSE3-NEXT: pshufb %xmm2, %xmm3 505; SSSE3-NEXT: psrlw $4, %xmm0 506; SSSE3-NEXT: pand %xmm1, %xmm0 507; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 508; SSSE3-NEXT: pshufb %xmm0, %xmm1 509; SSSE3-NEXT: por %xmm3, %xmm1 510; SSSE3-NEXT: movdqa %xmm1, %xmm0 511; SSSE3-NEXT: retq 512; 513; AVX-LABEL: test_bitreverse_v2i64: 514; AVX: # %bb.0: 515; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 516; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 517; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 518; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 519; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 520; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 521; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 522; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 523; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 524; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 525; AVX-NEXT: retq 526; 527; XOP-LABEL: test_bitreverse_v2i64: 528; XOP: # %bb.0: 529; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 530; XOP-NEXT: retq 531 %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) 532 ret <2 x i64> %b 533} 534 535define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind { 536; SSE2-LABEL: test_bitreverse_v32i8: 537; SSE2: # %bb.0: 538; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 539; SSE2-NEXT: movdqa %xmm0, %xmm3 540; SSE2-NEXT: pand %xmm2, %xmm3 541; SSE2-NEXT: psllw $4, %xmm3 542; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 543; SSE2-NEXT: pand %xmm5, %xmm3 544; SSE2-NEXT: pand %xmm5, %xmm0 545; SSE2-NEXT: psrlw $4, %xmm0 546; SSE2-NEXT: pand %xmm2, %xmm0 547; SSE2-NEXT: por %xmm3, %xmm0 548; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 549; SSE2-NEXT: movdqa %xmm0, %xmm4 550; SSE2-NEXT: pand %xmm3, %xmm4 551; SSE2-NEXT: psllw $2, %xmm4 552; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 553; SSE2-NEXT: pand %xmm8, %xmm4 554; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 555; SSE2-NEXT: pand %xmm9, %xmm0 556; SSE2-NEXT: psrlw $2, %xmm0 557; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 558; SSE2-NEXT: pand %xmm10, %xmm0 559; SSE2-NEXT: por %xmm4, %xmm0 560; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 561; SSE2-NEXT: movdqa %xmm0, %xmm7 562; SSE2-NEXT: pand %xmm4, %xmm7 563; SSE2-NEXT: psrlw $1, %xmm7 564; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 565; SSE2-NEXT: pand %xmm11, %xmm7 566; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 567; SSE2-NEXT: pand %xmm6, %xmm0 568; SSE2-NEXT: paddb %xmm0, %xmm0 569; SSE2-NEXT: por %xmm7, %xmm0 570; SSE2-NEXT: movdqa %xmm1, %xmm7 571; SSE2-NEXT: pand %xmm2, %xmm7 572; SSE2-NEXT: psllw $4, %xmm7 573; SSE2-NEXT: pand %xmm5, %xmm7 574; SSE2-NEXT: pand %xmm5, %xmm1 575; SSE2-NEXT: psrlw $4, %xmm1 576; SSE2-NEXT: pand %xmm2, %xmm1 577; SSE2-NEXT: por %xmm7, %xmm1 578; SSE2-NEXT: pand %xmm1, %xmm3 579; SSE2-NEXT: psllw $2, %xmm3 580; SSE2-NEXT: pand %xmm8, %xmm3 581; SSE2-NEXT: pand %xmm9, %xmm1 582; SSE2-NEXT: psrlw $2, %xmm1 583; SSE2-NEXT: pand %xmm10, %xmm1 584; SSE2-NEXT: por %xmm3, %xmm1 585; SSE2-NEXT: pand %xmm1, %xmm4 586; SSE2-NEXT: psrlw $1, %xmm4 587; SSE2-NEXT: pand %xmm11, %xmm4 588; SSE2-NEXT: pand %xmm6, %xmm1 589; SSE2-NEXT: paddb %xmm1, %xmm1 590; SSE2-NEXT: por %xmm4, %xmm1 591; SSE2-NEXT: retq 592; 593; SSSE3-LABEL: test_bitreverse_v32i8: 594; SSSE3: # %bb.0: 595; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 596; SSSE3-NEXT: movdqa %xmm0, %xmm2 597; SSSE3-NEXT: pand %xmm4, %xmm2 598; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 599; SSSE3-NEXT: movdqa %xmm5, %xmm6 600; SSSE3-NEXT: pshufb %xmm2, %xmm6 601; SSSE3-NEXT: psrlw $4, %xmm0 602; SSSE3-NEXT: pand %xmm4, %xmm0 603; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 604; SSSE3-NEXT: movdqa %xmm2, %xmm3 605; SSSE3-NEXT: pshufb %xmm0, %xmm3 606; SSSE3-NEXT: por %xmm6, %xmm3 607; SSSE3-NEXT: movdqa %xmm1, %xmm0 608; SSSE3-NEXT: pand %xmm4, %xmm0 609; SSSE3-NEXT: pshufb %xmm0, %xmm5 610; SSSE3-NEXT: psrlw $4, %xmm1 611; SSSE3-NEXT: pand %xmm4, %xmm1 612; SSSE3-NEXT: pshufb %xmm1, %xmm2 613; SSSE3-NEXT: por %xmm5, %xmm2 614; SSSE3-NEXT: movdqa %xmm3, %xmm0 615; SSSE3-NEXT: movdqa %xmm2, %xmm1 616; SSSE3-NEXT: retq 617; 618; AVX1-LABEL: test_bitreverse_v32i8: 619; AVX1: # %bb.0: 620; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 621; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 622; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 623; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 624; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 625; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 626; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 627; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 628; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 629; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 630; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 631; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 632; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 633; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 634; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 635; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 636; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 637; AVX1-NEXT: retq 638; 639; AVX2-LABEL: test_bitreverse_v32i8: 640; AVX2: # %bb.0: 641; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 642; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 643; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 644; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 645; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 646; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 647; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 648; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 649; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 650; AVX2-NEXT: retq 651; 652; AVX512-LABEL: test_bitreverse_v32i8: 653; AVX512: # %bb.0: 654; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 655; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 656; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 657; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 658; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 659; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 660; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 661; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 662; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 663; AVX512-NEXT: retq 664; 665; XOPAVX1-LABEL: test_bitreverse_v32i8: 666; XOPAVX1: # %bb.0: 667; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 668; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 669; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 670; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 671; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 672; XOPAVX1-NEXT: retq 673; 674; XOPAVX2-LABEL: test_bitreverse_v32i8: 675; XOPAVX2: # %bb.0: 676; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 677; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 678; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 679; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 680; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 681; XOPAVX2-NEXT: retq 682 %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) 683 ret <32 x i8> %b 684} 685 686define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { 687; SSE2-LABEL: test_bitreverse_v16i16: 688; SSE2: # %bb.0: 689; SSE2-NEXT: pxor %xmm4, %xmm4 690; SSE2-NEXT: movdqa %xmm0, %xmm2 691; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 692; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] 693; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6] 694; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 695; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 696; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] 697; SSE2-NEXT: packuswb %xmm2, %xmm0 698; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 699; SSE2-NEXT: movdqa %xmm0, %xmm3 700; SSE2-NEXT: pand %xmm2, %xmm3 701; SSE2-NEXT: psllw $4, %xmm3 702; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 703; SSE2-NEXT: pand %xmm6, %xmm3 704; SSE2-NEXT: pand %xmm6, %xmm0 705; SSE2-NEXT: psrlw $4, %xmm0 706; SSE2-NEXT: pand %xmm2, %xmm0 707; SSE2-NEXT: por %xmm3, %xmm0 708; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 709; SSE2-NEXT: movdqa %xmm0, %xmm5 710; SSE2-NEXT: pand %xmm3, %xmm5 711; SSE2-NEXT: psllw $2, %xmm5 712; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 713; SSE2-NEXT: pand %xmm8, %xmm5 714; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 715; SSE2-NEXT: pand %xmm9, %xmm0 716; SSE2-NEXT: psrlw $2, %xmm0 717; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 718; SSE2-NEXT: pand %xmm10, %xmm0 719; SSE2-NEXT: por %xmm5, %xmm0 720; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 721; SSE2-NEXT: movdqa %xmm0, %xmm7 722; SSE2-NEXT: pand %xmm5, %xmm7 723; SSE2-NEXT: psrlw $1, %xmm7 724; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 725; SSE2-NEXT: pand %xmm11, %xmm7 726; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 727; SSE2-NEXT: pand %xmm12, %xmm0 728; SSE2-NEXT: paddb %xmm0, %xmm0 729; SSE2-NEXT: por %xmm7, %xmm0 730; SSE2-NEXT: movdqa %xmm1, %xmm7 731; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] 732; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7] 733; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,7,6] 734; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 735; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 736; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] 737; SSE2-NEXT: packuswb %xmm7, %xmm1 738; SSE2-NEXT: movdqa %xmm1, %xmm4 739; SSE2-NEXT: pand %xmm2, %xmm4 740; SSE2-NEXT: psllw $4, %xmm4 741; SSE2-NEXT: pand %xmm6, %xmm4 742; SSE2-NEXT: pand %xmm6, %xmm1 743; SSE2-NEXT: psrlw $4, %xmm1 744; SSE2-NEXT: pand %xmm2, %xmm1 745; SSE2-NEXT: por %xmm4, %xmm1 746; SSE2-NEXT: pand %xmm1, %xmm3 747; SSE2-NEXT: psllw $2, %xmm3 748; SSE2-NEXT: pand %xmm8, %xmm3 749; SSE2-NEXT: pand %xmm9, %xmm1 750; SSE2-NEXT: psrlw $2, %xmm1 751; SSE2-NEXT: pand %xmm10, %xmm1 752; SSE2-NEXT: por %xmm3, %xmm1 753; SSE2-NEXT: pand %xmm1, %xmm5 754; SSE2-NEXT: psrlw $1, %xmm5 755; SSE2-NEXT: pand %xmm11, %xmm5 756; SSE2-NEXT: pand %xmm12, %xmm1 757; SSE2-NEXT: paddb %xmm1, %xmm1 758; SSE2-NEXT: por %xmm5, %xmm1 759; SSE2-NEXT: retq 760; 761; SSSE3-LABEL: test_bitreverse_v16i16: 762; SSSE3: # %bb.0: 763; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 764; SSSE3-NEXT: pshufb %xmm4, %xmm0 765; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 766; SSSE3-NEXT: movdqa %xmm0, %xmm2 767; SSSE3-NEXT: pand %xmm5, %xmm2 768; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 769; SSSE3-NEXT: movdqa %xmm6, %xmm7 770; SSSE3-NEXT: pshufb %xmm2, %xmm7 771; SSSE3-NEXT: psrlw $4, %xmm0 772; SSSE3-NEXT: pand %xmm5, %xmm0 773; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 774; SSSE3-NEXT: movdqa %xmm2, %xmm3 775; SSSE3-NEXT: pshufb %xmm0, %xmm3 776; SSSE3-NEXT: por %xmm7, %xmm3 777; SSSE3-NEXT: pshufb %xmm4, %xmm1 778; SSSE3-NEXT: movdqa %xmm1, %xmm0 779; SSSE3-NEXT: pand %xmm5, %xmm0 780; SSSE3-NEXT: pshufb %xmm0, %xmm6 781; SSSE3-NEXT: psrlw $4, %xmm1 782; SSSE3-NEXT: pand %xmm5, %xmm1 783; SSSE3-NEXT: pshufb %xmm1, %xmm2 784; SSSE3-NEXT: por %xmm6, %xmm2 785; SSSE3-NEXT: movdqa %xmm3, %xmm0 786; SSSE3-NEXT: movdqa %xmm2, %xmm1 787; SSSE3-NEXT: retq 788; 789; AVX1-LABEL: test_bitreverse_v16i16: 790; AVX1: # %bb.0: 791; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 792; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 793; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 794; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 795; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 796; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 797; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 798; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 799; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 800; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 801; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 802; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 803; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 804; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 805; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 806; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 807; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 808; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 809; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 810; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 811; AVX1-NEXT: retq 812; 813; AVX2-LABEL: test_bitreverse_v16i16: 814; AVX2: # %bb.0: 815; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 816; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 817; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 818; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 819; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 820; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 821; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 822; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 823; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 824; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 825; AVX2-NEXT: retq 826; 827; AVX512-LABEL: test_bitreverse_v16i16: 828; AVX512: # %bb.0: 829; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 830; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 831; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 832; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 833; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 834; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 835; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 836; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 837; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 838; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 839; AVX512-NEXT: retq 840; 841; XOPAVX1-LABEL: test_bitreverse_v16i16: 842; XOPAVX1: # %bb.0: 843; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 844; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 845; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 846; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 847; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 848; XOPAVX1-NEXT: retq 849; 850; XOPAVX2-LABEL: test_bitreverse_v16i16: 851; XOPAVX2: # %bb.0: 852; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 853; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 854; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 855; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 856; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 857; XOPAVX2-NEXT: retq 858 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) 859 ret <16 x i16> %b 860} 861 862define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { 863; SSE2-LABEL: test_bitreverse_v8i32: 864; SSE2: # %bb.0: 865; SSE2-NEXT: pxor %xmm4, %xmm4 866; SSE2-NEXT: movdqa %xmm0, %xmm2 867; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 868; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 869; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 870; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 871; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 872; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 873; SSE2-NEXT: packuswb %xmm2, %xmm0 874; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 875; SSE2-NEXT: movdqa %xmm0, %xmm3 876; SSE2-NEXT: pand %xmm2, %xmm3 877; SSE2-NEXT: psllw $4, %xmm3 878; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 879; SSE2-NEXT: pand %xmm6, %xmm3 880; SSE2-NEXT: pand %xmm6, %xmm0 881; SSE2-NEXT: psrlw $4, %xmm0 882; SSE2-NEXT: pand %xmm2, %xmm0 883; SSE2-NEXT: por %xmm3, %xmm0 884; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 885; SSE2-NEXT: movdqa %xmm0, %xmm5 886; SSE2-NEXT: pand %xmm3, %xmm5 887; SSE2-NEXT: psllw $2, %xmm5 888; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 889; SSE2-NEXT: pand %xmm8, %xmm5 890; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 891; SSE2-NEXT: pand %xmm9, %xmm0 892; SSE2-NEXT: psrlw $2, %xmm0 893; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 894; SSE2-NEXT: pand %xmm10, %xmm0 895; SSE2-NEXT: por %xmm5, %xmm0 896; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 897; SSE2-NEXT: movdqa %xmm0, %xmm7 898; SSE2-NEXT: pand %xmm5, %xmm7 899; SSE2-NEXT: psrlw $1, %xmm7 900; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 901; SSE2-NEXT: pand %xmm11, %xmm7 902; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 903; SSE2-NEXT: pand %xmm12, %xmm0 904; SSE2-NEXT: paddb %xmm0, %xmm0 905; SSE2-NEXT: por %xmm7, %xmm0 906; SSE2-NEXT: movdqa %xmm1, %xmm7 907; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] 908; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7] 909; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4] 910; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 911; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 912; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 913; SSE2-NEXT: packuswb %xmm7, %xmm1 914; SSE2-NEXT: movdqa %xmm1, %xmm4 915; SSE2-NEXT: pand %xmm2, %xmm4 916; SSE2-NEXT: psllw $4, %xmm4 917; SSE2-NEXT: pand %xmm6, %xmm4 918; SSE2-NEXT: pand %xmm6, %xmm1 919; SSE2-NEXT: psrlw $4, %xmm1 920; SSE2-NEXT: pand %xmm2, %xmm1 921; SSE2-NEXT: por %xmm4, %xmm1 922; SSE2-NEXT: pand %xmm1, %xmm3 923; SSE2-NEXT: psllw $2, %xmm3 924; SSE2-NEXT: pand %xmm8, %xmm3 925; SSE2-NEXT: pand %xmm9, %xmm1 926; SSE2-NEXT: psrlw $2, %xmm1 927; SSE2-NEXT: pand %xmm10, %xmm1 928; SSE2-NEXT: por %xmm3, %xmm1 929; SSE2-NEXT: pand %xmm1, %xmm5 930; SSE2-NEXT: psrlw $1, %xmm5 931; SSE2-NEXT: pand %xmm11, %xmm5 932; SSE2-NEXT: pand %xmm12, %xmm1 933; SSE2-NEXT: paddb %xmm1, %xmm1 934; SSE2-NEXT: por %xmm5, %xmm1 935; SSE2-NEXT: retq 936; 937; SSSE3-LABEL: test_bitreverse_v8i32: 938; SSSE3: # %bb.0: 939; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 940; SSSE3-NEXT: pshufb %xmm4, %xmm0 941; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 942; SSSE3-NEXT: movdqa %xmm0, %xmm2 943; SSSE3-NEXT: pand %xmm5, %xmm2 944; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 945; SSSE3-NEXT: movdqa %xmm6, %xmm7 946; SSSE3-NEXT: pshufb %xmm2, %xmm7 947; SSSE3-NEXT: psrlw $4, %xmm0 948; SSSE3-NEXT: pand %xmm5, %xmm0 949; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 950; SSSE3-NEXT: movdqa %xmm2, %xmm3 951; SSSE3-NEXT: pshufb %xmm0, %xmm3 952; SSSE3-NEXT: por %xmm7, %xmm3 953; SSSE3-NEXT: pshufb %xmm4, %xmm1 954; SSSE3-NEXT: movdqa %xmm1, %xmm0 955; SSSE3-NEXT: pand %xmm5, %xmm0 956; SSSE3-NEXT: pshufb %xmm0, %xmm6 957; SSSE3-NEXT: psrlw $4, %xmm1 958; SSSE3-NEXT: pand %xmm5, %xmm1 959; SSSE3-NEXT: pshufb %xmm1, %xmm2 960; SSSE3-NEXT: por %xmm6, %xmm2 961; SSSE3-NEXT: movdqa %xmm3, %xmm0 962; SSSE3-NEXT: movdqa %xmm2, %xmm1 963; SSSE3-NEXT: retq 964; 965; AVX1-LABEL: test_bitreverse_v8i32: 966; AVX1: # %bb.0: 967; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 968; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 969; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 970; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 971; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 972; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 973; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 974; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 975; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 976; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 977; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 978; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 979; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 980; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 981; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 982; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 983; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 984; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 985; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 986; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 987; AVX1-NEXT: retq 988; 989; AVX2-LABEL: test_bitreverse_v8i32: 990; AVX2: # %bb.0: 991; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 992; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 993; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 994; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 995; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 996; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 997; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 998; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 999; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1000; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1001; AVX2-NEXT: retq 1002; 1003; AVX512-LABEL: test_bitreverse_v8i32: 1004; AVX512: # %bb.0: 1005; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1006; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1007; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1008; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1009; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1010; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1011; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1012; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1013; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1014; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1015; AVX512-NEXT: retq 1016; 1017; XOPAVX1-LABEL: test_bitreverse_v8i32: 1018; XOPAVX1: # %bb.0: 1019; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1020; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1021; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1022; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1023; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1024; XOPAVX1-NEXT: retq 1025; 1026; XOPAVX2-LABEL: test_bitreverse_v8i32: 1027; XOPAVX2: # %bb.0: 1028; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1029; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1030; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1031; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1032; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1033; XOPAVX2-NEXT: retq 1034 %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) 1035 ret <8 x i32> %b 1036} 1037 1038define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { 1039; SSE2-LABEL: test_bitreverse_v4i64: 1040; SSE2: # %bb.0: 1041; SSE2-NEXT: pxor %xmm4, %xmm4 1042; SSE2-NEXT: movdqa %xmm0, %xmm2 1043; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 1044; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 1045; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1046; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1047; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1048; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1049; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1050; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1051; SSE2-NEXT: packuswb %xmm2, %xmm0 1052; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1053; SSE2-NEXT: movdqa %xmm0, %xmm3 1054; SSE2-NEXT: pand %xmm2, %xmm3 1055; SSE2-NEXT: psllw $4, %xmm3 1056; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1057; SSE2-NEXT: pand %xmm6, %xmm3 1058; SSE2-NEXT: pand %xmm6, %xmm0 1059; SSE2-NEXT: psrlw $4, %xmm0 1060; SSE2-NEXT: pand %xmm2, %xmm0 1061; SSE2-NEXT: por %xmm3, %xmm0 1062; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1063; SSE2-NEXT: movdqa %xmm0, %xmm5 1064; SSE2-NEXT: pand %xmm3, %xmm5 1065; SSE2-NEXT: psllw $2, %xmm5 1066; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 1067; SSE2-NEXT: pand %xmm8, %xmm5 1068; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1069; SSE2-NEXT: pand %xmm9, %xmm0 1070; SSE2-NEXT: psrlw $2, %xmm0 1071; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 1072; SSE2-NEXT: pand %xmm10, %xmm0 1073; SSE2-NEXT: por %xmm5, %xmm0 1074; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1075; SSE2-NEXT: movdqa %xmm0, %xmm7 1076; SSE2-NEXT: pand %xmm5, %xmm7 1077; SSE2-NEXT: psrlw $1, %xmm7 1078; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1079; SSE2-NEXT: pand %xmm11, %xmm7 1080; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1081; SSE2-NEXT: pand %xmm12, %xmm0 1082; SSE2-NEXT: paddb %xmm0, %xmm0 1083; SSE2-NEXT: por %xmm7, %xmm0 1084; SSE2-NEXT: movdqa %xmm1, %xmm7 1085; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] 1086; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,0,1] 1087; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7] 1088; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4] 1089; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 1090; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1091; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1092; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1093; SSE2-NEXT: packuswb %xmm7, %xmm1 1094; SSE2-NEXT: movdqa %xmm1, %xmm4 1095; SSE2-NEXT: pand %xmm2, %xmm4 1096; SSE2-NEXT: psllw $4, %xmm4 1097; SSE2-NEXT: pand %xmm6, %xmm4 1098; SSE2-NEXT: pand %xmm6, %xmm1 1099; SSE2-NEXT: psrlw $4, %xmm1 1100; SSE2-NEXT: pand %xmm2, %xmm1 1101; SSE2-NEXT: por %xmm4, %xmm1 1102; SSE2-NEXT: pand %xmm1, %xmm3 1103; SSE2-NEXT: psllw $2, %xmm3 1104; SSE2-NEXT: pand %xmm8, %xmm3 1105; SSE2-NEXT: pand %xmm9, %xmm1 1106; SSE2-NEXT: psrlw $2, %xmm1 1107; SSE2-NEXT: pand %xmm10, %xmm1 1108; SSE2-NEXT: por %xmm3, %xmm1 1109; SSE2-NEXT: pand %xmm1, %xmm5 1110; SSE2-NEXT: psrlw $1, %xmm5 1111; SSE2-NEXT: pand %xmm11, %xmm5 1112; SSE2-NEXT: pand %xmm12, %xmm1 1113; SSE2-NEXT: paddb %xmm1, %xmm1 1114; SSE2-NEXT: por %xmm5, %xmm1 1115; SSE2-NEXT: retq 1116; 1117; SSSE3-LABEL: test_bitreverse_v4i64: 1118; SSSE3: # %bb.0: 1119; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1120; SSSE3-NEXT: pshufb %xmm4, %xmm0 1121; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1122; SSSE3-NEXT: movdqa %xmm0, %xmm2 1123; SSSE3-NEXT: pand %xmm5, %xmm2 1124; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1125; SSSE3-NEXT: movdqa %xmm6, %xmm7 1126; SSSE3-NEXT: pshufb %xmm2, %xmm7 1127; SSSE3-NEXT: psrlw $4, %xmm0 1128; SSSE3-NEXT: pand %xmm5, %xmm0 1129; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1130; SSSE3-NEXT: movdqa %xmm2, %xmm3 1131; SSSE3-NEXT: pshufb %xmm0, %xmm3 1132; SSSE3-NEXT: por %xmm7, %xmm3 1133; SSSE3-NEXT: pshufb %xmm4, %xmm1 1134; SSSE3-NEXT: movdqa %xmm1, %xmm0 1135; SSSE3-NEXT: pand %xmm5, %xmm0 1136; SSSE3-NEXT: pshufb %xmm0, %xmm6 1137; SSSE3-NEXT: psrlw $4, %xmm1 1138; SSSE3-NEXT: pand %xmm5, %xmm1 1139; SSSE3-NEXT: pshufb %xmm1, %xmm2 1140; SSSE3-NEXT: por %xmm6, %xmm2 1141; SSSE3-NEXT: movdqa %xmm3, %xmm0 1142; SSSE3-NEXT: movdqa %xmm2, %xmm1 1143; SSSE3-NEXT: retq 1144; 1145; AVX1-LABEL: test_bitreverse_v4i64: 1146; AVX1: # %bb.0: 1147; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1148; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1149; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1150; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1151; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1152; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1153; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1154; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1155; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1156; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1157; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1158; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1159; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1160; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1161; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1162; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1163; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1164; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1165; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1166; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1167; AVX1-NEXT: retq 1168; 1169; AVX2-LABEL: test_bitreverse_v4i64: 1170; AVX2: # %bb.0: 1171; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1172; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1173; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1174; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1175; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1176; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1177; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1178; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1179; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1180; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1181; AVX2-NEXT: retq 1182; 1183; AVX512-LABEL: test_bitreverse_v4i64: 1184; AVX512: # %bb.0: 1185; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1186; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1187; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1188; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1189; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1190; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1191; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1192; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1193; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1194; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1195; AVX512-NEXT: retq 1196; 1197; XOPAVX1-LABEL: test_bitreverse_v4i64: 1198; XOPAVX1: # %bb.0: 1199; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1200; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1201; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1202; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1203; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1204; XOPAVX1-NEXT: retq 1205; 1206; XOPAVX2-LABEL: test_bitreverse_v4i64: 1207; XOPAVX2: # %bb.0: 1208; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1209; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1210; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1211; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1212; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1213; XOPAVX2-NEXT: retq 1214 %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) 1215 ret <4 x i64> %b 1216} 1217 1218define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { 1219; SSE2-LABEL: test_bitreverse_v64i8: 1220; SSE2: # %bb.0: 1221; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1222; SSE2-NEXT: movdqa %xmm0, %xmm5 1223; SSE2-NEXT: pand %xmm13, %xmm5 1224; SSE2-NEXT: psllw $4, %xmm5 1225; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1226; SSE2-NEXT: pand %xmm7, %xmm5 1227; SSE2-NEXT: pand %xmm7, %xmm0 1228; SSE2-NEXT: psrlw $4, %xmm0 1229; SSE2-NEXT: pand %xmm13, %xmm0 1230; SSE2-NEXT: por %xmm5, %xmm0 1231; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1232; SSE2-NEXT: movdqa %xmm0, %xmm6 1233; SSE2-NEXT: pand %xmm5, %xmm6 1234; SSE2-NEXT: psllw $2, %xmm6 1235; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 1236; SSE2-NEXT: pand %xmm8, %xmm6 1237; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1238; SSE2-NEXT: pand %xmm9, %xmm0 1239; SSE2-NEXT: psrlw $2, %xmm0 1240; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 1241; SSE2-NEXT: pand %xmm10, %xmm0 1242; SSE2-NEXT: por %xmm6, %xmm0 1243; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1244; SSE2-NEXT: movdqa %xmm0, %xmm4 1245; SSE2-NEXT: pand %xmm6, %xmm4 1246; SSE2-NEXT: psrlw $1, %xmm4 1247; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1248; SSE2-NEXT: pand %xmm11, %xmm4 1249; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1250; SSE2-NEXT: pand %xmm12, %xmm0 1251; SSE2-NEXT: paddb %xmm0, %xmm0 1252; SSE2-NEXT: por %xmm4, %xmm0 1253; SSE2-NEXT: movdqa %xmm1, %xmm4 1254; SSE2-NEXT: pand %xmm13, %xmm4 1255; SSE2-NEXT: psllw $4, %xmm4 1256; SSE2-NEXT: pand %xmm7, %xmm4 1257; SSE2-NEXT: pand %xmm7, %xmm1 1258; SSE2-NEXT: psrlw $4, %xmm1 1259; SSE2-NEXT: pand %xmm13, %xmm1 1260; SSE2-NEXT: por %xmm4, %xmm1 1261; SSE2-NEXT: movdqa %xmm1, %xmm4 1262; SSE2-NEXT: pand %xmm5, %xmm4 1263; SSE2-NEXT: psllw $2, %xmm4 1264; SSE2-NEXT: pand %xmm8, %xmm4 1265; SSE2-NEXT: pand %xmm9, %xmm1 1266; SSE2-NEXT: psrlw $2, %xmm1 1267; SSE2-NEXT: pand %xmm10, %xmm1 1268; SSE2-NEXT: por %xmm4, %xmm1 1269; SSE2-NEXT: movdqa %xmm1, %xmm4 1270; SSE2-NEXT: pand %xmm6, %xmm4 1271; SSE2-NEXT: psrlw $1, %xmm4 1272; SSE2-NEXT: pand %xmm11, %xmm4 1273; SSE2-NEXT: pand %xmm12, %xmm1 1274; SSE2-NEXT: paddb %xmm1, %xmm1 1275; SSE2-NEXT: por %xmm4, %xmm1 1276; SSE2-NEXT: movdqa %xmm2, %xmm4 1277; SSE2-NEXT: pand %xmm13, %xmm4 1278; SSE2-NEXT: psllw $4, %xmm4 1279; SSE2-NEXT: pand %xmm7, %xmm4 1280; SSE2-NEXT: pand %xmm7, %xmm2 1281; SSE2-NEXT: psrlw $4, %xmm2 1282; SSE2-NEXT: pand %xmm13, %xmm2 1283; SSE2-NEXT: por %xmm4, %xmm2 1284; SSE2-NEXT: movdqa %xmm2, %xmm4 1285; SSE2-NEXT: pand %xmm5, %xmm4 1286; SSE2-NEXT: psllw $2, %xmm4 1287; SSE2-NEXT: pand %xmm8, %xmm4 1288; SSE2-NEXT: pand %xmm9, %xmm2 1289; SSE2-NEXT: psrlw $2, %xmm2 1290; SSE2-NEXT: pand %xmm10, %xmm2 1291; SSE2-NEXT: por %xmm4, %xmm2 1292; SSE2-NEXT: movdqa %xmm2, %xmm4 1293; SSE2-NEXT: pand %xmm6, %xmm4 1294; SSE2-NEXT: psrlw $1, %xmm4 1295; SSE2-NEXT: pand %xmm11, %xmm4 1296; SSE2-NEXT: pand %xmm12, %xmm2 1297; SSE2-NEXT: paddb %xmm2, %xmm2 1298; SSE2-NEXT: por %xmm4, %xmm2 1299; SSE2-NEXT: movdqa %xmm3, %xmm4 1300; SSE2-NEXT: pand %xmm13, %xmm4 1301; SSE2-NEXT: psllw $4, %xmm4 1302; SSE2-NEXT: pand %xmm7, %xmm4 1303; SSE2-NEXT: pand %xmm7, %xmm3 1304; SSE2-NEXT: psrlw $4, %xmm3 1305; SSE2-NEXT: pand %xmm13, %xmm3 1306; SSE2-NEXT: por %xmm4, %xmm3 1307; SSE2-NEXT: pand %xmm3, %xmm5 1308; SSE2-NEXT: psllw $2, %xmm5 1309; SSE2-NEXT: pand %xmm8, %xmm5 1310; SSE2-NEXT: pand %xmm9, %xmm3 1311; SSE2-NEXT: psrlw $2, %xmm3 1312; SSE2-NEXT: pand %xmm10, %xmm3 1313; SSE2-NEXT: por %xmm5, %xmm3 1314; SSE2-NEXT: pand %xmm3, %xmm6 1315; SSE2-NEXT: psrlw $1, %xmm6 1316; SSE2-NEXT: pand %xmm11, %xmm6 1317; SSE2-NEXT: pand %xmm12, %xmm3 1318; SSE2-NEXT: paddb %xmm3, %xmm3 1319; SSE2-NEXT: por %xmm6, %xmm3 1320; SSE2-NEXT: retq 1321; 1322; SSSE3-LABEL: test_bitreverse_v64i8: 1323; SSSE3: # %bb.0: 1324; SSSE3-NEXT: movdqa %xmm0, %xmm5 1325; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1326; SSSE3-NEXT: pand %xmm8, %xmm0 1327; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1328; SSSE3-NEXT: movdqa %xmm9, %xmm6 1329; SSSE3-NEXT: pshufb %xmm0, %xmm6 1330; SSSE3-NEXT: psrlw $4, %xmm5 1331; SSSE3-NEXT: pand %xmm8, %xmm5 1332; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1333; SSSE3-NEXT: movdqa %xmm4, %xmm0 1334; SSSE3-NEXT: pshufb %xmm5, %xmm0 1335; SSSE3-NEXT: por %xmm6, %xmm0 1336; SSSE3-NEXT: movdqa %xmm1, %xmm5 1337; SSSE3-NEXT: pand %xmm8, %xmm5 1338; SSSE3-NEXT: movdqa %xmm9, %xmm6 1339; SSSE3-NEXT: pshufb %xmm5, %xmm6 1340; SSSE3-NEXT: psrlw $4, %xmm1 1341; SSSE3-NEXT: pand %xmm8, %xmm1 1342; SSSE3-NEXT: movdqa %xmm4, %xmm5 1343; SSSE3-NEXT: pshufb %xmm1, %xmm5 1344; SSSE3-NEXT: por %xmm6, %xmm5 1345; SSSE3-NEXT: movdqa %xmm2, %xmm1 1346; SSSE3-NEXT: pand %xmm8, %xmm1 1347; SSSE3-NEXT: movdqa %xmm9, %xmm7 1348; SSSE3-NEXT: pshufb %xmm1, %xmm7 1349; SSSE3-NEXT: psrlw $4, %xmm2 1350; SSSE3-NEXT: pand %xmm8, %xmm2 1351; SSSE3-NEXT: movdqa %xmm4, %xmm6 1352; SSSE3-NEXT: pshufb %xmm2, %xmm6 1353; SSSE3-NEXT: por %xmm7, %xmm6 1354; SSSE3-NEXT: movdqa %xmm3, %xmm1 1355; SSSE3-NEXT: pand %xmm8, %xmm1 1356; SSSE3-NEXT: pshufb %xmm1, %xmm9 1357; SSSE3-NEXT: psrlw $4, %xmm3 1358; SSSE3-NEXT: pand %xmm8, %xmm3 1359; SSSE3-NEXT: pshufb %xmm3, %xmm4 1360; SSSE3-NEXT: por %xmm9, %xmm4 1361; SSSE3-NEXT: movdqa %xmm5, %xmm1 1362; SSSE3-NEXT: movdqa %xmm6, %xmm2 1363; SSSE3-NEXT: movdqa %xmm4, %xmm3 1364; SSSE3-NEXT: retq 1365; 1366; AVX1-LABEL: test_bitreverse_v64i8: 1367; AVX1: # %bb.0: 1368; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1369; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1370; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 1371; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1372; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1373; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1374; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1375; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1376; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1377; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1378; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm4 1379; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1380; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1381; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1382; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1383; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 1384; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1385; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1386; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 1387; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1388; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1389; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1390; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1391; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1392; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1393; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1394; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1395; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1396; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1397; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1398; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1399; AVX1-NEXT: retq 1400; 1401; AVX2-LABEL: test_bitreverse_v64i8: 1402; AVX2: # %bb.0: 1403; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1404; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 1405; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1406; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1407; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1408; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1409; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1410; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 1411; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 1412; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 1413; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1414; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 1415; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1416; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1 1417; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 1418; AVX2-NEXT: retq 1419; 1420; AVX512F-LABEL: test_bitreverse_v64i8: 1421; AVX512F: # %bb.0: 1422; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1423; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3 1424; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1425; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1426; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 1427; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 1428; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1429; AVX512F-NEXT: vpshufb %ymm0, %ymm5, %ymm0 1430; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 1431; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 1432; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1433; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 1434; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 1435; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1 1436; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 1437; AVX512F-NEXT: retq 1438; 1439; AVX512BW-LABEL: test_bitreverse_v64i8: 1440; AVX512BW: # %bb.0: 1441; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1442; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 1443; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1444; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 1445; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 1446; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 1447; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1448; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 1449; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 1450; AVX512BW-NEXT: retq 1451; 1452; XOPAVX1-LABEL: test_bitreverse_v64i8: 1453; XOPAVX1: # %bb.0: 1454; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1455; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1456; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1457; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1458; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1459; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1460; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1461; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1462; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1463; XOPAVX1-NEXT: retq 1464; 1465; XOPAVX2-LABEL: test_bitreverse_v64i8: 1466; XOPAVX2: # %bb.0: 1467; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1468; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1469; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1470; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1471; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1472; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1473; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1474; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1475; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 1476; XOPAVX2-NEXT: retq 1477 %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) 1478 ret <64 x i8> %b 1479} 1480 1481define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { 1482; SSE2-LABEL: test_bitreverse_v32i16: 1483; SSE2: # %bb.0: 1484; SSE2-NEXT: pxor %xmm14, %xmm14 1485; SSE2-NEXT: movdqa %xmm0, %xmm4 1486; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] 1487; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] 1488; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6] 1489; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] 1490; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 1491; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] 1492; SSE2-NEXT: packuswb %xmm4, %xmm0 1493; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1494; SSE2-NEXT: movdqa %xmm0, %xmm5 1495; SSE2-NEXT: pand %xmm8, %xmm5 1496; SSE2-NEXT: psllw $4, %xmm5 1497; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1498; SSE2-NEXT: pand %xmm4, %xmm5 1499; SSE2-NEXT: pand %xmm4, %xmm0 1500; SSE2-NEXT: psrlw $4, %xmm0 1501; SSE2-NEXT: pand %xmm8, %xmm0 1502; SSE2-NEXT: por %xmm5, %xmm0 1503; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1504; SSE2-NEXT: movdqa %xmm0, %xmm7 1505; SSE2-NEXT: pand %xmm5, %xmm7 1506; SSE2-NEXT: psllw $2, %xmm7 1507; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 1508; SSE2-NEXT: pand %xmm9, %xmm7 1509; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1510; SSE2-NEXT: pand %xmm10, %xmm0 1511; SSE2-NEXT: psrlw $2, %xmm0 1512; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 1513; SSE2-NEXT: pand %xmm11, %xmm0 1514; SSE2-NEXT: por %xmm7, %xmm0 1515; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1516; SSE2-NEXT: movdqa %xmm0, %xmm6 1517; SSE2-NEXT: pand %xmm7, %xmm6 1518; SSE2-NEXT: psrlw $1, %xmm6 1519; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1520; SSE2-NEXT: pand %xmm12, %xmm6 1521; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1522; SSE2-NEXT: pand %xmm13, %xmm0 1523; SSE2-NEXT: paddb %xmm0, %xmm0 1524; SSE2-NEXT: por %xmm6, %xmm0 1525; SSE2-NEXT: movdqa %xmm1, %xmm6 1526; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1527; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] 1528; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6] 1529; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] 1530; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 1531; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] 1532; SSE2-NEXT: packuswb %xmm6, %xmm1 1533; SSE2-NEXT: movdqa %xmm1, %xmm6 1534; SSE2-NEXT: pand %xmm8, %xmm6 1535; SSE2-NEXT: psllw $4, %xmm6 1536; SSE2-NEXT: pand %xmm4, %xmm6 1537; SSE2-NEXT: pand %xmm4, %xmm1 1538; SSE2-NEXT: psrlw $4, %xmm1 1539; SSE2-NEXT: pand %xmm8, %xmm1 1540; SSE2-NEXT: por %xmm6, %xmm1 1541; SSE2-NEXT: movdqa %xmm1, %xmm6 1542; SSE2-NEXT: pand %xmm5, %xmm6 1543; SSE2-NEXT: psllw $2, %xmm6 1544; SSE2-NEXT: pand %xmm9, %xmm6 1545; SSE2-NEXT: pand %xmm10, %xmm1 1546; SSE2-NEXT: psrlw $2, %xmm1 1547; SSE2-NEXT: pand %xmm11, %xmm1 1548; SSE2-NEXT: por %xmm6, %xmm1 1549; SSE2-NEXT: movdqa %xmm1, %xmm6 1550; SSE2-NEXT: pand %xmm7, %xmm6 1551; SSE2-NEXT: psrlw $1, %xmm6 1552; SSE2-NEXT: pand %xmm12, %xmm6 1553; SSE2-NEXT: pand %xmm13, %xmm1 1554; SSE2-NEXT: paddb %xmm1, %xmm1 1555; SSE2-NEXT: por %xmm6, %xmm1 1556; SSE2-NEXT: movdqa %xmm2, %xmm6 1557; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1558; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] 1559; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6] 1560; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] 1561; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] 1562; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6] 1563; SSE2-NEXT: packuswb %xmm6, %xmm2 1564; SSE2-NEXT: movdqa %xmm2, %xmm6 1565; SSE2-NEXT: pand %xmm8, %xmm6 1566; SSE2-NEXT: psllw $4, %xmm6 1567; SSE2-NEXT: pand %xmm4, %xmm6 1568; SSE2-NEXT: pand %xmm4, %xmm2 1569; SSE2-NEXT: psrlw $4, %xmm2 1570; SSE2-NEXT: pand %xmm8, %xmm2 1571; SSE2-NEXT: por %xmm6, %xmm2 1572; SSE2-NEXT: movdqa %xmm2, %xmm6 1573; SSE2-NEXT: pand %xmm5, %xmm6 1574; SSE2-NEXT: psllw $2, %xmm6 1575; SSE2-NEXT: pand %xmm9, %xmm6 1576; SSE2-NEXT: pand %xmm10, %xmm2 1577; SSE2-NEXT: psrlw $2, %xmm2 1578; SSE2-NEXT: pand %xmm11, %xmm2 1579; SSE2-NEXT: por %xmm6, %xmm2 1580; SSE2-NEXT: movdqa %xmm2, %xmm6 1581; SSE2-NEXT: pand %xmm7, %xmm6 1582; SSE2-NEXT: psrlw $1, %xmm6 1583; SSE2-NEXT: pand %xmm12, %xmm6 1584; SSE2-NEXT: pand %xmm13, %xmm2 1585; SSE2-NEXT: paddb %xmm2, %xmm2 1586; SSE2-NEXT: por %xmm6, %xmm2 1587; SSE2-NEXT: movdqa %xmm3, %xmm6 1588; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1589; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] 1590; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6] 1591; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] 1592; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] 1593; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6] 1594; SSE2-NEXT: packuswb %xmm6, %xmm3 1595; SSE2-NEXT: movdqa %xmm3, %xmm6 1596; SSE2-NEXT: pand %xmm8, %xmm6 1597; SSE2-NEXT: psllw $4, %xmm6 1598; SSE2-NEXT: pand %xmm4, %xmm6 1599; SSE2-NEXT: pand %xmm4, %xmm3 1600; SSE2-NEXT: psrlw $4, %xmm3 1601; SSE2-NEXT: pand %xmm8, %xmm3 1602; SSE2-NEXT: por %xmm6, %xmm3 1603; SSE2-NEXT: pand %xmm3, %xmm5 1604; SSE2-NEXT: psllw $2, %xmm5 1605; SSE2-NEXT: pand %xmm9, %xmm5 1606; SSE2-NEXT: pand %xmm10, %xmm3 1607; SSE2-NEXT: psrlw $2, %xmm3 1608; SSE2-NEXT: pand %xmm11, %xmm3 1609; SSE2-NEXT: por %xmm5, %xmm3 1610; SSE2-NEXT: pand %xmm3, %xmm7 1611; SSE2-NEXT: psrlw $1, %xmm7 1612; SSE2-NEXT: pand %xmm12, %xmm7 1613; SSE2-NEXT: pand %xmm13, %xmm3 1614; SSE2-NEXT: paddb %xmm3, %xmm3 1615; SSE2-NEXT: por %xmm7, %xmm3 1616; SSE2-NEXT: retq 1617; 1618; SSSE3-LABEL: test_bitreverse_v32i16: 1619; SSSE3: # %bb.0: 1620; SSSE3-NEXT: movdqa %xmm1, %xmm5 1621; SSSE3-NEXT: movdqa %xmm0, %xmm1 1622; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1623; SSSE3-NEXT: pshufb %xmm8, %xmm1 1624; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1625; SSSE3-NEXT: movdqa %xmm1, %xmm0 1626; SSSE3-NEXT: pand %xmm9, %xmm0 1627; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1628; SSSE3-NEXT: movdqa %xmm7, %xmm6 1629; SSSE3-NEXT: pshufb %xmm0, %xmm6 1630; SSSE3-NEXT: psrlw $4, %xmm1 1631; SSSE3-NEXT: pand %xmm9, %xmm1 1632; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1633; SSSE3-NEXT: movdqa %xmm4, %xmm0 1634; SSSE3-NEXT: pshufb %xmm1, %xmm0 1635; SSSE3-NEXT: por %xmm6, %xmm0 1636; SSSE3-NEXT: pshufb %xmm8, %xmm5 1637; SSSE3-NEXT: movdqa %xmm5, %xmm1 1638; SSSE3-NEXT: pand %xmm9, %xmm1 1639; SSSE3-NEXT: movdqa %xmm7, %xmm6 1640; SSSE3-NEXT: pshufb %xmm1, %xmm6 1641; SSSE3-NEXT: psrlw $4, %xmm5 1642; SSSE3-NEXT: pand %xmm9, %xmm5 1643; SSSE3-NEXT: movdqa %xmm4, %xmm1 1644; SSSE3-NEXT: pshufb %xmm5, %xmm1 1645; SSSE3-NEXT: por %xmm6, %xmm1 1646; SSSE3-NEXT: pshufb %xmm8, %xmm2 1647; SSSE3-NEXT: movdqa %xmm2, %xmm5 1648; SSSE3-NEXT: pand %xmm9, %xmm5 1649; SSSE3-NEXT: movdqa %xmm7, %xmm6 1650; SSSE3-NEXT: pshufb %xmm5, %xmm6 1651; SSSE3-NEXT: psrlw $4, %xmm2 1652; SSSE3-NEXT: pand %xmm9, %xmm2 1653; SSSE3-NEXT: movdqa %xmm4, %xmm5 1654; SSSE3-NEXT: pshufb %xmm2, %xmm5 1655; SSSE3-NEXT: por %xmm6, %xmm5 1656; SSSE3-NEXT: pshufb %xmm8, %xmm3 1657; SSSE3-NEXT: movdqa %xmm3, %xmm2 1658; SSSE3-NEXT: pand %xmm9, %xmm2 1659; SSSE3-NEXT: pshufb %xmm2, %xmm7 1660; SSSE3-NEXT: psrlw $4, %xmm3 1661; SSSE3-NEXT: pand %xmm9, %xmm3 1662; SSSE3-NEXT: pshufb %xmm3, %xmm4 1663; SSSE3-NEXT: por %xmm7, %xmm4 1664; SSSE3-NEXT: movdqa %xmm5, %xmm2 1665; SSSE3-NEXT: movdqa %xmm4, %xmm3 1666; SSSE3-NEXT: retq 1667; 1668; AVX1-LABEL: test_bitreverse_v32i16: 1669; AVX1: # %bb.0: 1670; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1671; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1672; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1673; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1674; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1675; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1676; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1677; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1678; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1679; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1680; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1681; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1682; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1683; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 1684; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1685; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1686; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1687; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 1688; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 1689; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1690; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1691; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1692; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1693; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1694; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1695; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1696; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1697; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1698; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1699; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 1700; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 1701; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1702; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 1703; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 1704; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 1705; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1706; AVX1-NEXT: retq 1707; 1708; AVX2-LABEL: test_bitreverse_v32i16: 1709; AVX2: # %bb.0: 1710; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1711; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1712; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1713; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 1714; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1715; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1716; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1717; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 1718; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1719; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 1720; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 1721; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1722; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 1723; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 1724; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 1725; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 1726; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 1727; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 1728; AVX2-NEXT: retq 1729; 1730; AVX512F-LABEL: test_bitreverse_v32i16: 1731; AVX512F: # %bb.0: 1732; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1733; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1734; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1735; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm4 1736; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1737; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1738; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 1739; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 1740; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1741; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0 1742; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 1743; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1744; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm2 1745; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 1746; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 1747; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 1748; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1 1749; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 1750; AVX512F-NEXT: retq 1751; 1752; AVX512BW-LABEL: test_bitreverse_v32i16: 1753; AVX512BW: # %bb.0: 1754; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 1755; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1756; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 1757; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1758; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 1759; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 1760; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 1761; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1762; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 1763; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 1764; AVX512BW-NEXT: retq 1765; 1766; XOPAVX1-LABEL: test_bitreverse_v32i16: 1767; XOPAVX1: # %bb.0: 1768; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1769; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1770; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1771; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1772; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1773; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1774; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1775; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1776; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1777; XOPAVX1-NEXT: retq 1778; 1779; XOPAVX2-LABEL: test_bitreverse_v32i16: 1780; XOPAVX2: # %bb.0: 1781; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1782; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1783; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1784; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1785; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1786; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1787; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1788; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1789; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 1790; XOPAVX2-NEXT: retq 1791 %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) 1792 ret <32 x i16> %b 1793} 1794 1795define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { 1796; SSE2-LABEL: test_bitreverse_v16i32: 1797; SSE2: # %bb.0: 1798; SSE2-NEXT: pxor %xmm14, %xmm14 1799; SSE2-NEXT: movdqa %xmm0, %xmm4 1800; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] 1801; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 1802; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 1803; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] 1804; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1805; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1806; SSE2-NEXT: packuswb %xmm4, %xmm0 1807; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1808; SSE2-NEXT: movdqa %xmm0, %xmm5 1809; SSE2-NEXT: pand %xmm8, %xmm5 1810; SSE2-NEXT: psllw $4, %xmm5 1811; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1812; SSE2-NEXT: pand %xmm4, %xmm5 1813; SSE2-NEXT: pand %xmm4, %xmm0 1814; SSE2-NEXT: psrlw $4, %xmm0 1815; SSE2-NEXT: pand %xmm8, %xmm0 1816; SSE2-NEXT: por %xmm5, %xmm0 1817; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1818; SSE2-NEXT: movdqa %xmm0, %xmm7 1819; SSE2-NEXT: pand %xmm5, %xmm7 1820; SSE2-NEXT: psllw $2, %xmm7 1821; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 1822; SSE2-NEXT: pand %xmm9, %xmm7 1823; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1824; SSE2-NEXT: pand %xmm10, %xmm0 1825; SSE2-NEXT: psrlw $2, %xmm0 1826; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 1827; SSE2-NEXT: pand %xmm11, %xmm0 1828; SSE2-NEXT: por %xmm7, %xmm0 1829; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1830; SSE2-NEXT: movdqa %xmm0, %xmm6 1831; SSE2-NEXT: pand %xmm7, %xmm6 1832; SSE2-NEXT: psrlw $1, %xmm6 1833; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1834; SSE2-NEXT: pand %xmm12, %xmm6 1835; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1836; SSE2-NEXT: pand %xmm13, %xmm0 1837; SSE2-NEXT: paddb %xmm0, %xmm0 1838; SSE2-NEXT: por %xmm6, %xmm0 1839; SSE2-NEXT: movdqa %xmm1, %xmm6 1840; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1841; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1842; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1843; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] 1844; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1845; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1846; SSE2-NEXT: packuswb %xmm6, %xmm1 1847; SSE2-NEXT: movdqa %xmm1, %xmm6 1848; SSE2-NEXT: pand %xmm8, %xmm6 1849; SSE2-NEXT: psllw $4, %xmm6 1850; SSE2-NEXT: pand %xmm4, %xmm6 1851; SSE2-NEXT: pand %xmm4, %xmm1 1852; SSE2-NEXT: psrlw $4, %xmm1 1853; SSE2-NEXT: pand %xmm8, %xmm1 1854; SSE2-NEXT: por %xmm6, %xmm1 1855; SSE2-NEXT: movdqa %xmm1, %xmm6 1856; SSE2-NEXT: pand %xmm5, %xmm6 1857; SSE2-NEXT: psllw $2, %xmm6 1858; SSE2-NEXT: pand %xmm9, %xmm6 1859; SSE2-NEXT: pand %xmm10, %xmm1 1860; SSE2-NEXT: psrlw $2, %xmm1 1861; SSE2-NEXT: pand %xmm11, %xmm1 1862; SSE2-NEXT: por %xmm6, %xmm1 1863; SSE2-NEXT: movdqa %xmm1, %xmm6 1864; SSE2-NEXT: pand %xmm7, %xmm6 1865; SSE2-NEXT: psrlw $1, %xmm6 1866; SSE2-NEXT: pand %xmm12, %xmm6 1867; SSE2-NEXT: pand %xmm13, %xmm1 1868; SSE2-NEXT: paddb %xmm1, %xmm1 1869; SSE2-NEXT: por %xmm6, %xmm1 1870; SSE2-NEXT: movdqa %xmm2, %xmm6 1871; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1872; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1873; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1874; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] 1875; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1876; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1877; SSE2-NEXT: packuswb %xmm6, %xmm2 1878; SSE2-NEXT: movdqa %xmm2, %xmm6 1879; SSE2-NEXT: pand %xmm8, %xmm6 1880; SSE2-NEXT: psllw $4, %xmm6 1881; SSE2-NEXT: pand %xmm4, %xmm6 1882; SSE2-NEXT: pand %xmm4, %xmm2 1883; SSE2-NEXT: psrlw $4, %xmm2 1884; SSE2-NEXT: pand %xmm8, %xmm2 1885; SSE2-NEXT: por %xmm6, %xmm2 1886; SSE2-NEXT: movdqa %xmm2, %xmm6 1887; SSE2-NEXT: pand %xmm5, %xmm6 1888; SSE2-NEXT: psllw $2, %xmm6 1889; SSE2-NEXT: pand %xmm9, %xmm6 1890; SSE2-NEXT: pand %xmm10, %xmm2 1891; SSE2-NEXT: psrlw $2, %xmm2 1892; SSE2-NEXT: pand %xmm11, %xmm2 1893; SSE2-NEXT: por %xmm6, %xmm2 1894; SSE2-NEXT: movdqa %xmm2, %xmm6 1895; SSE2-NEXT: pand %xmm7, %xmm6 1896; SSE2-NEXT: psrlw $1, %xmm6 1897; SSE2-NEXT: pand %xmm12, %xmm6 1898; SSE2-NEXT: pand %xmm13, %xmm2 1899; SSE2-NEXT: paddb %xmm2, %xmm2 1900; SSE2-NEXT: por %xmm6, %xmm2 1901; SSE2-NEXT: movdqa %xmm3, %xmm6 1902; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1903; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1904; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1905; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] 1906; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 1907; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 1908; SSE2-NEXT: packuswb %xmm6, %xmm3 1909; SSE2-NEXT: movdqa %xmm3, %xmm6 1910; SSE2-NEXT: pand %xmm8, %xmm6 1911; SSE2-NEXT: psllw $4, %xmm6 1912; SSE2-NEXT: pand %xmm4, %xmm6 1913; SSE2-NEXT: pand %xmm4, %xmm3 1914; SSE2-NEXT: psrlw $4, %xmm3 1915; SSE2-NEXT: pand %xmm8, %xmm3 1916; SSE2-NEXT: por %xmm6, %xmm3 1917; SSE2-NEXT: pand %xmm3, %xmm5 1918; SSE2-NEXT: psllw $2, %xmm5 1919; SSE2-NEXT: pand %xmm9, %xmm5 1920; SSE2-NEXT: pand %xmm10, %xmm3 1921; SSE2-NEXT: psrlw $2, %xmm3 1922; SSE2-NEXT: pand %xmm11, %xmm3 1923; SSE2-NEXT: por %xmm5, %xmm3 1924; SSE2-NEXT: pand %xmm3, %xmm7 1925; SSE2-NEXT: psrlw $1, %xmm7 1926; SSE2-NEXT: pand %xmm12, %xmm7 1927; SSE2-NEXT: pand %xmm13, %xmm3 1928; SSE2-NEXT: paddb %xmm3, %xmm3 1929; SSE2-NEXT: por %xmm7, %xmm3 1930; SSE2-NEXT: retq 1931; 1932; SSSE3-LABEL: test_bitreverse_v16i32: 1933; SSSE3: # %bb.0: 1934; SSSE3-NEXT: movdqa %xmm1, %xmm5 1935; SSSE3-NEXT: movdqa %xmm0, %xmm1 1936; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1937; SSSE3-NEXT: pshufb %xmm8, %xmm1 1938; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1939; SSSE3-NEXT: movdqa %xmm1, %xmm0 1940; SSSE3-NEXT: pand %xmm9, %xmm0 1941; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1942; SSSE3-NEXT: movdqa %xmm7, %xmm6 1943; SSSE3-NEXT: pshufb %xmm0, %xmm6 1944; SSSE3-NEXT: psrlw $4, %xmm1 1945; SSSE3-NEXT: pand %xmm9, %xmm1 1946; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1947; SSSE3-NEXT: movdqa %xmm4, %xmm0 1948; SSSE3-NEXT: pshufb %xmm1, %xmm0 1949; SSSE3-NEXT: por %xmm6, %xmm0 1950; SSSE3-NEXT: pshufb %xmm8, %xmm5 1951; SSSE3-NEXT: movdqa %xmm5, %xmm1 1952; SSSE3-NEXT: pand %xmm9, %xmm1 1953; SSSE3-NEXT: movdqa %xmm7, %xmm6 1954; SSSE3-NEXT: pshufb %xmm1, %xmm6 1955; SSSE3-NEXT: psrlw $4, %xmm5 1956; SSSE3-NEXT: pand %xmm9, %xmm5 1957; SSSE3-NEXT: movdqa %xmm4, %xmm1 1958; SSSE3-NEXT: pshufb %xmm5, %xmm1 1959; SSSE3-NEXT: por %xmm6, %xmm1 1960; SSSE3-NEXT: pshufb %xmm8, %xmm2 1961; SSSE3-NEXT: movdqa %xmm2, %xmm5 1962; SSSE3-NEXT: pand %xmm9, %xmm5 1963; SSSE3-NEXT: movdqa %xmm7, %xmm6 1964; SSSE3-NEXT: pshufb %xmm5, %xmm6 1965; SSSE3-NEXT: psrlw $4, %xmm2 1966; SSSE3-NEXT: pand %xmm9, %xmm2 1967; SSSE3-NEXT: movdqa %xmm4, %xmm5 1968; SSSE3-NEXT: pshufb %xmm2, %xmm5 1969; SSSE3-NEXT: por %xmm6, %xmm5 1970; SSSE3-NEXT: pshufb %xmm8, %xmm3 1971; SSSE3-NEXT: movdqa %xmm3, %xmm2 1972; SSSE3-NEXT: pand %xmm9, %xmm2 1973; SSSE3-NEXT: pshufb %xmm2, %xmm7 1974; SSSE3-NEXT: psrlw $4, %xmm3 1975; SSSE3-NEXT: pand %xmm9, %xmm3 1976; SSSE3-NEXT: pshufb %xmm3, %xmm4 1977; SSSE3-NEXT: por %xmm7, %xmm4 1978; SSSE3-NEXT: movdqa %xmm5, %xmm2 1979; SSSE3-NEXT: movdqa %xmm4, %xmm3 1980; SSSE3-NEXT: retq 1981; 1982; AVX1-LABEL: test_bitreverse_v16i32: 1983; AVX1: # %bb.0: 1984; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1985; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1986; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1987; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1988; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1989; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1990; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1991; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1992; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1993; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1994; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1995; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1996; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1997; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 1998; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1999; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2000; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2001; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2002; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2003; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2004; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2005; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2006; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2007; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2008; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2009; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2010; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2011; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2012; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2013; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2014; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2015; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2016; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2017; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2018; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2019; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2020; AVX1-NEXT: retq 2021; 2022; AVX2-LABEL: test_bitreverse_v16i32: 2023; AVX2: # %bb.0: 2024; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2025; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2026; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2027; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2028; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2029; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2030; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2031; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2032; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2033; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2034; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2035; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2036; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2037; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2038; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2039; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2040; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2041; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2042; AVX2-NEXT: retq 2043; 2044; AVX512F-LABEL: test_bitreverse_v16i32: 2045; AVX512F: # %bb.0: 2046; AVX512F-NEXT: vpsrld $24, %zmm0, %zmm1 2047; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm2 2048; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 2049; AVX512F-NEXT: vpord %zmm1, %zmm2, %zmm1 2050; AVX512F-NEXT: vpslld $24, %zmm0, %zmm2 2051; AVX512F-NEXT: vpslld $8, %zmm0, %zmm0 2052; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2053; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 2054; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0 2055; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 2056; AVX512F-NEXT: vpslld $4, %zmm1, %zmm1 2057; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2058; AVX512F-NEXT: vpsrld $4, %zmm0, %zmm0 2059; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 2060; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 2061; AVX512F-NEXT: vpslld $2, %zmm1, %zmm1 2062; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2063; AVX512F-NEXT: vpsrld $2, %zmm0, %zmm0 2064; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 2065; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 2066; AVX512F-NEXT: vpslld $1, %zmm1, %zmm1 2067; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2068; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 2069; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 2070; AVX512F-NEXT: retq 2071; 2072; AVX512BW-LABEL: test_bitreverse_v16i32: 2073; AVX512BW: # %bb.0: 2074; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 2075; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2076; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2077; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2078; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2079; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2080; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2081; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2082; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2083; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2084; AVX512BW-NEXT: retq 2085; 2086; XOPAVX1-LABEL: test_bitreverse_v16i32: 2087; XOPAVX1: # %bb.0: 2088; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2089; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2090; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2091; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2092; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2093; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2094; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2095; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2096; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2097; XOPAVX1-NEXT: retq 2098; 2099; XOPAVX2-LABEL: test_bitreverse_v16i32: 2100; XOPAVX2: # %bb.0: 2101; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2102; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2103; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2104; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2105; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2106; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2107; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2108; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2109; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2110; XOPAVX2-NEXT: retq 2111 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) 2112 ret <16 x i32> %b 2113} 2114 2115define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { 2116; SSE2-LABEL: test_bitreverse_v8i64: 2117; SSE2: # %bb.0: 2118; SSE2-NEXT: pxor %xmm14, %xmm14 2119; SSE2-NEXT: movdqa %xmm0, %xmm4 2120; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] 2121; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 2122; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2123; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2124; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] 2125; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2126; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 2127; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 2128; SSE2-NEXT: packuswb %xmm4, %xmm0 2129; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2130; SSE2-NEXT: movdqa %xmm0, %xmm5 2131; SSE2-NEXT: pand %xmm8, %xmm5 2132; SSE2-NEXT: psllw $4, %xmm5 2133; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 2134; SSE2-NEXT: pand %xmm4, %xmm5 2135; SSE2-NEXT: pand %xmm4, %xmm0 2136; SSE2-NEXT: psrlw $4, %xmm0 2137; SSE2-NEXT: pand %xmm8, %xmm0 2138; SSE2-NEXT: por %xmm5, %xmm0 2139; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2140; SSE2-NEXT: movdqa %xmm0, %xmm7 2141; SSE2-NEXT: pand %xmm5, %xmm7 2142; SSE2-NEXT: psllw $2, %xmm7 2143; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 2144; SSE2-NEXT: pand %xmm9, %xmm7 2145; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 2146; SSE2-NEXT: pand %xmm10, %xmm0 2147; SSE2-NEXT: psrlw $2, %xmm0 2148; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 2149; SSE2-NEXT: pand %xmm11, %xmm0 2150; SSE2-NEXT: por %xmm7, %xmm0 2151; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 2152; SSE2-NEXT: movdqa %xmm0, %xmm6 2153; SSE2-NEXT: pand %xmm7, %xmm6 2154; SSE2-NEXT: psrlw $1, %xmm6 2155; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 2156; SSE2-NEXT: pand %xmm12, %xmm6 2157; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2158; SSE2-NEXT: pand %xmm13, %xmm0 2159; SSE2-NEXT: paddb %xmm0, %xmm0 2160; SSE2-NEXT: por %xmm6, %xmm0 2161; SSE2-NEXT: movdqa %xmm1, %xmm6 2162; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 2163; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] 2164; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 2165; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 2166; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] 2167; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2168; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2169; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2170; SSE2-NEXT: packuswb %xmm6, %xmm1 2171; SSE2-NEXT: movdqa %xmm1, %xmm6 2172; SSE2-NEXT: pand %xmm8, %xmm6 2173; SSE2-NEXT: psllw $4, %xmm6 2174; SSE2-NEXT: pand %xmm4, %xmm6 2175; SSE2-NEXT: pand %xmm4, %xmm1 2176; SSE2-NEXT: psrlw $4, %xmm1 2177; SSE2-NEXT: pand %xmm8, %xmm1 2178; SSE2-NEXT: por %xmm6, %xmm1 2179; SSE2-NEXT: movdqa %xmm1, %xmm6 2180; SSE2-NEXT: pand %xmm5, %xmm6 2181; SSE2-NEXT: psllw $2, %xmm6 2182; SSE2-NEXT: pand %xmm9, %xmm6 2183; SSE2-NEXT: pand %xmm10, %xmm1 2184; SSE2-NEXT: psrlw $2, %xmm1 2185; SSE2-NEXT: pand %xmm11, %xmm1 2186; SSE2-NEXT: por %xmm6, %xmm1 2187; SSE2-NEXT: movdqa %xmm1, %xmm6 2188; SSE2-NEXT: pand %xmm7, %xmm6 2189; SSE2-NEXT: psrlw $1, %xmm6 2190; SSE2-NEXT: pand %xmm12, %xmm6 2191; SSE2-NEXT: pand %xmm13, %xmm1 2192; SSE2-NEXT: paddb %xmm1, %xmm1 2193; SSE2-NEXT: por %xmm6, %xmm1 2194; SSE2-NEXT: movdqa %xmm2, %xmm6 2195; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 2196; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] 2197; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 2198; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 2199; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] 2200; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 2201; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 2202; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 2203; SSE2-NEXT: packuswb %xmm6, %xmm2 2204; SSE2-NEXT: movdqa %xmm2, %xmm6 2205; SSE2-NEXT: pand %xmm8, %xmm6 2206; SSE2-NEXT: psllw $4, %xmm6 2207; SSE2-NEXT: pand %xmm4, %xmm6 2208; SSE2-NEXT: pand %xmm4, %xmm2 2209; SSE2-NEXT: psrlw $4, %xmm2 2210; SSE2-NEXT: pand %xmm8, %xmm2 2211; SSE2-NEXT: por %xmm6, %xmm2 2212; SSE2-NEXT: movdqa %xmm2, %xmm6 2213; SSE2-NEXT: pand %xmm5, %xmm6 2214; SSE2-NEXT: psllw $2, %xmm6 2215; SSE2-NEXT: pand %xmm9, %xmm6 2216; SSE2-NEXT: pand %xmm10, %xmm2 2217; SSE2-NEXT: psrlw $2, %xmm2 2218; SSE2-NEXT: pand %xmm11, %xmm2 2219; SSE2-NEXT: por %xmm6, %xmm2 2220; SSE2-NEXT: movdqa %xmm2, %xmm6 2221; SSE2-NEXT: pand %xmm7, %xmm6 2222; SSE2-NEXT: psrlw $1, %xmm6 2223; SSE2-NEXT: pand %xmm12, %xmm6 2224; SSE2-NEXT: pand %xmm13, %xmm2 2225; SSE2-NEXT: paddb %xmm2, %xmm2 2226; SSE2-NEXT: por %xmm6, %xmm2 2227; SSE2-NEXT: movdqa %xmm3, %xmm6 2228; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 2229; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] 2230; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 2231; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 2232; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] 2233; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 2234; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 2235; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 2236; SSE2-NEXT: packuswb %xmm6, %xmm3 2237; SSE2-NEXT: movdqa %xmm3, %xmm6 2238; SSE2-NEXT: pand %xmm8, %xmm6 2239; SSE2-NEXT: psllw $4, %xmm6 2240; SSE2-NEXT: pand %xmm4, %xmm6 2241; SSE2-NEXT: pand %xmm4, %xmm3 2242; SSE2-NEXT: psrlw $4, %xmm3 2243; SSE2-NEXT: pand %xmm8, %xmm3 2244; SSE2-NEXT: por %xmm6, %xmm3 2245; SSE2-NEXT: pand %xmm3, %xmm5 2246; SSE2-NEXT: psllw $2, %xmm5 2247; SSE2-NEXT: pand %xmm9, %xmm5 2248; SSE2-NEXT: pand %xmm10, %xmm3 2249; SSE2-NEXT: psrlw $2, %xmm3 2250; SSE2-NEXT: pand %xmm11, %xmm3 2251; SSE2-NEXT: por %xmm5, %xmm3 2252; SSE2-NEXT: pand %xmm3, %xmm7 2253; SSE2-NEXT: psrlw $1, %xmm7 2254; SSE2-NEXT: pand %xmm12, %xmm7 2255; SSE2-NEXT: pand %xmm13, %xmm3 2256; SSE2-NEXT: paddb %xmm3, %xmm3 2257; SSE2-NEXT: por %xmm7, %xmm3 2258; SSE2-NEXT: retq 2259; 2260; SSSE3-LABEL: test_bitreverse_v8i64: 2261; SSSE3: # %bb.0: 2262; SSSE3-NEXT: movdqa %xmm1, %xmm5 2263; SSSE3-NEXT: movdqa %xmm0, %xmm1 2264; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2265; SSSE3-NEXT: pshufb %xmm8, %xmm1 2266; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2267; SSSE3-NEXT: movdqa %xmm1, %xmm0 2268; SSSE3-NEXT: pand %xmm9, %xmm0 2269; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2270; SSSE3-NEXT: movdqa %xmm7, %xmm6 2271; SSSE3-NEXT: pshufb %xmm0, %xmm6 2272; SSSE3-NEXT: psrlw $4, %xmm1 2273; SSSE3-NEXT: pand %xmm9, %xmm1 2274; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2275; SSSE3-NEXT: movdqa %xmm4, %xmm0 2276; SSSE3-NEXT: pshufb %xmm1, %xmm0 2277; SSSE3-NEXT: por %xmm6, %xmm0 2278; SSSE3-NEXT: pshufb %xmm8, %xmm5 2279; SSSE3-NEXT: movdqa %xmm5, %xmm1 2280; SSSE3-NEXT: pand %xmm9, %xmm1 2281; SSSE3-NEXT: movdqa %xmm7, %xmm6 2282; SSSE3-NEXT: pshufb %xmm1, %xmm6 2283; SSSE3-NEXT: psrlw $4, %xmm5 2284; SSSE3-NEXT: pand %xmm9, %xmm5 2285; SSSE3-NEXT: movdqa %xmm4, %xmm1 2286; SSSE3-NEXT: pshufb %xmm5, %xmm1 2287; SSSE3-NEXT: por %xmm6, %xmm1 2288; SSSE3-NEXT: pshufb %xmm8, %xmm2 2289; SSSE3-NEXT: movdqa %xmm2, %xmm5 2290; SSSE3-NEXT: pand %xmm9, %xmm5 2291; SSSE3-NEXT: movdqa %xmm7, %xmm6 2292; SSSE3-NEXT: pshufb %xmm5, %xmm6 2293; SSSE3-NEXT: psrlw $4, %xmm2 2294; SSSE3-NEXT: pand %xmm9, %xmm2 2295; SSSE3-NEXT: movdqa %xmm4, %xmm5 2296; SSSE3-NEXT: pshufb %xmm2, %xmm5 2297; SSSE3-NEXT: por %xmm6, %xmm5 2298; SSSE3-NEXT: pshufb %xmm8, %xmm3 2299; SSSE3-NEXT: movdqa %xmm3, %xmm2 2300; SSSE3-NEXT: pand %xmm9, %xmm2 2301; SSSE3-NEXT: pshufb %xmm2, %xmm7 2302; SSSE3-NEXT: psrlw $4, %xmm3 2303; SSSE3-NEXT: pand %xmm9, %xmm3 2304; SSSE3-NEXT: pshufb %xmm3, %xmm4 2305; SSSE3-NEXT: por %xmm7, %xmm4 2306; SSSE3-NEXT: movdqa %xmm5, %xmm2 2307; SSSE3-NEXT: movdqa %xmm4, %xmm3 2308; SSSE3-NEXT: retq 2309; 2310; AVX1-LABEL: test_bitreverse_v8i64: 2311; AVX1: # %bb.0: 2312; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2313; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2314; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2315; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2316; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2317; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2318; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2319; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2320; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2321; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2322; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2323; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2324; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2325; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2326; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2327; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2328; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2329; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2330; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2331; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2332; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2333; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2334; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2335; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2336; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2337; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2338; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2339; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2340; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2341; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2342; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2343; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2344; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2345; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2346; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2347; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2348; AVX1-NEXT: retq 2349; 2350; AVX2-LABEL: test_bitreverse_v8i64: 2351; AVX2: # %bb.0: 2352; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2353; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2354; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2355; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2356; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2357; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2358; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2359; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2360; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2361; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2362; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2363; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2364; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2365; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2366; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2367; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2368; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2369; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2370; AVX2-NEXT: retq 2371; 2372; AVX512F-LABEL: test_bitreverse_v8i64: 2373; AVX512F: # %bb.0: 2374; AVX512F-NEXT: vpsrlq $56, %zmm0, %zmm1 2375; AVX512F-NEXT: vpsrlq $40, %zmm0, %zmm2 2376; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 2377; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 2378; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2 2379; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 2380; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 2381; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2 2382; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 2383; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 2384; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2 2385; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 2386; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3 2387; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3 2388; AVX512F-NEXT: vporq %zmm2, %zmm3, %zmm2 2389; AVX512F-NEXT: vpsllq $56, %zmm0, %zmm3 2390; AVX512F-NEXT: vpsllq $40, %zmm0, %zmm0 2391; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 2392; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0 2393; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2394; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0 2395; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 2396; AVX512F-NEXT: vpsllq $4, %zmm1, %zmm1 2397; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 2398; AVX512F-NEXT: vpsrlq $4, %zmm0, %zmm0 2399; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2400; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 2401; AVX512F-NEXT: vpsllq $2, %zmm1, %zmm1 2402; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 2403; AVX512F-NEXT: vpsrlq $2, %zmm0, %zmm0 2404; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2405; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 2406; AVX512F-NEXT: vpsllq $1, %zmm1, %zmm1 2407; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 2408; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm0 2409; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2410; AVX512F-NEXT: retq 2411; 2412; AVX512BW-LABEL: test_bitreverse_v8i64: 2413; AVX512BW: # %bb.0: 2414; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] 2415; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2416; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2417; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2418; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2419; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2420; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2421; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2422; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2423; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2424; AVX512BW-NEXT: retq 2425; 2426; XOPAVX1-LABEL: test_bitreverse_v8i64: 2427; XOPAVX1: # %bb.0: 2428; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2429; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 2430; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2431; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2432; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2433; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2434; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2435; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2436; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2437; XOPAVX1-NEXT: retq 2438; 2439; XOPAVX2-LABEL: test_bitreverse_v8i64: 2440; XOPAVX2: # %bb.0: 2441; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2442; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 2443; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2444; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2445; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2446; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2447; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2448; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2449; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2450; XOPAVX2-NEXT: retq 2451 %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) 2452 ret <8 x i64> %b 2453} 2454 2455; 2456; Constant Folding 2457; 2458 2459define i32 @fold_bitreverse_i32() nounwind { 2460; ALL-LABEL: fold_bitreverse_i32: 2461; ALL: # %bb.0: 2462; ALL-NEXT: movl $16711935, %eax # imm = 0xFF00FF 2463; ALL-NEXT: retq 2464 %b = call i32 @llvm.bitreverse.i32(i32 4278255360) 2465 ret i32 %b 2466} 2467 2468define <16 x i8> @fold_bitreverse_v16i8() nounwind { 2469; SSE-LABEL: fold_bitreverse_v16i8: 2470; SSE: # %bb.0: 2471; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 2472; SSE-NEXT: retq 2473; 2474; AVX-LABEL: fold_bitreverse_v16i8: 2475; AVX: # %bb.0: 2476; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 2477; AVX-NEXT: retq 2478; 2479; XOP-LABEL: fold_bitreverse_v16i8: 2480; XOP: # %bb.0: 2481; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 2482; XOP-NEXT: retq 2483 %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>) 2484 ret <16 x i8> %b 2485} 2486 2487define <16 x i16> @fold_bitreverse_v16i16() nounwind { 2488; SSE-LABEL: fold_bitreverse_v16i16: 2489; SSE: # %bb.0: 2490; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959] 2491; SSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863] 2492; SSE-NEXT: retq 2493; 2494; AVX-LABEL: fold_bitreverse_v16i16: 2495; AVX: # %bb.0: 2496; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 2497; AVX-NEXT: retq 2498; 2499; XOP-LABEL: fold_bitreverse_v16i16: 2500; XOP: # %bb.0: 2501; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 2502; XOP-NEXT: retq 2503 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>) 2504 ret <16 x i16> %b 2505} 2506 2507define <16 x i32> @fold_bitreverse_v16i32() nounwind { 2508; SSE-LABEL: fold_bitreverse_v16i32: 2509; SSE: # %bb.0: 2510; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471] 2511; SSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559] 2512; SSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015] 2513; SSE-NEXT: movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103] 2514; SSE-NEXT: retq 2515; 2516; AVX1-LABEL: fold_bitreverse_v16i32: 2517; AVX1: # %bb.0: 2518; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 2519; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2520; AVX1-NEXT: retq 2521; 2522; AVX2-LABEL: fold_bitreverse_v16i32: 2523; AVX2: # %bb.0: 2524; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 2525; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2526; AVX2-NEXT: retq 2527; 2528; AVX512-LABEL: fold_bitreverse_v16i32: 2529; AVX512: # %bb.0: 2530; AVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2531; AVX512-NEXT: retq 2532; 2533; XOP-LABEL: fold_bitreverse_v16i32: 2534; XOP: # %bb.0: 2535; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 2536; XOP-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2537; XOP-NEXT: retq 2538 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>) 2539 ret <16 x i32> %b 2540} 2541 2542declare i8 @llvm.bitreverse.i8(i8) readnone 2543declare i16 @llvm.bitreverse.i16(i16) readnone 2544declare i32 @llvm.bitreverse.i32(i32) readnone 2545declare i64 @llvm.bitreverse.i64(i64) readnone 2546 2547declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone 2548declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone 2549declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone 2550declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone 2551 2552declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone 2553declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone 2554declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone 2555declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone 2556 2557declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone 2558declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone 2559declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone 2560declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone 2561