; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2

define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE: # BB#0:
; SSE-NEXT: rolb $4, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andb $51, %al
; SSE-NEXT: shlb $2, %al
; SSE-NEXT: andb $-52, %dil
; SSE-NEXT: shrb $2, %dil
; SSE-NEXT: orb %al, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andb $85, %al
; SSE-NEXT: addb %al, %al
; SSE-NEXT: andb $-86, %dil
; SSE-NEXT: shrb %dil
; SSE-NEXT: orb %al, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX: # BB#0:
; AVX-NEXT: rolb $4, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andb $51, %al
; AVX-NEXT: shlb $2, %al
; AVX-NEXT: andb $-52, %dil
; AVX-NEXT: shrb $2, %dil
; AVX-NEXT: orb %al, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andb $85, %al
; AVX-NEXT: addb %al, %al
; AVX-NEXT: andb $-86, %dil
; AVX-NEXT: shrb %dil
; AVX-NEXT: orb %al, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP: # BB#0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vpextrb $0, %xmm0, %eax
; XOP-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; XOP-NEXT: retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}

define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE: # BB#0:
; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSE-NEXT: rolw $8, %di
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $3855, %eax # imm = 0xF0F
; SSE-NEXT: shll $4, %eax
; SSE-NEXT: andl $61680, %edi # imm = 0xF0F0
; SSE-NEXT: shrl $4, %edi
; SSE-NEXT: orl %eax, %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $13107, %eax # imm = 0x3333
; SSE-NEXT: andl $52428, %edi # imm = 0xCCCC
; SSE-NEXT: shrl $2, %edi
; SSE-NEXT: leal (%rdi,%rax,4), %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $21845, %ecx # imm = 0x5555
; SSE-NEXT: andl $43690, %eax # imm = 0xAAAA
; SSE-NEXT: shrl %eax
; SSE-NEXT: leal (%rax,%rcx,2), %eax
; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX: # BB#0:
; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX-NEXT: rolw $8, %di
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $3855, %eax # imm = 0xF0F
; AVX-NEXT: shll $4, %eax
; AVX-NEXT: andl $61680, %edi # imm = 0xF0F0
; AVX-NEXT: shrl $4, %edi
; AVX-NEXT: orl %eax, %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $13107, %eax # imm = 0x3333
; AVX-NEXT: andl $52428, %edi # imm = 0xCCCC
; AVX-NEXT: shrl $2, %edi
; AVX-NEXT: leal (%rdi,%rax,4), %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $21845, %ecx # imm = 0x5555
; AVX-NEXT: andl $43690, %eax # imm = 0xAAAA
; AVX-NEXT: shrl %eax
; AVX-NEXT: leal (%rax,%rcx,2), %eax
; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP: # BB#0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; XOP-NEXT: retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}

define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE: # BB#0:
; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSE-NEXT: bswapl %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; SSE-NEXT: shll $4, %eax
; SSE-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; SSE-NEXT: shrl $4, %edi
; SSE-NEXT: orl %eax, %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $858993459, %eax # imm = 0x33333333
; SSE-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; SSE-NEXT: shrl $2, %edi
; SSE-NEXT: leal (%rdi,%rax,4), %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; SSE-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; SSE-NEXT: shrl %eax
; SSE-NEXT: leal (%rax,%rcx,2), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX: # BB#0:
; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX-NEXT: bswapl %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; AVX-NEXT: shll $4, %eax
; AVX-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; AVX-NEXT: shrl $4, %edi
; AVX-NEXT: orl %eax, %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $858993459, %eax # imm = 0x33333333
; AVX-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; AVX-NEXT: shrl $2, %edi
; AVX-NEXT: leal (%rdi,%rax,4), %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; AVX-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX-NEXT: shrl %eax
; AVX-NEXT: leal (%rax,%rcx,2), %eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i32:
; XOP: # BB#0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}

define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE: # BB#0:
; SSE-NEXT: bswapq %rdi
; SSE-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; SSE-NEXT: andq %rdi, %rax
; SSE-NEXT: shlq $4, %rax
; SSE-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; SSE-NEXT: andq %rdi, %rcx
; SSE-NEXT: shrq $4, %rcx
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE-NEXT: andq %rcx, %rax
; SSE-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; SSE-NEXT: andq %rcx, %rdx
; SSE-NEXT: shrq $2, %rdx
; SSE-NEXT: leaq (%rdx,%rax,4), %rax
; SSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE-NEXT: andq %rax, %rcx
; SSE-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; SSE-NEXT: andq %rax, %rdx
; SSE-NEXT: shrq %rdx
; SSE-NEXT: leaq (%rdx,%rcx,2), %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX: # BB#0:
; AVX-NEXT: bswapq %rdi
; AVX-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; AVX-NEXT: andq %rdi, %rax
; AVX-NEXT: shlq $4, %rax
; AVX-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; AVX-NEXT: andq %rdi, %rcx
; AVX-NEXT: shrq $4, %rcx
; AVX-NEXT: orq %rax, %rcx
; AVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; AVX-NEXT: andq %rcx, %rax
; AVX-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; AVX-NEXT: andq %rcx, %rdx
; AVX-NEXT: shrq $2, %rdx
; AVX-NEXT: leaq (%rdx,%rax,4), %rax
; AVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; AVX-NEXT: andq %rax, %rcx
; AVX-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; AVX-NEXT: andq %rax, %rdx
; AVX-NEXT: shrq %rdx
; AVX-NEXT: leaq (%rdx,%rcx,2), %rax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i64:
; XOP: # BB#0:
; XOP-NEXT: vmovq %rdi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovq %xmm0, %rax
; XOP-NEXT: retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}

define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psllw $4, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
  ret <16 x i8> %b
}

define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psllw $4, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  ret <8 x i16> %b
}

define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psllw $4, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v4i32:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v4i32:
; XOP: # BB#0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
  ret <4 x i32> %b
}

define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v2i64:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psllw $4, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v2i64:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v2i64:
; XOP: # BB#0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
  ret <2 x i64> %b
}

define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psllw $4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: psrlw $1, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT: pand %xmm11, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: pand %xmm2, %xmm7
; SSE2-NEXT: psllw $4, %xmm7
; SSE2-NEXT: pand %xmm5, %xmm7
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: por %xmm7, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psllw $2, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm10, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: psrlw $1, %xmm4
; SSE2-NEXT: pand %xmm11, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v32i8:
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pshufb %xmm2, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm6, %xmm3
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm5
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm5, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v32i8:
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i8:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i8:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
  ret <32 x i8> %b
}

define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i16:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psllw $4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm5, %xmm7
; SSE2-NEXT: psrlw $1, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT: pand %xmm11, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm12, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm7, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm2, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psllw $2, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm10, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: psrlw $1, %xmm5
; SSE2-NEXT: pand %xmm11, %xmm5
; SSE2-NEXT: pand %xmm12, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i16:
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v16i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i16:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i16:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa
{{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 849; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 850; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 851; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 852; XOPAVX2-NEXT: retq 853 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) 854 ret <16 x i16> %b 855} 856 857define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { 858; SSE2-LABEL: test_bitreverse_v8i32: 859; SSE2: # BB#0: 860; SSE2-NEXT: pxor %xmm4, %xmm4 861; SSE2-NEXT: movdqa %xmm0, %xmm2 862; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 863; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 864; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 865; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 866; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 867; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 868; SSE2-NEXT: packuswb %xmm2, %xmm0 869; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 870; SSE2-NEXT: movdqa %xmm0, %xmm3 871; SSE2-NEXT: pand %xmm2, %xmm3 872; SSE2-NEXT: psllw $4, %xmm3 873; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 874; SSE2-NEXT: pand %xmm6, %xmm3 875; SSE2-NEXT: pand %xmm6, %xmm0 876; SSE2-NEXT: psrlw $4, %xmm0 877; SSE2-NEXT: pand %xmm2, %xmm0 878; SSE2-NEXT: por %xmm3, %xmm0 879; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 880; SSE2-NEXT: movdqa %xmm0, %xmm5 881; SSE2-NEXT: pand %xmm3, %xmm5 882; SSE2-NEXT: psllw $2, %xmm5 883; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 884; SSE2-NEXT: pand %xmm8, %xmm5 885; SSE2-NEXT: movdqa 
{{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 886; SSE2-NEXT: pand %xmm9, %xmm0 887; SSE2-NEXT: psrlw $2, %xmm0 888; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 889; SSE2-NEXT: pand %xmm10, %xmm0 890; SSE2-NEXT: por %xmm5, %xmm0 891; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 892; SSE2-NEXT: movdqa %xmm0, %xmm7 893; SSE2-NEXT: pand %xmm5, %xmm7 894; SSE2-NEXT: psrlw $1, %xmm7 895; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 896; SSE2-NEXT: pand %xmm11, %xmm7 897; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 898; SSE2-NEXT: pand %xmm12, %xmm0 899; SSE2-NEXT: paddb %xmm0, %xmm0 900; SSE2-NEXT: por %xmm7, %xmm0 901; SSE2-NEXT: movdqa %xmm1, %xmm7 902; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] 903; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7] 904; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4] 905; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 906; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 907; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 908; SSE2-NEXT: packuswb %xmm7, %xmm1 909; SSE2-NEXT: movdqa %xmm1, %xmm4 910; SSE2-NEXT: pand %xmm2, %xmm4 911; SSE2-NEXT: psllw $4, %xmm4 912; SSE2-NEXT: pand %xmm6, %xmm4 913; SSE2-NEXT: pand %xmm6, %xmm1 914; SSE2-NEXT: psrlw $4, %xmm1 915; SSE2-NEXT: pand %xmm2, %xmm1 916; SSE2-NEXT: por %xmm4, %xmm1 917; SSE2-NEXT: pand %xmm1, %xmm3 918; SSE2-NEXT: psllw $2, %xmm3 919; SSE2-NEXT: pand %xmm8, %xmm3 920; SSE2-NEXT: pand %xmm9, %xmm1 921; SSE2-NEXT: psrlw $2, %xmm1 922; SSE2-NEXT: pand %xmm10, %xmm1 923; 
SSE2-NEXT: por %xmm3, %xmm1 924; SSE2-NEXT: pand %xmm1, %xmm5 925; SSE2-NEXT: psrlw $1, %xmm5 926; SSE2-NEXT: pand %xmm11, %xmm5 927; SSE2-NEXT: pand %xmm12, %xmm1 928; SSE2-NEXT: paddb %xmm1, %xmm1 929; SSE2-NEXT: por %xmm5, %xmm1 930; SSE2-NEXT: retq 931; 932; SSSE3-LABEL: test_bitreverse_v8i32: 933; SSSE3: # BB#0: 934; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 935; SSSE3-NEXT: pshufb %xmm4, %xmm0 936; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 937; SSSE3-NEXT: movdqa %xmm0, %xmm2 938; SSSE3-NEXT: pand %xmm5, %xmm2 939; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 940; SSSE3-NEXT: movdqa %xmm6, %xmm7 941; SSSE3-NEXT: pshufb %xmm2, %xmm7 942; SSSE3-NEXT: psrlw $4, %xmm0 943; SSSE3-NEXT: pand %xmm5, %xmm0 944; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 945; SSSE3-NEXT: movdqa %xmm2, %xmm3 946; SSSE3-NEXT: pshufb %xmm0, %xmm3 947; SSSE3-NEXT: por %xmm7, %xmm3 948; SSSE3-NEXT: pshufb %xmm4, %xmm1 949; SSSE3-NEXT: movdqa %xmm1, %xmm0 950; SSSE3-NEXT: pand %xmm5, %xmm0 951; SSSE3-NEXT: pshufb %xmm0, %xmm6 952; SSSE3-NEXT: psrlw $4, %xmm1 953; SSSE3-NEXT: pand %xmm5, %xmm1 954; SSSE3-NEXT: pshufb %xmm1, %xmm2 955; SSSE3-NEXT: por %xmm6, %xmm2 956; SSSE3-NEXT: movdqa %xmm3, %xmm0 957; SSSE3-NEXT: movdqa %xmm2, %xmm1 958; SSSE3-NEXT: retq 959; 960; AVX1-LABEL: test_bitreverse_v8i32: 961; AVX1: # BB#0: 962; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 963; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 964; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 965; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 966; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 967; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 968; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 969; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 970; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 
971; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 972; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 973; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 974; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 975; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 976; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 977; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 978; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 979; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 980; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 981; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 982; AVX1-NEXT: retq 983; 984; AVX2-LABEL: test_bitreverse_v8i32: 985; AVX2: # BB#0: 986; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 987; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 988; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 989; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 990; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 991; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 992; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 993; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 994; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 995; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 996; AVX2-NEXT: retq 997; 998; AVX512-LABEL: test_bitreverse_v8i32: 999; AVX512: # BB#0: 1000; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1001; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1002; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1003; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1004; AVX512-NEXT: vpshufb %ymm2, 
%ymm3, %ymm2 1005; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1006; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1007; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1008; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1009; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1010; AVX512-NEXT: retq 1011; 1012; XOPAVX1-LABEL: test_bitreverse_v8i32: 1013; XOPAVX1: # BB#0: 1014; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1015; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1016; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1017; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1018; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1019; XOPAVX1-NEXT: retq 1020; 1021; XOPAVX2-LABEL: test_bitreverse_v8i32: 1022; XOPAVX2: # BB#0: 1023; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1024; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1025; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1026; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1027; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1028; XOPAVX2-NEXT: retq 1029 %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) 1030 ret <8 x i32> %b 1031} 1032 1033define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { 1034; SSE2-LABEL: test_bitreverse_v4i64: 1035; SSE2: # BB#0: 1036; SSE2-NEXT: pxor %xmm4, %xmm4 1037; SSE2-NEXT: movdqa %xmm0, %xmm2 1038; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 1039; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 1040; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1041; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1042; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1043; SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1044; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1045; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1046; SSE2-NEXT: packuswb %xmm2, %xmm0 1047; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1048; SSE2-NEXT: movdqa %xmm0, %xmm3 1049; SSE2-NEXT: pand %xmm2, %xmm3 1050; SSE2-NEXT: psllw $4, %xmm3 1051; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1052; SSE2-NEXT: pand %xmm6, %xmm3 1053; SSE2-NEXT: pand %xmm6, %xmm0 1054; SSE2-NEXT: psrlw $4, %xmm0 1055; SSE2-NEXT: pand %xmm2, %xmm0 1056; SSE2-NEXT: por %xmm3, %xmm0 1057; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1058; SSE2-NEXT: movdqa %xmm0, %xmm5 1059; SSE2-NEXT: pand %xmm3, %xmm5 1060; SSE2-NEXT: psllw $2, %xmm5 1061; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 1062; SSE2-NEXT: pand %xmm8, %xmm5 1063; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1064; SSE2-NEXT: pand %xmm9, %xmm0 1065; SSE2-NEXT: psrlw $2, %xmm0 1066; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 1067; SSE2-NEXT: pand %xmm10, %xmm0 1068; SSE2-NEXT: por %xmm5, %xmm0 1069; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1070; SSE2-NEXT: movdqa %xmm0, %xmm7 1071; SSE2-NEXT: pand %xmm5, %xmm7 1072; SSE2-NEXT: psrlw $1, %xmm7 1073; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1074; SSE2-NEXT: pand %xmm11, %xmm7 1075; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1076; SSE2-NEXT: pand %xmm12, %xmm0 1077; SSE2-NEXT: paddb %xmm0, %xmm0 1078; SSE2-NEXT: por %xmm7, %xmm0 1079; SSE2-NEXT: movdqa %xmm1, %xmm7 1080; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = 
xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] 1081; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,0,1] 1082; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7] 1083; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4] 1084; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 1085; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1086; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1087; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1088; SSE2-NEXT: packuswb %xmm7, %xmm1 1089; SSE2-NEXT: movdqa %xmm1, %xmm4 1090; SSE2-NEXT: pand %xmm2, %xmm4 1091; SSE2-NEXT: psllw $4, %xmm4 1092; SSE2-NEXT: pand %xmm6, %xmm4 1093; SSE2-NEXT: pand %xmm6, %xmm1 1094; SSE2-NEXT: psrlw $4, %xmm1 1095; SSE2-NEXT: pand %xmm2, %xmm1 1096; SSE2-NEXT: por %xmm4, %xmm1 1097; SSE2-NEXT: pand %xmm1, %xmm3 1098; SSE2-NEXT: psllw $2, %xmm3 1099; SSE2-NEXT: pand %xmm8, %xmm3 1100; SSE2-NEXT: pand %xmm9, %xmm1 1101; SSE2-NEXT: psrlw $2, %xmm1 1102; SSE2-NEXT: pand %xmm10, %xmm1 1103; SSE2-NEXT: por %xmm3, %xmm1 1104; SSE2-NEXT: pand %xmm1, %xmm5 1105; SSE2-NEXT: psrlw $1, %xmm5 1106; SSE2-NEXT: pand %xmm11, %xmm5 1107; SSE2-NEXT: pand %xmm12, %xmm1 1108; SSE2-NEXT: paddb %xmm1, %xmm1 1109; SSE2-NEXT: por %xmm5, %xmm1 1110; SSE2-NEXT: retq 1111; 1112; SSSE3-LABEL: test_bitreverse_v4i64: 1113; SSSE3: # BB#0: 1114; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1115; SSSE3-NEXT: pshufb %xmm4, %xmm0 1116; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1117; SSSE3-NEXT: movdqa %xmm0, %xmm2 1118; SSSE3-NEXT: pand %xmm5, %xmm2 1119; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1120; SSSE3-NEXT: movdqa %xmm6, %xmm7 1121; SSSE3-NEXT: pshufb %xmm2, %xmm7 1122; 
SSSE3-NEXT: psrlw $4, %xmm0 1123; SSSE3-NEXT: pand %xmm5, %xmm0 1124; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1125; SSSE3-NEXT: movdqa %xmm2, %xmm3 1126; SSSE3-NEXT: pshufb %xmm0, %xmm3 1127; SSSE3-NEXT: por %xmm7, %xmm3 1128; SSSE3-NEXT: pshufb %xmm4, %xmm1 1129; SSSE3-NEXT: movdqa %xmm1, %xmm0 1130; SSSE3-NEXT: pand %xmm5, %xmm0 1131; SSSE3-NEXT: pshufb %xmm0, %xmm6 1132; SSSE3-NEXT: psrlw $4, %xmm1 1133; SSSE3-NEXT: pand %xmm5, %xmm1 1134; SSSE3-NEXT: pshufb %xmm1, %xmm2 1135; SSSE3-NEXT: por %xmm6, %xmm2 1136; SSSE3-NEXT: movdqa %xmm3, %xmm0 1137; SSSE3-NEXT: movdqa %xmm2, %xmm1 1138; SSSE3-NEXT: retq 1139; 1140; AVX1-LABEL: test_bitreverse_v4i64: 1141; AVX1: # BB#0: 1142; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1143; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1144; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1145; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1146; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1147; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1148; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1149; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1150; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1151; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1152; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1153; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1154; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1155; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1156; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1157; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1158; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1159; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1160; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1161; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1162; AVX1-NEXT: retq 1163; 1164; AVX2-LABEL: test_bitreverse_v4i64: 1165; AVX2: # BB#0: 1166; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 
1167; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1168; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1169; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1170; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1171; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1172; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1173; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1174; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1175; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1176; AVX2-NEXT: retq 1177; 1178; AVX512-LABEL: test_bitreverse_v4i64: 1179; AVX512: # BB#0: 1180; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1181; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1182; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1183; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1184; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1185; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1186; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1187; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1188; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1189; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1190; AVX512-NEXT: retq 1191; 1192; XOPAVX1-LABEL: test_bitreverse_v4i64: 1193; XOPAVX1: # BB#0: 1194; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1195; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1196; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1197; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1198; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 
1199; XOPAVX1-NEXT: retq 1200; 1201; XOPAVX2-LABEL: test_bitreverse_v4i64: 1202; XOPAVX2: # BB#0: 1203; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1204; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1205; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1206; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1207; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1208; XOPAVX2-NEXT: retq 1209 %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) 1210 ret <4 x i64> %b 1211} 1212 1213define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { 1214; SSE2-LABEL: test_bitreverse_v64i8: 1215; SSE2: # BB#0: 1216; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1217; SSE2-NEXT: movdqa %xmm0, %xmm5 1218; SSE2-NEXT: pand %xmm13, %xmm5 1219; SSE2-NEXT: psllw $4, %xmm5 1220; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1221; SSE2-NEXT: pand %xmm7, %xmm5 1222; SSE2-NEXT: pand %xmm7, %xmm0 1223; SSE2-NEXT: psrlw $4, %xmm0 1224; SSE2-NEXT: pand %xmm13, %xmm0 1225; SSE2-NEXT: por %xmm5, %xmm0 1226; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1227; SSE2-NEXT: movdqa %xmm0, %xmm6 1228; SSE2-NEXT: pand %xmm5, %xmm6 1229; SSE2-NEXT: psllw $2, %xmm6 1230; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 1231; SSE2-NEXT: pand %xmm8, %xmm6 1232; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1233; SSE2-NEXT: pand %xmm9, %xmm0 1234; SSE2-NEXT: psrlw $2, %xmm0 1235; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 1236; SSE2-NEXT: pand %xmm10, %xmm0 1237; SSE2-NEXT: por %xmm6, %xmm0 1238; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1239; SSE2-NEXT: movdqa %xmm0, %xmm4 1240; SSE2-NEXT: pand %xmm6, %xmm4 1241; SSE2-NEXT: 
psrlw $1, %xmm4 1242; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1243; SSE2-NEXT: pand %xmm11, %xmm4 1244; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1245; SSE2-NEXT: pand %xmm12, %xmm0 1246; SSE2-NEXT: paddb %xmm0, %xmm0 1247; SSE2-NEXT: por %xmm4, %xmm0 1248; SSE2-NEXT: movdqa %xmm1, %xmm4 1249; SSE2-NEXT: pand %xmm13, %xmm4 1250; SSE2-NEXT: psllw $4, %xmm4 1251; SSE2-NEXT: pand %xmm7, %xmm4 1252; SSE2-NEXT: pand %xmm7, %xmm1 1253; SSE2-NEXT: psrlw $4, %xmm1 1254; SSE2-NEXT: pand %xmm13, %xmm1 1255; SSE2-NEXT: por %xmm4, %xmm1 1256; SSE2-NEXT: movdqa %xmm1, %xmm4 1257; SSE2-NEXT: pand %xmm5, %xmm4 1258; SSE2-NEXT: psllw $2, %xmm4 1259; SSE2-NEXT: pand %xmm8, %xmm4 1260; SSE2-NEXT: pand %xmm9, %xmm1 1261; SSE2-NEXT: psrlw $2, %xmm1 1262; SSE2-NEXT: pand %xmm10, %xmm1 1263; SSE2-NEXT: por %xmm4, %xmm1 1264; SSE2-NEXT: movdqa %xmm1, %xmm4 1265; SSE2-NEXT: pand %xmm6, %xmm4 1266; SSE2-NEXT: psrlw $1, %xmm4 1267; SSE2-NEXT: pand %xmm11, %xmm4 1268; SSE2-NEXT: pand %xmm12, %xmm1 1269; SSE2-NEXT: paddb %xmm1, %xmm1 1270; SSE2-NEXT: por %xmm4, %xmm1 1271; SSE2-NEXT: movdqa %xmm2, %xmm4 1272; SSE2-NEXT: pand %xmm13, %xmm4 1273; SSE2-NEXT: psllw $4, %xmm4 1274; SSE2-NEXT: pand %xmm7, %xmm4 1275; SSE2-NEXT: pand %xmm7, %xmm2 1276; SSE2-NEXT: psrlw $4, %xmm2 1277; SSE2-NEXT: pand %xmm13, %xmm2 1278; SSE2-NEXT: por %xmm4, %xmm2 1279; SSE2-NEXT: movdqa %xmm2, %xmm4 1280; SSE2-NEXT: pand %xmm5, %xmm4 1281; SSE2-NEXT: psllw $2, %xmm4 1282; SSE2-NEXT: pand %xmm8, %xmm4 1283; SSE2-NEXT: pand %xmm9, %xmm2 1284; SSE2-NEXT: psrlw $2, %xmm2 1285; SSE2-NEXT: pand %xmm10, %xmm2 1286; SSE2-NEXT: por %xmm4, %xmm2 1287; SSE2-NEXT: movdqa %xmm2, %xmm4 1288; SSE2-NEXT: pand %xmm6, %xmm4 1289; SSE2-NEXT: psrlw $1, %xmm4 1290; SSE2-NEXT: pand %xmm11, %xmm4 1291; SSE2-NEXT: pand %xmm12, %xmm2 1292; SSE2-NEXT: paddb %xmm2, %xmm2 1293; SSE2-NEXT: por %xmm4, %xmm2 1294; SSE2-NEXT: movdqa %xmm3, 
%xmm4 1295; SSE2-NEXT: pand %xmm13, %xmm4 1296; SSE2-NEXT: psllw $4, %xmm4 1297; SSE2-NEXT: pand %xmm7, %xmm4 1298; SSE2-NEXT: pand %xmm7, %xmm3 1299; SSE2-NEXT: psrlw $4, %xmm3 1300; SSE2-NEXT: pand %xmm13, %xmm3 1301; SSE2-NEXT: por %xmm4, %xmm3 1302; SSE2-NEXT: pand %xmm3, %xmm5 1303; SSE2-NEXT: psllw $2, %xmm5 1304; SSE2-NEXT: pand %xmm8, %xmm5 1305; SSE2-NEXT: pand %xmm9, %xmm3 1306; SSE2-NEXT: psrlw $2, %xmm3 1307; SSE2-NEXT: pand %xmm10, %xmm3 1308; SSE2-NEXT: por %xmm5, %xmm3 1309; SSE2-NEXT: pand %xmm3, %xmm6 1310; SSE2-NEXT: psrlw $1, %xmm6 1311; SSE2-NEXT: pand %xmm11, %xmm6 1312; SSE2-NEXT: pand %xmm12, %xmm3 1313; SSE2-NEXT: paddb %xmm3, %xmm3 1314; SSE2-NEXT: por %xmm6, %xmm3 1315; SSE2-NEXT: retq 1316; 1317; SSSE3-LABEL: test_bitreverse_v64i8: 1318; SSSE3: # BB#0: 1319; SSSE3-NEXT: movdqa %xmm0, %xmm5 1320; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1321; SSSE3-NEXT: pand %xmm8, %xmm0 1322; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1323; SSSE3-NEXT: movdqa %xmm9, %xmm6 1324; SSSE3-NEXT: pshufb %xmm0, %xmm6 1325; SSSE3-NEXT: psrlw $4, %xmm5 1326; SSSE3-NEXT: pand %xmm8, %xmm5 1327; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1328; SSSE3-NEXT: movdqa %xmm4, %xmm0 1329; SSSE3-NEXT: pshufb %xmm5, %xmm0 1330; SSSE3-NEXT: por %xmm6, %xmm0 1331; SSSE3-NEXT: movdqa %xmm1, %xmm5 1332; SSSE3-NEXT: pand %xmm8, %xmm5 1333; SSSE3-NEXT: movdqa %xmm9, %xmm6 1334; SSSE3-NEXT: pshufb %xmm5, %xmm6 1335; SSSE3-NEXT: psrlw $4, %xmm1 1336; SSSE3-NEXT: pand %xmm8, %xmm1 1337; SSSE3-NEXT: movdqa %xmm4, %xmm5 1338; SSSE3-NEXT: pshufb %xmm1, %xmm5 1339; SSSE3-NEXT: por %xmm6, %xmm5 1340; SSSE3-NEXT: movdqa %xmm2, %xmm1 1341; SSSE3-NEXT: pand %xmm8, %xmm1 1342; SSSE3-NEXT: movdqa %xmm9, %xmm7 1343; SSSE3-NEXT: pshufb %xmm1, %xmm7 1344; SSSE3-NEXT: psrlw $4, %xmm2 1345; SSSE3-NEXT: pand %xmm8, %xmm2 1346; SSSE3-NEXT: movdqa %xmm4, %xmm6 1347; SSSE3-NEXT: 
pshufb %xmm2, %xmm6 1348; SSSE3-NEXT: por %xmm7, %xmm6 1349; SSSE3-NEXT: movdqa %xmm3, %xmm1 1350; SSSE3-NEXT: pand %xmm8, %xmm1 1351; SSSE3-NEXT: pshufb %xmm1, %xmm9 1352; SSSE3-NEXT: psrlw $4, %xmm3 1353; SSSE3-NEXT: pand %xmm8, %xmm3 1354; SSSE3-NEXT: pshufb %xmm3, %xmm4 1355; SSSE3-NEXT: por %xmm9, %xmm4 1356; SSSE3-NEXT: movdqa %xmm5, %xmm1 1357; SSSE3-NEXT: movdqa %xmm6, %xmm2 1358; SSSE3-NEXT: movdqa %xmm4, %xmm3 1359; SSSE3-NEXT: retq 1360; 1361; AVX1-LABEL: test_bitreverse_v64i8: 1362; AVX1: # BB#0: 1363; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1364; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1365; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm4 1366; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1367; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1368; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1369; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1370; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1371; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1372; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1373; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm4 1374; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1375; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1376; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1377; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1378; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 1379; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1380; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1381; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm4 1382; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1383; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1384; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1385; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1386; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1387; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm4 1388; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1389; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1390; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1391; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1392; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1393; AVX1-NEXT: 
vinsertf128 $1, %xmm2, %ymm1, %ymm1 1394; AVX1-NEXT: retq 1395; 1396; AVX2-LABEL: test_bitreverse_v64i8: 1397; AVX2: # BB#0: 1398; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1399; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 1400; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1401; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1402; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1403; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1404; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1405; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 1406; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 1407; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 1408; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1409; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 1410; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1411; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1 1412; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 1413; AVX2-NEXT: retq 1414; 1415; AVX512F-LABEL: test_bitreverse_v64i8: 1416; AVX512F: # BB#0: 1417; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1418; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3 1419; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1420; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1421; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 1422; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 1423; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1424; AVX512F-NEXT: vpshufb %ymm0, %ymm5, %ymm0 1425; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 1426; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 1427; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1428; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 1429; 
AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 1430; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1 1431; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 1432; AVX512F-NEXT: retq 1433; 1434; AVX512BW-LABEL: test_bitreverse_v64i8: 1435; AVX512BW: # BB#0: 1436; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1437; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 1438; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1439; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 1440; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 1441; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 1442; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1443; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 1444; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 1445; AVX512BW-NEXT: retq 1446; 1447; XOPAVX1-LABEL: test_bitreverse_v64i8: 1448; XOPAVX1: # BB#0: 1449; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1450; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1451; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1452; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1453; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1454; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1455; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1456; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1457; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1458; XOPAVX1-NEXT: retq 1459; 1460; XOPAVX2-LABEL: test_bitreverse_v64i8: 1461; XOPAVX2: # BB#0: 1462; XOPAVX2-NEXT: vextracti128 $1, %ymm0, 
%xmm2 1463; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1464; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1465; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1466; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1467; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1468; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1469; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1470; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 1471; XOPAVX2-NEXT: retq 1472 %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) 1473 ret <64 x i8> %b 1474} 1475 1476define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { 1477; SSE2-LABEL: test_bitreverse_v32i16: 1478; SSE2: # BB#0: 1479; SSE2-NEXT: pxor %xmm14, %xmm14 1480; SSE2-NEXT: movdqa %xmm0, %xmm4 1481; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] 1482; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] 1483; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6] 1484; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] 1485; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 1486; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] 1487; SSE2-NEXT: packuswb %xmm4, %xmm0 1488; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1489; SSE2-NEXT: movdqa %xmm0, %xmm5 1490; SSE2-NEXT: pand %xmm8, %xmm5 1491; SSE2-NEXT: psllw $4, %xmm5 1492; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1493; SSE2-NEXT: pand %xmm4, %xmm5 1494; SSE2-NEXT: pand %xmm4, %xmm0 1495; SSE2-NEXT: psrlw $4, %xmm0 1496; SSE2-NEXT: pand %xmm8, %xmm0 1497; SSE2-NEXT: por %xmm5, %xmm0 1498; SSE2-NEXT: movdqa {{.*#+}} xmm5 = 
[51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1499; SSE2-NEXT: movdqa %xmm0, %xmm7 1500; SSE2-NEXT: pand %xmm5, %xmm7 1501; SSE2-NEXT: psllw $2, %xmm7 1502; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 1503; SSE2-NEXT: pand %xmm9, %xmm7 1504; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1505; SSE2-NEXT: pand %xmm10, %xmm0 1506; SSE2-NEXT: psrlw $2, %xmm0 1507; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 1508; SSE2-NEXT: pand %xmm11, %xmm0 1509; SSE2-NEXT: por %xmm7, %xmm0 1510; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1511; SSE2-NEXT: movdqa %xmm0, %xmm6 1512; SSE2-NEXT: pand %xmm7, %xmm6 1513; SSE2-NEXT: psrlw $1, %xmm6 1514; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1515; SSE2-NEXT: pand %xmm12, %xmm6 1516; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1517; SSE2-NEXT: pand %xmm13, %xmm0 1518; SSE2-NEXT: paddb %xmm0, %xmm0 1519; SSE2-NEXT: por %xmm6, %xmm0 1520; SSE2-NEXT: movdqa %xmm1, %xmm6 1521; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1522; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] 1523; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6] 1524; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] 1525; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 1526; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] 1527; SSE2-NEXT: packuswb %xmm6, %xmm1 1528; SSE2-NEXT: movdqa %xmm1, %xmm6 1529; SSE2-NEXT: pand %xmm8, %xmm6 1530; SSE2-NEXT: psllw $4, %xmm6 1531; 
SSE2-NEXT: pand %xmm4, %xmm6 1532; SSE2-NEXT: pand %xmm4, %xmm1 1533; SSE2-NEXT: psrlw $4, %xmm1 1534; SSE2-NEXT: pand %xmm8, %xmm1 1535; SSE2-NEXT: por %xmm6, %xmm1 1536; SSE2-NEXT: movdqa %xmm1, %xmm6 1537; SSE2-NEXT: pand %xmm5, %xmm6 1538; SSE2-NEXT: psllw $2, %xmm6 1539; SSE2-NEXT: pand %xmm9, %xmm6 1540; SSE2-NEXT: pand %xmm10, %xmm1 1541; SSE2-NEXT: psrlw $2, %xmm1 1542; SSE2-NEXT: pand %xmm11, %xmm1 1543; SSE2-NEXT: por %xmm6, %xmm1 1544; SSE2-NEXT: movdqa %xmm1, %xmm6 1545; SSE2-NEXT: pand %xmm7, %xmm6 1546; SSE2-NEXT: psrlw $1, %xmm6 1547; SSE2-NEXT: pand %xmm12, %xmm6 1548; SSE2-NEXT: pand %xmm13, %xmm1 1549; SSE2-NEXT: paddb %xmm1, %xmm1 1550; SSE2-NEXT: por %xmm6, %xmm1 1551; SSE2-NEXT: movdqa %xmm2, %xmm6 1552; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1553; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] 1554; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6] 1555; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] 1556; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] 1557; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6] 1558; SSE2-NEXT: packuswb %xmm6, %xmm2 1559; SSE2-NEXT: movdqa %xmm2, %xmm6 1560; SSE2-NEXT: pand %xmm8, %xmm6 1561; SSE2-NEXT: psllw $4, %xmm6 1562; SSE2-NEXT: pand %xmm4, %xmm6 1563; SSE2-NEXT: pand %xmm4, %xmm2 1564; SSE2-NEXT: psrlw $4, %xmm2 1565; SSE2-NEXT: pand %xmm8, %xmm2 1566; SSE2-NEXT: por %xmm6, %xmm2 1567; SSE2-NEXT: movdqa %xmm2, %xmm6 1568; SSE2-NEXT: pand %xmm5, %xmm6 1569; SSE2-NEXT: psllw $2, %xmm6 1570; SSE2-NEXT: pand %xmm9, %xmm6 1571; SSE2-NEXT: pand %xmm10, %xmm2 1572; SSE2-NEXT: psrlw $2, %xmm2 1573; SSE2-NEXT: pand %xmm11, %xmm2 1574; SSE2-NEXT: por %xmm6, %xmm2 1575; SSE2-NEXT: movdqa %xmm2, %xmm6 1576; SSE2-NEXT: 
pand %xmm7, %xmm6 1577; SSE2-NEXT: psrlw $1, %xmm6 1578; SSE2-NEXT: pand %xmm12, %xmm6 1579; SSE2-NEXT: pand %xmm13, %xmm2 1580; SSE2-NEXT: paddb %xmm2, %xmm2 1581; SSE2-NEXT: por %xmm6, %xmm2 1582; SSE2-NEXT: movdqa %xmm3, %xmm6 1583; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1584; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] 1585; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6] 1586; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] 1587; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] 1588; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6] 1589; SSE2-NEXT: packuswb %xmm6, %xmm3 1590; SSE2-NEXT: movdqa %xmm3, %xmm6 1591; SSE2-NEXT: pand %xmm8, %xmm6 1592; SSE2-NEXT: psllw $4, %xmm6 1593; SSE2-NEXT: pand %xmm4, %xmm6 1594; SSE2-NEXT: pand %xmm4, %xmm3 1595; SSE2-NEXT: psrlw $4, %xmm3 1596; SSE2-NEXT: pand %xmm8, %xmm3 1597; SSE2-NEXT: por %xmm6, %xmm3 1598; SSE2-NEXT: pand %xmm3, %xmm5 1599; SSE2-NEXT: psllw $2, %xmm5 1600; SSE2-NEXT: pand %xmm9, %xmm5 1601; SSE2-NEXT: pand %xmm10, %xmm3 1602; SSE2-NEXT: psrlw $2, %xmm3 1603; SSE2-NEXT: pand %xmm11, %xmm3 1604; SSE2-NEXT: por %xmm5, %xmm3 1605; SSE2-NEXT: pand %xmm3, %xmm7 1606; SSE2-NEXT: psrlw $1, %xmm7 1607; SSE2-NEXT: pand %xmm12, %xmm7 1608; SSE2-NEXT: pand %xmm13, %xmm3 1609; SSE2-NEXT: paddb %xmm3, %xmm3 1610; SSE2-NEXT: por %xmm7, %xmm3 1611; SSE2-NEXT: retq 1612; 1613; SSSE3-LABEL: test_bitreverse_v32i16: 1614; SSSE3: # BB#0: 1615; SSSE3-NEXT: movdqa %xmm1, %xmm5 1616; SSSE3-NEXT: movdqa %xmm0, %xmm1 1617; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1618; SSSE3-NEXT: pshufb %xmm8, %xmm1 1619; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 
1620; SSSE3-NEXT: movdqa %xmm1, %xmm0 1621; SSSE3-NEXT: pand %xmm9, %xmm0 1622; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1623; SSSE3-NEXT: movdqa %xmm7, %xmm6 1624; SSSE3-NEXT: pshufb %xmm0, %xmm6 1625; SSSE3-NEXT: psrlw $4, %xmm1 1626; SSSE3-NEXT: pand %xmm9, %xmm1 1627; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1628; SSSE3-NEXT: movdqa %xmm4, %xmm0 1629; SSSE3-NEXT: pshufb %xmm1, %xmm0 1630; SSSE3-NEXT: por %xmm6, %xmm0 1631; SSSE3-NEXT: pshufb %xmm8, %xmm5 1632; SSSE3-NEXT: movdqa %xmm5, %xmm1 1633; SSSE3-NEXT: pand %xmm9, %xmm1 1634; SSSE3-NEXT: movdqa %xmm7, %xmm6 1635; SSSE3-NEXT: pshufb %xmm1, %xmm6 1636; SSSE3-NEXT: psrlw $4, %xmm5 1637; SSSE3-NEXT: pand %xmm9, %xmm5 1638; SSSE3-NEXT: movdqa %xmm4, %xmm1 1639; SSSE3-NEXT: pshufb %xmm5, %xmm1 1640; SSSE3-NEXT: por %xmm6, %xmm1 1641; SSSE3-NEXT: pshufb %xmm8, %xmm2 1642; SSSE3-NEXT: movdqa %xmm2, %xmm5 1643; SSSE3-NEXT: pand %xmm9, %xmm5 1644; SSSE3-NEXT: movdqa %xmm7, %xmm6 1645; SSSE3-NEXT: pshufb %xmm5, %xmm6 1646; SSSE3-NEXT: psrlw $4, %xmm2 1647; SSSE3-NEXT: pand %xmm9, %xmm2 1648; SSSE3-NEXT: movdqa %xmm4, %xmm5 1649; SSSE3-NEXT: pshufb %xmm2, %xmm5 1650; SSSE3-NEXT: por %xmm6, %xmm5 1651; SSSE3-NEXT: pshufb %xmm8, %xmm3 1652; SSSE3-NEXT: movdqa %xmm3, %xmm2 1653; SSSE3-NEXT: pand %xmm9, %xmm2 1654; SSSE3-NEXT: pshufb %xmm2, %xmm7 1655; SSSE3-NEXT: psrlw $4, %xmm3 1656; SSSE3-NEXT: pand %xmm9, %xmm3 1657; SSSE3-NEXT: pshufb %xmm3, %xmm4 1658; SSSE3-NEXT: por %xmm7, %xmm4 1659; SSSE3-NEXT: movdqa %xmm5, %xmm2 1660; SSSE3-NEXT: movdqa %xmm4, %xmm3 1661; SSSE3-NEXT: retq 1662; 1663; AVX1-LABEL: test_bitreverse_v32i16: 1664; AVX1: # BB#0: 1665; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1666; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1667; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1668; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1669; AVX1-NEXT: vpand 
%xmm4, %xmm2, %xmm5 1670; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1671; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1672; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1673; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1674; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1675; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1676; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1677; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1678; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 1679; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1680; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1681; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1682; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 1683; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 1684; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1685; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1686; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1687; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1688; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1689; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1690; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1691; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1692; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1693; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1694; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 1695; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 1696; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1697; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 1698; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 1699; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 1700; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1701; AVX1-NEXT: retq 1702; 1703; AVX2-LABEL: test_bitreverse_v32i16: 1704; AVX2: # BB#0: 1705; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1706; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1707; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1708; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 1709; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1710; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1711; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1712; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 1713; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1714; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 1715; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 1716; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1717; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 1718; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 1719; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 1720; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 1721; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 1722; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 1723; AVX2-NEXT: retq 1724; 1725; AVX512F-LABEL: test_bitreverse_v32i16: 1726; AVX512F: # BB#0: 1727; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1728; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1729; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1730; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm4 1731; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1732; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1733; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 1734; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 1735; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1736; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0 1737; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 1738; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1739; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm2 1740; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 1741; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 1742; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 1743; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1 
1744; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 1745; AVX512F-NEXT: retq 1746; 1747; AVX512BW-LABEL: test_bitreverse_v32i16: 1748; AVX512BW: # BB#0: 1749; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 1750; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1751; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 1752; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1753; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 1754; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 1755; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 1756; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1757; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 1758; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 1759; AVX512BW-NEXT: retq 1760; 1761; XOPAVX1-LABEL: test_bitreverse_v32i16: 1762; XOPAVX1: # BB#0: 1763; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1764; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1765; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1766; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1767; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1768; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1769; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1770; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1771; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 
1772; XOPAVX1-NEXT: retq 1773; 1774; XOPAVX2-LABEL: test_bitreverse_v32i16: 1775; XOPAVX2: # BB#0: 1776; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1777; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1778; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1779; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1780; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1781; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1782; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1783; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1784; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 1785; XOPAVX2-NEXT: retq 1786 %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) 1787 ret <32 x i16> %b 1788} 1789 1790define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { 1791; SSE2-LABEL: test_bitreverse_v16i32: 1792; SSE2: # BB#0: 1793; SSE2-NEXT: pxor %xmm14, %xmm14 1794; SSE2-NEXT: movdqa %xmm0, %xmm4 1795; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] 1796; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 1797; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 1798; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] 1799; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1800; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1801; SSE2-NEXT: packuswb %xmm4, %xmm0 1802; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1803; SSE2-NEXT: movdqa %xmm0, %xmm5 1804; SSE2-NEXT: pand %xmm8, %xmm5 1805; SSE2-NEXT: psllw $4, %xmm5 1806; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1807; SSE2-NEXT: pand %xmm4, %xmm5 1808; SSE2-NEXT: pand %xmm4, %xmm0 1809; 
SSE2-NEXT: psrlw $4, %xmm0 1810; SSE2-NEXT: pand %xmm8, %xmm0 1811; SSE2-NEXT: por %xmm5, %xmm0 1812; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1813; SSE2-NEXT: movdqa %xmm0, %xmm7 1814; SSE2-NEXT: pand %xmm5, %xmm7 1815; SSE2-NEXT: psllw $2, %xmm7 1816; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 1817; SSE2-NEXT: pand %xmm9, %xmm7 1818; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1819; SSE2-NEXT: pand %xmm10, %xmm0 1820; SSE2-NEXT: psrlw $2, %xmm0 1821; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 1822; SSE2-NEXT: pand %xmm11, %xmm0 1823; SSE2-NEXT: por %xmm7, %xmm0 1824; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1825; SSE2-NEXT: movdqa %xmm0, %xmm6 1826; SSE2-NEXT: pand %xmm7, %xmm6 1827; SSE2-NEXT: psrlw $1, %xmm6 1828; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1829; SSE2-NEXT: pand %xmm12, %xmm6 1830; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1831; SSE2-NEXT: pand %xmm13, %xmm0 1832; SSE2-NEXT: paddb %xmm0, %xmm0 1833; SSE2-NEXT: por %xmm6, %xmm0 1834; SSE2-NEXT: movdqa %xmm1, %xmm6 1835; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1836; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1837; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1838; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] 1839; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1840; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1841; SSE2-NEXT: 
packuswb %xmm6, %xmm1 1842; SSE2-NEXT: movdqa %xmm1, %xmm6 1843; SSE2-NEXT: pand %xmm8, %xmm6 1844; SSE2-NEXT: psllw $4, %xmm6 1845; SSE2-NEXT: pand %xmm4, %xmm6 1846; SSE2-NEXT: pand %xmm4, %xmm1 1847; SSE2-NEXT: psrlw $4, %xmm1 1848; SSE2-NEXT: pand %xmm8, %xmm1 1849; SSE2-NEXT: por %xmm6, %xmm1 1850; SSE2-NEXT: movdqa %xmm1, %xmm6 1851; SSE2-NEXT: pand %xmm5, %xmm6 1852; SSE2-NEXT: psllw $2, %xmm6 1853; SSE2-NEXT: pand %xmm9, %xmm6 1854; SSE2-NEXT: pand %xmm10, %xmm1 1855; SSE2-NEXT: psrlw $2, %xmm1 1856; SSE2-NEXT: pand %xmm11, %xmm1 1857; SSE2-NEXT: por %xmm6, %xmm1 1858; SSE2-NEXT: movdqa %xmm1, %xmm6 1859; SSE2-NEXT: pand %xmm7, %xmm6 1860; SSE2-NEXT: psrlw $1, %xmm6 1861; SSE2-NEXT: pand %xmm12, %xmm6 1862; SSE2-NEXT: pand %xmm13, %xmm1 1863; SSE2-NEXT: paddb %xmm1, %xmm1 1864; SSE2-NEXT: por %xmm6, %xmm1 1865; SSE2-NEXT: movdqa %xmm2, %xmm6 1866; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1867; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1868; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1869; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] 1870; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1871; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1872; SSE2-NEXT: packuswb %xmm6, %xmm2 1873; SSE2-NEXT: movdqa %xmm2, %xmm6 1874; SSE2-NEXT: pand %xmm8, %xmm6 1875; SSE2-NEXT: psllw $4, %xmm6 1876; SSE2-NEXT: pand %xmm4, %xmm6 1877; SSE2-NEXT: pand %xmm4, %xmm2 1878; SSE2-NEXT: psrlw $4, %xmm2 1879; SSE2-NEXT: pand %xmm8, %xmm2 1880; SSE2-NEXT: por %xmm6, %xmm2 1881; SSE2-NEXT: movdqa %xmm2, %xmm6 1882; SSE2-NEXT: pand %xmm5, %xmm6 1883; SSE2-NEXT: psllw $2, %xmm6 1884; SSE2-NEXT: pand %xmm9, %xmm6 1885; SSE2-NEXT: pand %xmm10, %xmm2 1886; SSE2-NEXT: psrlw 
$2, %xmm2 1887; SSE2-NEXT: pand %xmm11, %xmm2 1888; SSE2-NEXT: por %xmm6, %xmm2 1889; SSE2-NEXT: movdqa %xmm2, %xmm6 1890; SSE2-NEXT: pand %xmm7, %xmm6 1891; SSE2-NEXT: psrlw $1, %xmm6 1892; SSE2-NEXT: pand %xmm12, %xmm6 1893; SSE2-NEXT: pand %xmm13, %xmm2 1894; SSE2-NEXT: paddb %xmm2, %xmm2 1895; SSE2-NEXT: por %xmm6, %xmm2 1896; SSE2-NEXT: movdqa %xmm3, %xmm6 1897; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1898; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1899; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1900; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] 1901; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 1902; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 1903; SSE2-NEXT: packuswb %xmm6, %xmm3 1904; SSE2-NEXT: movdqa %xmm3, %xmm6 1905; SSE2-NEXT: pand %xmm8, %xmm6 1906; SSE2-NEXT: psllw $4, %xmm6 1907; SSE2-NEXT: pand %xmm4, %xmm6 1908; SSE2-NEXT: pand %xmm4, %xmm3 1909; SSE2-NEXT: psrlw $4, %xmm3 1910; SSE2-NEXT: pand %xmm8, %xmm3 1911; SSE2-NEXT: por %xmm6, %xmm3 1912; SSE2-NEXT: pand %xmm3, %xmm5 1913; SSE2-NEXT: psllw $2, %xmm5 1914; SSE2-NEXT: pand %xmm9, %xmm5 1915; SSE2-NEXT: pand %xmm10, %xmm3 1916; SSE2-NEXT: psrlw $2, %xmm3 1917; SSE2-NEXT: pand %xmm11, %xmm3 1918; SSE2-NEXT: por %xmm5, %xmm3 1919; SSE2-NEXT: pand %xmm3, %xmm7 1920; SSE2-NEXT: psrlw $1, %xmm7 1921; SSE2-NEXT: pand %xmm12, %xmm7 1922; SSE2-NEXT: pand %xmm13, %xmm3 1923; SSE2-NEXT: paddb %xmm3, %xmm3 1924; SSE2-NEXT: por %xmm7, %xmm3 1925; SSE2-NEXT: retq 1926; 1927; SSSE3-LABEL: test_bitreverse_v16i32: 1928; SSSE3: # BB#0: 1929; SSSE3-NEXT: movdqa %xmm1, %xmm5 1930; SSSE3-NEXT: movdqa %xmm0, %xmm1 1931; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = 
[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1932; SSSE3-NEXT: pshufb %xmm8, %xmm1 1933; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1934; SSSE3-NEXT: movdqa %xmm1, %xmm0 1935; SSSE3-NEXT: pand %xmm9, %xmm0 1936; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1937; SSSE3-NEXT: movdqa %xmm7, %xmm6 1938; SSSE3-NEXT: pshufb %xmm0, %xmm6 1939; SSSE3-NEXT: psrlw $4, %xmm1 1940; SSSE3-NEXT: pand %xmm9, %xmm1 1941; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1942; SSSE3-NEXT: movdqa %xmm4, %xmm0 1943; SSSE3-NEXT: pshufb %xmm1, %xmm0 1944; SSSE3-NEXT: por %xmm6, %xmm0 1945; SSSE3-NEXT: pshufb %xmm8, %xmm5 1946; SSSE3-NEXT: movdqa %xmm5, %xmm1 1947; SSSE3-NEXT: pand %xmm9, %xmm1 1948; SSSE3-NEXT: movdqa %xmm7, %xmm6 1949; SSSE3-NEXT: pshufb %xmm1, %xmm6 1950; SSSE3-NEXT: psrlw $4, %xmm5 1951; SSSE3-NEXT: pand %xmm9, %xmm5 1952; SSSE3-NEXT: movdqa %xmm4, %xmm1 1953; SSSE3-NEXT: pshufb %xmm5, %xmm1 1954; SSSE3-NEXT: por %xmm6, %xmm1 1955; SSSE3-NEXT: pshufb %xmm8, %xmm2 1956; SSSE3-NEXT: movdqa %xmm2, %xmm5 1957; SSSE3-NEXT: pand %xmm9, %xmm5 1958; SSSE3-NEXT: movdqa %xmm7, %xmm6 1959; SSSE3-NEXT: pshufb %xmm5, %xmm6 1960; SSSE3-NEXT: psrlw $4, %xmm2 1961; SSSE3-NEXT: pand %xmm9, %xmm2 1962; SSSE3-NEXT: movdqa %xmm4, %xmm5 1963; SSSE3-NEXT: pshufb %xmm2, %xmm5 1964; SSSE3-NEXT: por %xmm6, %xmm5 1965; SSSE3-NEXT: pshufb %xmm8, %xmm3 1966; SSSE3-NEXT: movdqa %xmm3, %xmm2 1967; SSSE3-NEXT: pand %xmm9, %xmm2 1968; SSSE3-NEXT: pshufb %xmm2, %xmm7 1969; SSSE3-NEXT: psrlw $4, %xmm3 1970; SSSE3-NEXT: pand %xmm9, %xmm3 1971; SSSE3-NEXT: pshufb %xmm3, %xmm4 1972; SSSE3-NEXT: por %xmm7, %xmm4 1973; SSSE3-NEXT: movdqa %xmm5, %xmm2 1974; SSSE3-NEXT: movdqa %xmm4, %xmm3 1975; SSSE3-NEXT: retq 1976; 1977; AVX1-LABEL: test_bitreverse_v16i32: 1978; AVX1: # BB#0: 1979; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1980; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1981; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1982; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1983; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1984; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1985; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1986; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1987; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1988; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1989; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1990; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1991; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1992; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 1993; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1994; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1995; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1996; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 1997; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 1998; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1999; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2000; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2001; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2002; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2003; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2004; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2005; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2006; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2007; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2008; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2009; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2010; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2011; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2012; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2013; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2014; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2015; AVX1-NEXT: retq 2016; 2017; AVX2-LABEL: test_bitreverse_v16i32: 2018; AVX2: # BB#0: 2019; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2020; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2021; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2022; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2023; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2024; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2025; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2026; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2027; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2028; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2029; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2030; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2031; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2032; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2033; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2034; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2035; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2036; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2037; AVX2-NEXT: retq 2038; 2039; AVX512F-LABEL: test_bitreverse_v16i32: 2040; AVX512F: # BB#0: 2041; AVX512F-NEXT: vpsrld $24, %zmm0, %zmm1 2042; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm2 2043; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 2044; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 2045; AVX512F-NEXT: vpslld $24, %zmm0, %zmm2 2046; AVX512F-NEXT: vpslld $8, %zmm0, %zmm0 2047; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2048; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2049; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 2050; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 2051; AVX512F-NEXT: vpslld $4, %zmm1, %zmm1 2052; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2053; AVX512F-NEXT: vpsrld $4, %zmm0, %zmm0 2054; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 2055; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 2056; AVX512F-NEXT: vpslld $2, %zmm1, %zmm1 2057; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2058; AVX512F-NEXT: vpsrld $2, %zmm0, %zmm0 2059; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 2060; 
AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 2061; AVX512F-NEXT: vpslld $1, %zmm1, %zmm1 2062; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2063; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 2064; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 2065; AVX512F-NEXT: retq 2066; 2067; AVX512BW-LABEL: test_bitreverse_v16i32: 2068; AVX512BW: # BB#0: 2069; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 2070; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2071; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2072; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2073; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2074; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2075; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2076; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2077; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2078; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2079; AVX512BW-NEXT: retq 2080; 2081; XOPAVX1-LABEL: test_bitreverse_v16i32: 2082; XOPAVX1: # BB#0: 2083; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2084; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2085; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2086; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2087; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2088; 
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT: retq
  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
  ret <16 x i32> %b
}

; Bit-reversal of <8 x i64> (512 bits total). Expected lowerings per subtarget:
; - SSE2: byte-reverse each i64 with unpack/pshuflw/pshufhw/packuswb shuffles,
;   then swap nibbles (shift by 4, masks 15/240), bit-pairs (shift by 2, masks
;   51/204) and single bits (shift/add by 1, masks 85/170) within each byte.
; - SSSE3/AVX1/AVX2/AVX512BW: byte-reverse via (v)pshufb, then reverse each
;   byte's bits by splitting it into nibbles (mask 15) and looking each nibble
;   up in a 16-entry pshufb table.
; - AVX512F (no BW byte shuffles): byte-swap each i64 with vpsllq/vpsrlq
;   shift-and-mask chains, then nibble/pair/bit swaps with 64-bit shifts.
; - XOP: a single vpperm per 128-bit lane; selector bytes 80-95 presumably
;   request bit-reversed byte selects — NOTE(review): confirm against the XOP
;   vpperm operation encoding.
; CHECK lines are autogenerated (see file header); regenerate with
; utils/update_llc_test_checks.py rather than editing by hand.
define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i64:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm5, %xmm7
; SSE2-NEXT: psllw $2, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT: pand %xmm9, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT: pand %xmm11, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pand %xmm7, %xmm6
; SSE2-NEXT: psrlw $1, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT: pand %xmm12, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm13, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: pand %xmm8, %xmm6
; SSE2-NEXT: psllw $4, %xmm6
; SSE2-NEXT: pand %xmm4, %xmm6
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: por %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: psllw $2, %xmm6
; SSE2-NEXT: pand %xmm9, %xmm6
; SSE2-NEXT: pand %xmm10, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm11, %xmm1
; SSE2-NEXT: por %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: pand %xmm7, %xmm6
; SSE2-NEXT: psrlw $1, %xmm6
; SSE2-NEXT: pand %xmm12, %xmm6
; SSE2-NEXT: pand %xmm13, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: por %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: pand %xmm8, %xmm6
; SSE2-NEXT: psllw $4, %xmm6
; SSE2-NEXT: pand %xmm4, %xmm6
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: por %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: psllw $2, %xmm6
; SSE2-NEXT: pand %xmm9, %xmm6
; SSE2-NEXT: pand %xmm10, %xmm2
; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: pand %xmm11, %xmm2
; SSE2-NEXT: por %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: pand %xmm7, %xmm6
; SSE2-NEXT: psrlw $1, %xmm6
; SSE2-NEXT: pand %xmm12, %xmm6
; SSE2-NEXT: pand %xmm13, %xmm2
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: por %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm6, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pand %xmm8, %xmm6
; SSE2-NEXT: psllw $4, %xmm6
; SSE2-NEXT: pand %xmm4, %xmm6
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: psrlw $4, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: pand %xmm9, %xmm5
; SSE2-NEXT: pand %xmm10, %xmm3
; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: pand %xmm11, %xmm3
; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm7
; SSE2-NEXT: psrlw $1, %xmm7
; SSE2-NEXT: pand %xmm12, %xmm7
; SSE2-NEXT: pand %xmm13, %xmm3
; SSE2-NEXT: paddb %xmm3, %xmm3
; SSE2-NEXT: por %xmm7, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i64:
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT: pshufb %xmm8, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm9, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm9, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: pshufb %xmm8, %xmm5
; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: pand %xmm9, %xmm1
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm1, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm5
; SSSE3-NEXT: pand %xmm9, %xmm5
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm5, %xmm1
; SSSE3-NEXT: por %xmm6, %xmm1
; SSSE3-NEXT: pshufb %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: pand %xmm9, %xmm5
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm5, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm2
; SSSE3-NEXT: pand %xmm9, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm5
; SSSE3-NEXT: pshufb %xmm2, %xmm5
; SSSE3-NEXT: por %xmm6, %xmm5
; SSSE3-NEXT: pshufb %xmm8, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pand %xmm9, %xmm2
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm3
; SSSE3-NEXT: pand %xmm9, %xmm3
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: por %xmm7, %xmm4
; SSSE3-NEXT: movdqa %xmm5, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm3
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v8i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v8i64:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_bitreverse_v8i64:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpsrlq $56, %zmm0, %zmm1
; AVX512F-NEXT: vpsrlq $40, %zmm0, %zmm2
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm3
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
; AVX512F-NEXT: vporq %zmm2, %zmm3, %zmm2
; AVX512F-NEXT: vpsllq $56, %zmm0, %zmm3
; AVX512F-NEXT: vpsllq $40, %zmm0, %zmm0
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; AVX512F-NEXT: vpsllq $4, %zmm1, %zmm1
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: vpsrlq $4, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; AVX512F-NEXT: vpsllq $2, %zmm1, %zmm1
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: vpsrlq $2, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; AVX512F-NEXT: vpsllq $1, %zmm1, %zmm1
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_bitreverse_v8i64:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT: retq
  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
  ret <8 x i64> %b
}

; Declarations of the llvm.bitreverse intrinsics exercised in this file, for
; every scalar width and for 128-, 256- and 512-bit vector element counts.
declare i8 @llvm.bitreverse.i8(i8) readnone
declare i16 @llvm.bitreverse.i16(i16) readnone
declare i32 @llvm.bitreverse.i32(i32) readnone
declare i64 @llvm.bitreverse.i64(i64) readnone

declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone

declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone

declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone