; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2

; Make sure we don't crash with avx512bw and xop
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw

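; Scalar tests: without XOP there is no bit-reverse instruction, so the
; lowering byte-reverses first (rolb/rolw/bswap) and then swaps nibbles,
; bit-pairs, and single bits with mask/shift/or sequences; XOP targets
; instead perform the whole reversal with one VPPERM through a
; constant-pool selector.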
define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: rolb $4, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andb $51, %al
; SSE-NEXT: shlb $2, %al
; SSE-NEXT: andb $-52, %dil
; SSE-NEXT: shrb $2, %dil
; SSE-NEXT: orb %al, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andb $85, %al
; SSE-NEXT: addb %al, %al
; SSE-NEXT: andb $-86, %dil
; SSE-NEXT: shrb %dil
; SSE-NEXT: addl %edi, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $edi killed $edi def $rdi
; AVX-NEXT: rolb $4, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andb $51, %al
; AVX-NEXT: shlb $2, %al
; AVX-NEXT: andb $-52, %dil
; AVX-NEXT: shrb $2, %dil
; AVX-NEXT: orb %al, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andb $85, %al
; AVX-NEXT: addb %al, %al
; AVX-NEXT: andb $-86, %dil
; AVX-NEXT: shrb %dil
; AVX-NEXT: addl %edi, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: # kill: def $al killed $al killed $eax
; XOP-NEXT: retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}

define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: rolw $8, %di
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $3855, %eax # imm = 0xF0F
; SSE-NEXT: shll $4, %eax
; SSE-NEXT: andl $61680, %edi # imm = 0xF0F0
; SSE-NEXT: shrl $4, %edi
; SSE-NEXT: orl %eax, %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $13107, %eax # imm = 0x3333
; SSE-NEXT: andl $52428, %edi # imm = 0xCCCC
; SSE-NEXT: shrl $2, %edi
; SSE-NEXT: leal (%rdi,%rax,4), %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $21845, %ecx # imm = 0x5555
; SSE-NEXT: andl $43690, %eax # imm = 0xAAAA
; SSE-NEXT: shrl %eax
; SSE-NEXT: leal (%rax,%rcx,2), %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $edi killed $edi def $rdi
; AVX-NEXT: rolw $8, %di
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $3855, %eax # imm = 0xF0F
; AVX-NEXT: shll $4, %eax
; AVX-NEXT: andl $61680, %edi # imm = 0xF0F0
; AVX-NEXT: shrl $4, %edi
; AVX-NEXT: orl %eax, %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $13107, %eax # imm = 0x3333
; AVX-NEXT: andl $52428, %edi # imm = 0xCCCC
; AVX-NEXT: shrl $2, %edi
; AVX-NEXT: leal (%rdi,%rax,4), %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $21845, %ecx # imm = 0x5555
; AVX-NEXT: andl $43690, %eax # imm = 0xAAAA
; AVX-NEXT: shrl %eax
; AVX-NEXT: leal (%rax,%rcx,2), %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: # kill: def $ax killed $ax killed $eax
; XOP-NEXT: retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}

define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: bswapl %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; SSE-NEXT: shll $4, %eax
; SSE-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; SSE-NEXT: shrl $4, %edi
; SSE-NEXT: orl %eax, %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $858993459, %eax # imm = 0x33333333
; SSE-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; SSE-NEXT: shrl $2, %edi
; SSE-NEXT: leal (%rdi,%rax,4), %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; SSE-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; SSE-NEXT: shrl %eax
; SSE-NEXT: leal (%rax,%rcx,2), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $edi killed $edi def $rdi
; AVX-NEXT: bswapl %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; AVX-NEXT: shll $4, %eax
; AVX-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; AVX-NEXT: shrl $4, %edi
; AVX-NEXT: orl %eax, %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $858993459, %eax # imm = 0x33333333
; AVX-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; AVX-NEXT: shrl $2, %edi
; AVX-NEXT: leal (%rdi,%rax,4), %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; AVX-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX-NEXT: shrl %eax
; AVX-NEXT: leal (%rax,%rcx,2), %eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i32:
; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}

define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE: # %bb.0:
; SSE-NEXT: bswapq %rdi
; SSE-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; SSE-NEXT: andq %rdi, %rax
; SSE-NEXT: shlq $4, %rax
; SSE-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; SSE-NEXT: andq %rdi, %rcx
; SSE-NEXT: shrq $4, %rcx
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE-NEXT: andq %rcx, %rax
; SSE-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; SSE-NEXT: andq %rcx, %rdx
; SSE-NEXT: shrq $2, %rdx
; SSE-NEXT: leaq (%rdx,%rax,4), %rax
; SSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE-NEXT: andq %rax, %rcx
; SSE-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; SSE-NEXT: andq %rax, %rdx
; SSE-NEXT: shrq %rdx
; SSE-NEXT: leaq (%rdx,%rcx,2), %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX: # %bb.0:
; AVX-NEXT: bswapq %rdi
; AVX-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; AVX-NEXT: andq %rdi, %rax
; AVX-NEXT: shlq $4, %rax
; AVX-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; AVX-NEXT: andq %rdi, %rcx
; AVX-NEXT: shrq $4, %rcx
; AVX-NEXT: orq %rax, %rcx
; AVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; AVX-NEXT: andq %rcx, %rax
; AVX-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; AVX-NEXT: andq %rcx, %rdx
; AVX-NEXT: shrq $2, %rdx
; AVX-NEXT: leaq (%rdx,%rax,4), %rax
; AVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; AVX-NEXT: andq %rax, %rcx
; AVX-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; AVX-NEXT: andq %rax, %rdx
; AVX-NEXT: shrq %rdx
; AVX-NEXT: leaq (%rdx,%rcx,2), %rax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i64:
; XOP: # %bb.0:
; XOP-NEXT: vmovq %rdi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovq %xmm0, %rax
; XOP-NEXT: retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}

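; 128-bit vector tests: SSE2 expands bitreverse with shift/mask/or
; sequences on byte elements, SSSE3 uses two PSHUFB nibble lookup tables,
; and XOP collapses everything into a single VPPERM.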
define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllw $4, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
  ret <16 x i8> %b
}

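; For elements wider than a byte, the vector lowering first byte-reverses
; each element (psrlw/psllw or pshuflw/pshufhw/packuswb on SSE2, a single
; PSHUFB on SSSE3 and later) and then reuses the byte-level bit-reversal
; sequence above.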
define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllw $4, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  ret <8 x i16> %b
}

define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllw $4, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
  ret <4 x i32> %b
}

define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllw $4, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
  ret <2 x i64> %b
}

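; 256-bit vector tests: AVX1 and XOP have no 256-bit integer shuffles, so
; they split the vector with vextractf128/vinsertf128 and reverse each
; 128-bit half; AVX2 and AVX512 run the nibble lookups on full ymm
; registers.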
define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllw $4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pand %xmm4, %xmm6
; SSE2-NEXT: paddb %xmm6, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: psllw $4, %xmm6
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm6, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psllw $2, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v32i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pshufb %xmm2, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm6, %xmm3
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm5
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm5, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
  ret <32 x i8> %b
}

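; The 256-bit i16/i32/i64 variants below prepend the per-element byte swap
; to the same v32i8 sequence; on AVX2/AVX512 that byte swap is a single
; ymm vpshufb.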
define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllw $4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: paddb %xmm7, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: psrlw $8, %xmm7
; SSE2-NEXT: psllw $8, %xmm2
; SSE2-NEXT: por %xmm7, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: psllw $4, %xmm7
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm7, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psllw $2, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
  ret <16 x i16> %b
}

define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllw $4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: paddb %xmm6, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psllw $2, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
  ret <8 x i32> %b
}

define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllw $4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: paddb %xmm6, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psllw $2, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v4i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
  ret <4 x i64> %b
}

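; 512-bit vector tests: AVX512F has no zmm byte shuffle, so it splits into
; ymm halves with vextracti64x4/vinserti64x4; AVX512BW runs the nibble
; lookups directly on zmm registers. The SSE/AVX variants process the
; vector in four or two chunks.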
define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm10
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pandn %xmm5, %xmm6
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: psllw $2, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: paddb %xmm7, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: psllw $4, %xmm7
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm7, %xmm4
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: pandn %xmm4, %xmm7
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: por %xmm7, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pand %xmm9, %xmm2
; SSE2-NEXT: psrlw $1, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm10, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: psrlw $4, %xmm10
; SSE2-NEXT: pand %xmm3, %xmm10
; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm10, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: paddb %xmm6, %xmm6
; SSE2-NEXT: pand %xmm9, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v64i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: pand %xmm8, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm9, %xmm6
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm5
; SSSE3-NEXT: pand %xmm8, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm5, %xmm0
; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: pand %xmm8, %xmm5
; SSSE3-NEXT: movdqa %xmm9, %xmm6
; SSSE3-NEXT: pshufb %xmm5, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm8, %xmm1
; SSSE3-NEXT: movdqa %xmm4, %xmm5
; SSSE3-NEXT: pshufb %xmm1, %xmm5
; SSSE3-NEXT: por %xmm6, %xmm5
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pand %xmm8, %xmm1
; SSSE3-NEXT: movdqa %xmm9, %xmm7
; SSSE3-NEXT: pshufb %xmm1, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm2
; SSSE3-NEXT: pand %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm6
; SSSE3-NEXT: pshufb %xmm2, %xmm6
; SSSE3-NEXT: por %xmm7, %xmm6
; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: pand %xmm8, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm9
; SSSE3-NEXT: psrlw $4, %xmm3
; SSSE3-NEXT: pand %xmm8, %xmm3
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: por %xmm9, %xmm4
; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: movdqa %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm3
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v64i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm4
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v64i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_bitreverse_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5
; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_bitreverse_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v64i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v64i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT: retq
 %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
 ret <64 x i8> %b
}

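; Note: the wider-element cases below reuse the same per-byte bit reversal
; and just prepend a byte swap per element. Each byte is reversed either via
; the pair of nibble lookup tables above (SSSE3 and later) or, on SSE2, by
; three shift-and-mask rounds; e.g. 0xb4 = 10110100b -> swap nibbles =
; 01001011b -> swap bit pairs = 00011110b -> swap adjacent bits = 00101101b
; = 0x2d, the bit reversal of 0xb4.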
define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pandn %xmm5, %xmm6
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pand %xmm10, %xmm6
; SSE2-NEXT: psllw $2, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: paddb %xmm7, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: psrlw $8, %xmm7
; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: por %xmm7, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: psllw $4, %xmm7
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: pandn %xmm7, %xmm5
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pand %xmm10, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pand %xmm6, %xmm5
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: psrlw $8, %xmm5
; SSE2-NEXT: psllw $8, %xmm2
; SSE2-NEXT: por %xmm5, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: pandn %xmm5, %xmm7
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: por %xmm7, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pand %xmm10, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: por %xmm5, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pand %xmm6, %xmm5
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pand %xmm9, %xmm2
; SSE2-NEXT: psrlw $1, %xmm2
; SSE2-NEXT: por %xmm5, %xmm2
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: psrlw $8, %xmm5
; SSE2-NEXT: psllw $8, %xmm4
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: psrlw $4, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm5, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm10
; SSE2-NEXT: psllw $2, %xmm10
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: por %xmm10, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: paddb %xmm6, %xmm6
; SSE2-NEXT: pand %xmm9, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v32i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT: pshufb %xmm8, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm9, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm9, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: pshufb %xmm8, %xmm5
; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: pand %xmm9, %xmm1
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm1, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm5
; SSSE3-NEXT: pand %xmm9, %xmm5
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm5, %xmm1
; SSSE3-NEXT: por %xmm6, %xmm1
; SSSE3-NEXT: pshufb %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: pand %xmm9, %xmm5
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm5, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm2
; SSSE3-NEXT: pand %xmm9, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm5
; SSSE3-NEXT: pshufb %xmm2, %xmm5
; SSSE3-NEXT: por %xmm6, %xmm5
; SSSE3-NEXT: pshufb %xmm8, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pand %xmm9, %xmm2
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm3
; SSSE3-NEXT: pand %xmm9, %xmm3
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: por %xmm7, %xmm4
; SSSE3-NEXT: movdqa %xmm5, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm3
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v32i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_bitreverse_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_bitreverse_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT: retq
 %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
 ret <32 x i16> %b
}

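; Note: same pattern as above, with the elements byte-swapped by a
; [3,2,1,0,...] pshufb (or punpck + pshuflw/pshufhw shuffles on SSE2) before
; the per-byte bit reversal; XOP again folds everything into a single vpperm.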
define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm11
; SSE2-NEXT: pxor %xmm10, %xmm10
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: pandn %xmm5, %xmm7
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm5, %xmm7
; SSE2-NEXT: psllw $2, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pand %xmm7, %xmm6
; SSE2-NEXT: paddb %xmm6, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: psllw $4, %xmm6
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm6, %xmm4
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pandn %xmm4, %xmm6
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: por %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pand %xmm9, %xmm2
; SSE2-NEXT: psrlw $1, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm11, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: psrlw $4, %xmm6
; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm7
; SSE2-NEXT: paddb %xmm7, %xmm7
; SSE2-NEXT: pand %xmm9, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: por %xmm7, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT: pshufb %xmm8, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm9, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm9, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: pshufb %xmm8, %xmm5
; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: pand %xmm9, %xmm1
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm1, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm5
; SSSE3-NEXT: pand %xmm9, %xmm5
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm5, %xmm1
; SSSE3-NEXT: por %xmm6, %xmm1
; SSSE3-NEXT: pshufb %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: pand %xmm9, %xmm5
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm5, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm2
; SSSE3-NEXT: pand %xmm9, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm5
; SSSE3-NEXT: pshufb %xmm2, %xmm5
; SSSE3-NEXT: por %xmm6, %xmm5
; SSSE3-NEXT: pshufb %xmm8, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pand %xmm9, %xmm2
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm3
; SSSE3-NEXT: pand %xmm9, %xmm3
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: por %xmm7, %xmm4
; SSSE3-NEXT: movdqa %xmm5, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm3
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_bitreverse_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_bitreverse_v16i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT: retq
 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
 ret <16 x i32> %b
}

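; Note: the i64 variant only changes the byte swap: [7,6,5,4,3,2,1,0,...] on
; SSSE3 and later, with an extra pshufd on SSE2 to swap the word halves.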
define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm11
; SSE2-NEXT: pxor %xmm10, %xmm10
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: pandn %xmm5, %xmm7
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm5, %xmm7
; SSE2-NEXT: psllw $2, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pand %xmm7, %xmm6
; SSE2-NEXT: paddb %xmm6, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: psllw $4, %xmm6
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm6, %xmm4
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pandn %xmm4, %xmm6
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: por %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pand %xmm9, %xmm2
; SSE2-NEXT: psrlw $1, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm11, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: psrlw $4, %xmm6
; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm7
; SSE2-NEXT: paddb %xmm7, %xmm7
; SSE2-NEXT: pand %xmm9, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: por %xmm7, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT: pshufb %xmm8, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm9, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm9, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: pshufb %xmm8, %xmm5
; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: pand %xmm9, %xmm1
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm1, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm5
; SSSE3-NEXT: pand %xmm9, %xmm5
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm5, %xmm1
; SSSE3-NEXT: por %xmm6, %xmm1
; SSSE3-NEXT: pshufb %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: pand %xmm9, %xmm5
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm5, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm2
; SSSE3-NEXT: pand %xmm9, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm5
; SSSE3-NEXT: pshufb %xmm2, %xmm5
; SSSE3-NEXT: por %xmm6, %xmm5
; SSSE3-NEXT: pshufb %xmm8, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pand %xmm9, %xmm2
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm3
; SSSE3-NEXT: pand %xmm9, %xmm3
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: por %xmm7, %xmm4
; SSSE3-NEXT: movdqa %xmm5, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm3
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_bitreverse_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_bitreverse_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT: retq
 %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
 ret <8 x i64> %b
}

;
; Constant Folding
;

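; Note: with constant operands the bitreverse calls below should be folded
; away at compile time, so the checks only expect materialized constants;
; e.g. for the i32 case, bitreverse(0xFF00FF00) == 0x00FF00FF.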
define i32 @fold_bitreverse_i32() nounwind {
; ALL-LABEL: fold_bitreverse_i32:
; ALL: # %bb.0:
; ALL-NEXT: movl $16711935, %eax # imm = 0xFF00FF
; ALL-NEXT: retq
 %b = call i32 @llvm.bitreverse.i32(i32 4278255360)
 ret i32 %b
}

define <16 x i8> @fold_bitreverse_v16i8() nounwind {
; SSE-LABEL: fold_bitreverse_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; SSE-NEXT: retq
;
; AVX-LABEL: fold_bitreverse_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; AVX-NEXT: retq
;
; XOP-LABEL: fold_bitreverse_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; XOP-NEXT: retq
 %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
 ret <16 x i8> %b
}

define <16 x i16> @fold_bitreverse_v16i16() nounwind {
; SSE-LABEL: fold_bitreverse_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
; SSE-NEXT: retq
;
; AVX-LABEL: fold_bitreverse_v16i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; AVX-NEXT: retq
;
; XOP-LABEL: fold_bitreverse_v16i16:
; XOP: # %bb.0:
; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; XOP-NEXT: retq
 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
 ret <16 x i16> %b
}

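; Note: the folded <16 x i32> constant is simply split across however many
; registers the configuration offers: four 128-bit loads on SSE, two 256-bit
; loads on AVX1/AVX2/XOP, and a single 512-bit load on AVX512.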
define <16 x i32> @fold_bitreverse_v16i32() nounwind {
; SSE-LABEL: fold_bitreverse_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
; SSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
; SSE-NEXT: movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
; SSE-NEXT: retq
;
; AVX1-LABEL: fold_bitreverse_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX1-NEXT: retq
;
; AVX2-LABEL: fold_bitreverse_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX2-NEXT: retq
;
; AVX512-LABEL: fold_bitreverse_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX512-NEXT: retq
;
; XOP-LABEL: fold_bitreverse_v16i32:
; XOP: # %bb.0:
; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; XOP-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; XOP-NEXT: retq
 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
 ret <16 x i32> %b
}

declare i8 @llvm.bitreverse.i8(i8) readnone
declare i16 @llvm.bitreverse.i16(i16) readnone
declare i32 @llvm.bitreverse.i32(i32) readnone
declare i64 @llvm.bitreverse.i64(i64) readnone

declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone

declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone

declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone