; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Tests lowering of the @llvm.bitreverse.* intrinsics (scalar i8/i16/i32/i64
; and 128-bit vector types) across SSE/AVX/AVX512/XOP/GFNI feature combinations.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNISSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512BW

; Make sure we don't crash with avx512bw and xop
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw

; Scalar i8: GPR sequence of rolb nibble-rotate plus 2-bit and 1-bit
; mask/shift/combine rounds; XOP instead reverses via a vpperm byte permute.
define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: rolb $4, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andb $51, %al
; SSE-NEXT: shlb $2, %al
; SSE-NEXT: andb $-52, %dil
; SSE-NEXT: shrb $2, %dil
; SSE-NEXT: orb %al, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andb $85, %al
; SSE-NEXT: addb %al, %al
; SSE-NEXT: andb $-86, %dil
; SSE-NEXT: shrb %dil
; SSE-NEXT: addl %edi, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $edi killed $edi def $rdi
; AVX-NEXT: rolb $4, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andb $51, %al
; AVX-NEXT: shlb $2, %al
; AVX-NEXT: andb $-52, %dil
; AVX-NEXT: shrb $2, %dil
; AVX-NEXT: orb %al, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andb $85, %al
; AVX-NEXT: addb %al, %al
; AVX-NEXT: andb $-86, %dil
; AVX-NEXT: shrb %dil
; AVX-NEXT: addl %edi, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: # kill: def $al killed $al killed $eax
; XOP-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_i8:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi
; GFNISSE-NEXT: rolb $4, %dil
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andb $51, %al
; GFNISSE-NEXT: shlb $2, %al
; GFNISSE-NEXT: andb $-52, %dil
; GFNISSE-NEXT: shrb $2, %dil
; GFNISSE-NEXT: orb %al, %dil
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andb $85, %al
; GFNISSE-NEXT: addb %al, %al
; GFNISSE-NEXT: andb $-86, %dil
; GFNISSE-NEXT: shrb %dil
; GFNISSE-NEXT: addl %edi, %eax
; GFNISSE-NEXT: # kill: def $al killed $al killed $eax
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_i8:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX-NEXT: rolb $4, %dil
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andb $51, %al
; GFNIAVX-NEXT: shlb $2, %al
; GFNIAVX-NEXT: andb $-52, %dil
; GFNIAVX-NEXT: shrb $2, %dil
; GFNIAVX-NEXT: orb %al, %dil
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andb $85, %al
; GFNIAVX-NEXT: addb %al, %al
; GFNIAVX-NEXT: andb $-86, %dil
; GFNIAVX-NEXT: shrb %dil
; GFNIAVX-NEXT: addl %edi, %eax
; GFNIAVX-NEXT: # kill: def $al killed $al killed $eax
; GFNIAVX-NEXT: retq
;
; GFNIAVX2-LABEL: test_bitreverse_i8:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX2-NEXT: rolb $4, %dil
; GFNIAVX2-NEXT: movl %edi, %eax
; GFNIAVX2-NEXT: andb $51, %al
; GFNIAVX2-NEXT: shlb $2, %al
; GFNIAVX2-NEXT: andb $-52, %dil
; GFNIAVX2-NEXT: shrb $2, %dil
; GFNIAVX2-NEXT: orb %al, %dil
; GFNIAVX2-NEXT: movl %edi, %eax
; GFNIAVX2-NEXT: andb $85, %al
; GFNIAVX2-NEXT: addb %al, %al
; GFNIAVX2-NEXT: andb $-86, %dil
; GFNIAVX2-NEXT: shrb %dil
; GFNIAVX2-NEXT: addl %edi, %eax
; GFNIAVX2-NEXT: # kill: def $al killed $al killed $eax
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i8:
; GFNIAVX512F: # %bb.0:
; GFNIAVX512F-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX512F-NEXT: rolb $4, %dil
; GFNIAVX512F-NEXT: movl %edi, %eax
; GFNIAVX512F-NEXT: andb $51, %al
; GFNIAVX512F-NEXT: shlb $2, %al
; GFNIAVX512F-NEXT: andb $-52, %dil
; GFNIAVX512F-NEXT: shrb $2, %dil
; GFNIAVX512F-NEXT: orb %al, %dil
; GFNIAVX512F-NEXT: movl %edi, %eax
; GFNIAVX512F-NEXT: andb $85, %al
; GFNIAVX512F-NEXT: addb %al, %al
; GFNIAVX512F-NEXT: andb $-86, %dil
; GFNIAVX512F-NEXT: shrb %dil
; GFNIAVX512F-NEXT: addl %edi, %eax
; GFNIAVX512F-NEXT: # kill: def $al killed $al killed $eax
; GFNIAVX512F-NEXT: retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i8:
; GFNIAVX512BW: # %bb.0:
; GFNIAVX512BW-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX512BW-NEXT: rolb $4, %dil
; GFNIAVX512BW-NEXT: movl %edi, %eax
; GFNIAVX512BW-NEXT: andb $51, %al
; GFNIAVX512BW-NEXT: shlb $2, %al
; GFNIAVX512BW-NEXT: andb $-52, %dil
; GFNIAVX512BW-NEXT: shrb $2, %dil
; GFNIAVX512BW-NEXT: orb %al, %dil
; GFNIAVX512BW-NEXT: movl %edi, %eax
; GFNIAVX512BW-NEXT: andb $85, %al
; GFNIAVX512BW-NEXT: addb %al, %al
; GFNIAVX512BW-NEXT: andb $-86, %dil
; GFNIAVX512BW-NEXT: shrb %dil
; GFNIAVX512BW-NEXT: addl %edi, %eax
; GFNIAVX512BW-NEXT: # kill: def $al killed $al killed $eax
; GFNIAVX512BW-NEXT: retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}

; Scalar i16: rolw $8 byte swap, then 4/2/1-bit mask-and-shift swap rounds.
define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: rolw $8, %di
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $3855, %eax # imm = 0xF0F
; SSE-NEXT: shll $4, %eax
; SSE-NEXT: andl $61680, %edi # imm = 0xF0F0
; SSE-NEXT: shrl $4, %edi
; SSE-NEXT: orl %eax, %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $13107, %eax # imm = 0x3333
; SSE-NEXT: andl $52428, %edi # imm = 0xCCCC
; SSE-NEXT: shrl $2, %edi
; SSE-NEXT: leal (%rdi,%rax,4), %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $21845, %ecx # imm = 0x5555
; SSE-NEXT: andl $43690, %eax # imm = 0xAAAA
; SSE-NEXT: shrl %eax
; SSE-NEXT: leal (%rax,%rcx,2), %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $edi killed $edi def $rdi
; AVX-NEXT: rolw $8, %di
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $3855, %eax # imm = 0xF0F
; AVX-NEXT: shll $4, %eax
; AVX-NEXT: andl $61680, %edi # imm = 0xF0F0
; AVX-NEXT: shrl $4, %edi
; AVX-NEXT: orl %eax, %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $13107, %eax # imm = 0x3333
; AVX-NEXT: andl $52428, %edi # imm = 0xCCCC
; AVX-NEXT: shrl $2, %edi
; AVX-NEXT: leal (%rdi,%rax,4), %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $21845, %ecx # imm = 0x5555
; AVX-NEXT: andl $43690, %eax # imm = 0xAAAA
; AVX-NEXT: shrl %eax
; AVX-NEXT: leal (%rax,%rcx,2), %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: # kill: def $ax killed $ax killed $eax
; XOP-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_i16:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi
; GFNISSE-NEXT: rolw $8, %di
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andl $3855, %eax # imm = 0xF0F
; GFNISSE-NEXT: shll $4, %eax
; GFNISSE-NEXT: andl $61680, %edi # imm = 0xF0F0
; GFNISSE-NEXT: shrl $4, %edi
; GFNISSE-NEXT: orl %eax, %edi
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andl $13107, %eax # imm = 0x3333
; GFNISSE-NEXT: andl $52428, %edi # imm = 0xCCCC
; GFNISSE-NEXT: shrl $2, %edi
; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax
; GFNISSE-NEXT: movl %eax, %ecx
; GFNISSE-NEXT: andl $21845, %ecx # imm = 0x5555
; GFNISSE-NEXT: andl $43690, %eax # imm = 0xAAAA
; GFNISSE-NEXT: shrl %eax
; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax
; GFNISSE-NEXT: # kill: def $ax killed $ax killed $eax
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_i16:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX-NEXT: rolw $8, %di
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andl $3855, %eax # imm = 0xF0F
; GFNIAVX-NEXT: shll $4, %eax
; GFNIAVX-NEXT: andl $61680, %edi # imm = 0xF0F0
; GFNIAVX-NEXT: shrl $4, %edi
; GFNIAVX-NEXT: orl %eax, %edi
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andl $13107, %eax # imm = 0x3333
; GFNIAVX-NEXT: andl $52428, %edi # imm = 0xCCCC
; GFNIAVX-NEXT: shrl $2, %edi
; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX-NEXT: movl %eax, %ecx
; GFNIAVX-NEXT: andl $21845, %ecx # imm = 0x5555
; GFNIAVX-NEXT: andl $43690, %eax # imm = 0xAAAA
; GFNIAVX-NEXT: shrl %eax
; GFNIAVX-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX-NEXT: # kill: def $ax killed $ax killed $eax
; GFNIAVX-NEXT: retq
;
; GFNIAVX2-LABEL: test_bitreverse_i16:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX2-NEXT: rolw $8, %di
; GFNIAVX2-NEXT: movl %edi, %eax
; GFNIAVX2-NEXT: andl $3855, %eax # imm = 0xF0F
; GFNIAVX2-NEXT: shll $4, %eax
; GFNIAVX2-NEXT: andl $61680, %edi # imm = 0xF0F0
; GFNIAVX2-NEXT: shrl $4, %edi
; GFNIAVX2-NEXT: orl %eax, %edi
; GFNIAVX2-NEXT: movl %edi, %eax
; GFNIAVX2-NEXT: andl $13107, %eax # imm = 0x3333
; GFNIAVX2-NEXT: andl $52428, %edi # imm = 0xCCCC
; GFNIAVX2-NEXT: shrl $2, %edi
; GFNIAVX2-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX2-NEXT: movl %eax, %ecx
; GFNIAVX2-NEXT: andl $21845, %ecx # imm = 0x5555
; GFNIAVX2-NEXT: andl $43690, %eax # imm = 0xAAAA
; GFNIAVX2-NEXT: shrl %eax
; GFNIAVX2-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX2-NEXT: # kill: def $ax killed $ax killed $eax
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i16:
; GFNIAVX512F: # %bb.0:
; GFNIAVX512F-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX512F-NEXT: rolw $8, %di
; GFNIAVX512F-NEXT: movl %edi, %eax
; GFNIAVX512F-NEXT: andl $3855, %eax # imm = 0xF0F
; GFNIAVX512F-NEXT: shll $4, %eax
; GFNIAVX512F-NEXT: andl $61680, %edi # imm = 0xF0F0
; GFNIAVX512F-NEXT: shrl $4, %edi
; GFNIAVX512F-NEXT: orl %eax, %edi
; GFNIAVX512F-NEXT: movl %edi, %eax
; GFNIAVX512F-NEXT: andl $13107, %eax # imm = 0x3333
; GFNIAVX512F-NEXT: andl $52428, %edi # imm = 0xCCCC
; GFNIAVX512F-NEXT: shrl $2, %edi
; GFNIAVX512F-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX512F-NEXT: movl %eax, %ecx
; GFNIAVX512F-NEXT: andl $21845, %ecx # imm = 0x5555
; GFNIAVX512F-NEXT: andl $43690, %eax # imm = 0xAAAA
; GFNIAVX512F-NEXT: shrl %eax
; GFNIAVX512F-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX512F-NEXT: # kill: def $ax killed $ax killed $eax
; GFNIAVX512F-NEXT: retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i16:
; GFNIAVX512BW: # %bb.0:
; GFNIAVX512BW-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX512BW-NEXT: rolw $8, %di
; GFNIAVX512BW-NEXT: movl %edi, %eax
; GFNIAVX512BW-NEXT: andl $3855, %eax # imm = 0xF0F
; GFNIAVX512BW-NEXT: shll $4, %eax
; GFNIAVX512BW-NEXT: andl $61680, %edi # imm = 0xF0F0
; GFNIAVX512BW-NEXT: shrl $4, %edi
; GFNIAVX512BW-NEXT: orl %eax, %edi
; GFNIAVX512BW-NEXT: movl %edi, %eax
; GFNIAVX512BW-NEXT: andl $13107, %eax # imm = 0x3333
; GFNIAVX512BW-NEXT: andl $52428, %edi # imm = 0xCCCC
; GFNIAVX512BW-NEXT: shrl $2, %edi
; GFNIAVX512BW-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX512BW-NEXT: movl %eax, %ecx
; GFNIAVX512BW-NEXT: andl $21845, %ecx # imm = 0x5555
; GFNIAVX512BW-NEXT: andl $43690, %eax # imm = 0xAAAA
; GFNIAVX512BW-NEXT: shrl %eax
; GFNIAVX512BW-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
; GFNIAVX512BW-NEXT: retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}

; Scalar i32: bswapl byte swap, then 4/2/1-bit mask-and-shift swap rounds.
define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: bswapl %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; SSE-NEXT: shll $4, %eax
; SSE-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; SSE-NEXT: shrl $4, %edi
; SSE-NEXT: orl %eax, %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $858993459, %eax # imm = 0x33333333
; SSE-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; SSE-NEXT: shrl $2, %edi
; SSE-NEXT: leal (%rdi,%rax,4), %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; SSE-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; SSE-NEXT: shrl %eax
; SSE-NEXT: leal (%rax,%rcx,2), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $edi killed $edi def $rdi
; AVX-NEXT: bswapl %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; AVX-NEXT: shll $4, %eax
; AVX-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; AVX-NEXT: shrl $4, %edi
; AVX-NEXT: orl %eax, %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $858993459, %eax # imm = 0x33333333
; AVX-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; AVX-NEXT: shrl $2, %edi
; AVX-NEXT: leal (%rdi,%rax,4), %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; AVX-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX-NEXT: shrl %eax
; AVX-NEXT: leal (%rax,%rcx,2), %eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i32:
; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_i32:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi
; GFNISSE-NEXT: bswapl %edi
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; GFNISSE-NEXT: shll $4, %eax
; GFNISSE-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNISSE-NEXT: shrl $4, %edi
; GFNISSE-NEXT: orl %eax, %edi
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andl $858993459, %eax # imm = 0x33333333
; GFNISSE-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNISSE-NEXT: shrl $2, %edi
; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax
; GFNISSE-NEXT: movl %eax, %ecx
; GFNISSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; GFNISSE-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNISSE-NEXT: shrl %eax
; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_i32:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX-NEXT: bswapl %edi
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX-NEXT: shll $4, %eax
; GFNIAVX-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNIAVX-NEXT: shrl $4, %edi
; GFNIAVX-NEXT: orl %eax, %edi
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andl $858993459, %eax # imm = 0x33333333
; GFNIAVX-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNIAVX-NEXT: shrl $2, %edi
; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX-NEXT: movl %eax, %ecx
; GFNIAVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; GFNIAVX-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNIAVX-NEXT: shrl %eax
; GFNIAVX-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX-NEXT: retq
;
; GFNIAVX2-LABEL: test_bitreverse_i32:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX2-NEXT: bswapl %edi
; GFNIAVX2-NEXT: movl %edi, %eax
; GFNIAVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX2-NEXT: shll $4, %eax
; GFNIAVX2-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNIAVX2-NEXT: shrl $4, %edi
; GFNIAVX2-NEXT: orl %eax, %edi
; GFNIAVX2-NEXT: movl %edi, %eax
; GFNIAVX2-NEXT: andl $858993459, %eax # imm = 0x33333333
; GFNIAVX2-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNIAVX2-NEXT: shrl $2, %edi
; GFNIAVX2-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX2-NEXT: movl %eax, %ecx
; GFNIAVX2-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; GFNIAVX2-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNIAVX2-NEXT: shrl %eax
; GFNIAVX2-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i32:
; GFNIAVX512F: # %bb.0:
; GFNIAVX512F-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX512F-NEXT: bswapl %edi
; GFNIAVX512F-NEXT: movl %edi, %eax
; GFNIAVX512F-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX512F-NEXT: shll $4, %eax
; GFNIAVX512F-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNIAVX512F-NEXT: shrl $4, %edi
; GFNIAVX512F-NEXT: orl %eax, %edi
; GFNIAVX512F-NEXT: movl %edi, %eax
; GFNIAVX512F-NEXT: andl $858993459, %eax # imm = 0x33333333
; GFNIAVX512F-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNIAVX512F-NEXT: shrl $2, %edi
; GFNIAVX512F-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX512F-NEXT: movl %eax, %ecx
; GFNIAVX512F-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; GFNIAVX512F-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNIAVX512F-NEXT: shrl %eax
; GFNIAVX512F-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX512F-NEXT: retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i32:
; GFNIAVX512BW: # %bb.0:
; GFNIAVX512BW-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX512BW-NEXT: bswapl %edi
; GFNIAVX512BW-NEXT: movl %edi, %eax
; GFNIAVX512BW-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX512BW-NEXT: shll $4, %eax
; GFNIAVX512BW-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; GFNIAVX512BW-NEXT: shrl $4, %edi
; GFNIAVX512BW-NEXT: orl %eax, %edi
; GFNIAVX512BW-NEXT: movl %edi, %eax
; GFNIAVX512BW-NEXT: andl $858993459, %eax # imm = 0x33333333
; GFNIAVX512BW-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; GFNIAVX512BW-NEXT: shrl $2, %edi
; GFNIAVX512BW-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX512BW-NEXT: movl %eax, %ecx
; GFNIAVX512BW-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; GFNIAVX512BW-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; GFNIAVX512BW-NEXT: shrl %eax
; GFNIAVX512BW-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX512BW-NEXT: retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}

; Scalar i64: bswapq byte swap, then 64-bit movabsq mask constants for the
; 4/2/1-bit swap rounds; XOP again reverses via vpperm.
define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE: # %bb.0:
; SSE-NEXT: bswapq %rdi
; SSE-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; SSE-NEXT: andq %rdi, %rax
; SSE-NEXT: shlq $4, %rax
; SSE-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; SSE-NEXT: andq %rdi, %rcx
; SSE-NEXT: shrq $4, %rcx
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE-NEXT: andq %rcx, %rax
; SSE-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; SSE-NEXT: andq %rcx, %rdx
; SSE-NEXT: shrq $2, %rdx
; SSE-NEXT: leaq (%rdx,%rax,4), %rax
; SSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE-NEXT: andq %rax, %rcx
; SSE-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; SSE-NEXT: andq %rax, %rdx
; SSE-NEXT: shrq %rdx
; SSE-NEXT: leaq (%rdx,%rcx,2), %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX: # %bb.0:
; AVX-NEXT: bswapq %rdi
; AVX-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; AVX-NEXT: andq %rdi, %rax
; AVX-NEXT: shlq $4, %rax
; AVX-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; AVX-NEXT: andq %rdi, %rcx
; AVX-NEXT: shrq $4, %rcx
; AVX-NEXT: orq %rax, %rcx
; AVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; AVX-NEXT: andq %rcx, %rax
; AVX-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; AVX-NEXT: andq %rcx, %rdx
; AVX-NEXT: shrq $2, %rdx
; AVX-NEXT: leaq (%rdx,%rax,4), %rax
; AVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; AVX-NEXT: andq %rax, %rcx
; AVX-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; AVX-NEXT: andq %rax, %rdx
; AVX-NEXT: shrq %rdx
; AVX-NEXT: leaq (%rdx,%rcx,2), %rax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i64:
; XOP: # %bb.0:
; XOP-NEXT: vmovq %rdi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovq %xmm0, %rax
; XOP-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_i64:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: bswapq %rdi
; GFNISSE-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; GFNISSE-NEXT: andq %rdi, %rax
; GFNISSE-NEXT: shlq $4, %rax
; GFNISSE-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; GFNISSE-NEXT: andq %rdi, %rcx
; GFNISSE-NEXT: shrq $4, %rcx
; GFNISSE-NEXT: orq %rax, %rcx
; GFNISSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNISSE-NEXT: andq %rcx, %rax
; GFNISSE-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; GFNISSE-NEXT: andq %rcx, %rdx
; GFNISSE-NEXT: shrq $2, %rdx
; GFNISSE-NEXT: leaq (%rdx,%rax,4), %rax
; GFNISSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNISSE-NEXT: andq %rax, %rcx
; GFNISSE-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; GFNISSE-NEXT: andq %rax, %rdx
; GFNISSE-NEXT: shrq %rdx
; GFNISSE-NEXT: leaq (%rdx,%rcx,2), %rax
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_i64:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: bswapq %rdi
; GFNIAVX-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX-NEXT: andq %rdi, %rax
; GFNIAVX-NEXT: shlq $4, %rax
; GFNIAVX-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; GFNIAVX-NEXT: andq %rdi, %rcx
; GFNIAVX-NEXT: shrq $4, %rcx
; GFNIAVX-NEXT: orq %rax, %rcx
; GFNIAVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNIAVX-NEXT: andq %rcx, %rax
; GFNIAVX-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; GFNIAVX-NEXT: andq %rcx, %rdx
; GFNIAVX-NEXT: shrq $2, %rdx
; GFNIAVX-NEXT: leaq (%rdx,%rax,4), %rax
; GFNIAVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNIAVX-NEXT: andq %rax, %rcx
; GFNIAVX-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; GFNIAVX-NEXT: andq %rax, %rdx
; GFNIAVX-NEXT: shrq %rdx
; GFNIAVX-NEXT: leaq (%rdx,%rcx,2), %rax
; GFNIAVX-NEXT: retq
;
; GFNIAVX2-LABEL: test_bitreverse_i64:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: bswapq %rdi
; GFNIAVX2-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX2-NEXT: andq %rdi, %rax
; GFNIAVX2-NEXT: shlq $4, %rax
; GFNIAVX2-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; GFNIAVX2-NEXT: andq %rdi, %rcx
; GFNIAVX2-NEXT: shrq $4, %rcx
; GFNIAVX2-NEXT: orq %rax, %rcx
; GFNIAVX2-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNIAVX2-NEXT: andq %rcx, %rax
; GFNIAVX2-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; GFNIAVX2-NEXT: andq %rcx, %rdx
; GFNIAVX2-NEXT: shrq $2, %rdx
; GFNIAVX2-NEXT: leaq (%rdx,%rax,4), %rax
; GFNIAVX2-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNIAVX2-NEXT: andq %rax, %rcx
; GFNIAVX2-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; GFNIAVX2-NEXT: andq %rax, %rdx
; GFNIAVX2-NEXT: shrq %rdx
; GFNIAVX2-NEXT: leaq (%rdx,%rcx,2), %rax
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i64:
; GFNIAVX512F: # %bb.0:
; GFNIAVX512F-NEXT: bswapq %rdi
; GFNIAVX512F-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX512F-NEXT: andq %rdi, %rax
; GFNIAVX512F-NEXT: shlq $4, %rax
; GFNIAVX512F-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; GFNIAVX512F-NEXT: andq %rdi, %rcx
; GFNIAVX512F-NEXT: shrq $4, %rcx
; GFNIAVX512F-NEXT: orq %rax, %rcx
; GFNIAVX512F-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNIAVX512F-NEXT: andq %rcx, %rax
; GFNIAVX512F-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; GFNIAVX512F-NEXT: andq %rcx, %rdx
; GFNIAVX512F-NEXT: shrq $2, %rdx
; GFNIAVX512F-NEXT: leaq (%rdx,%rax,4), %rax
; GFNIAVX512F-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNIAVX512F-NEXT: andq %rax, %rcx
; GFNIAVX512F-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; GFNIAVX512F-NEXT: andq %rax, %rdx
; GFNIAVX512F-NEXT: shrq %rdx
; GFNIAVX512F-NEXT: leaq (%rdx,%rcx,2), %rax
; GFNIAVX512F-NEXT: retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i64:
; GFNIAVX512BW: # %bb.0:
; GFNIAVX512BW-NEXT: bswapq %rdi
; GFNIAVX512BW-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX512BW-NEXT: andq %rdi, %rax
; GFNIAVX512BW-NEXT: shlq $4, %rax
; GFNIAVX512BW-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; GFNIAVX512BW-NEXT: andq %rdi, %rcx
; GFNIAVX512BW-NEXT: shrq $4, %rcx
; GFNIAVX512BW-NEXT: orq %rax, %rcx
; GFNIAVX512BW-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNIAVX512BW-NEXT: andq %rcx, %rax
; GFNIAVX512BW-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; GFNIAVX512BW-NEXT: andq %rcx, %rdx
; GFNIAVX512BW-NEXT: shrq $2, %rdx
; GFNIAVX512BW-NEXT: leaq (%rdx,%rax,4), %rax
; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNIAVX512BW-NEXT: andq %rax, %rcx
; GFNIAVX512BW-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; GFNIAVX512BW-NEXT: andq %rax, %rdx
; GFNIAVX512BW-NEXT: shrq %rdx
; GFNIAVX512BW-NEXT: leaq (%rdx,%rcx,2), %rax
; GFNIAVX512BW-NEXT: retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}

; v16i8: SSE2 uses shift+mask+or rounds; SSSE3/AVX use per-nibble pshufb
; lookup tables; XOP uses vpperm; GFNI targets use a single gf2p8affineqb.
define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllw $4, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_v16i8:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: gf2p8affineqb $0, {{.*}}(%rip), %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_v16i8:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT: retq
;
; GFNIAVX2-LABEL: test_bitreverse_v16i8:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v16i8:
; GFNIAVX512F: # %bb.0:
; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX512F-NEXT: retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v16i8:
; GFNIAVX512BW: # %bb.0:
; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX512BW-NEXT: retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
  ret <16 x i8> %b
}

; v8i16: a within-element byte swap (shifts on SSE2, pshufb elsewhere)
; followed by the same per-byte reversal as the v16i8 case.
define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllw $4, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_v8i16:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNISSE-NEXT: gf2p8affineqb $0, {{.*}}(%rip), %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_v8i16:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT: retq
;
; GFNIAVX2-LABEL: test_bitreverse_v8i16:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v8i16:
; GFNIAVX512F: # %bb.0:
; GFNIAVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX512F-NEXT: retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v8i16:
; GFNIAVX512BW: # %bb.0:
; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0
; GFNIAVX512BW-NEXT: retq
  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  ret <8 x i16> %b
}

define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: 
pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 870; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 871; SSE2-NEXT: packuswb %xmm2, %xmm0 872; SSE2-NEXT: movdqa %xmm0, %xmm1 873; SSE2-NEXT: psllw $4, %xmm1 874; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 875; SSE2-NEXT: psrlw $4, %xmm0 876; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 877; SSE2-NEXT: por %xmm1, %xmm0 878; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 879; SSE2-NEXT: pand %xmm0, %xmm1 880; SSE2-NEXT: psllw $2, %xmm1 881; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 882; SSE2-NEXT: psrlw $2, %xmm0 883; SSE2-NEXT: por %xmm1, %xmm0 884; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 885; SSE2-NEXT: pand %xmm0, %xmm1 886; SSE2-NEXT: paddb %xmm1, %xmm1 887; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 888; SSE2-NEXT: psrlw $1, %xmm0 889; SSE2-NEXT: por %xmm1, %xmm0 890; SSE2-NEXT: retq 891; 892; SSSE3-LABEL: test_bitreverse_v4i32: 893; SSSE3: # %bb.0: 894; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 895; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 896; SSSE3-NEXT: movdqa %xmm0, %xmm2 897; SSSE3-NEXT: pand %xmm1, %xmm2 898; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 899; SSSE3-NEXT: pshufb %xmm2, %xmm3 900; SSSE3-NEXT: psrlw $4, %xmm0 901; SSSE3-NEXT: pand %xmm1, %xmm0 902; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 903; SSSE3-NEXT: pshufb %xmm0, %xmm1 904; SSSE3-NEXT: por %xmm3, %xmm1 905; SSSE3-NEXT: movdqa %xmm1, %xmm0 906; SSSE3-NEXT: retq 907; 908; AVX-LABEL: test_bitreverse_v4i32: 909; AVX: # %bb.0: 910; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 911; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 912; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 913; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 914; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 915; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 916; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 917; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 918; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 919; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 920; AVX-NEXT: retq 921; 922; XOP-LABEL: test_bitreverse_v4i32: 923; XOP: # %bb.0: 924; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 925; XOP-NEXT: retq 926; 927; GFNISSE-LABEL: test_bitreverse_v4i32: 928; GFNISSE: # %bb.0: 929; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 930; GFNISSE-NEXT: gf2p8affineqb $0, {{.*}}(%rip), %xmm0 931; GFNISSE-NEXT: retq 932; 933; GFNIAVX-LABEL: test_bitreverse_v4i32: 934; GFNIAVX: # %bb.0: 935; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 936; GFNIAVX-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 937; GFNIAVX-NEXT: retq 938; 939; GFNIAVX2-LABEL: test_bitreverse_v4i32: 940; GFNIAVX2: # %bb.0: 941; GFNIAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 942; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 943; GFNIAVX2-NEXT: retq 944; 945; GFNIAVX512F-LABEL: test_bitreverse_v4i32: 946; GFNIAVX512F: # %bb.0: 947; GFNIAVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 948; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 949; GFNIAVX512F-NEXT: retq 950; 951; GFNIAVX512BW-LABEL: test_bitreverse_v4i32: 952; GFNIAVX512BW: # %bb.0: 953; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 954; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 955; GFNIAVX512BW-NEXT: retq 956 %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) 957 ret <4 x i32> %b 958} 959 960define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { 961; SSE2-LABEL: test_bitreverse_v2i64: 962; SSE2: # %bb.0: 
963; SSE2-NEXT: pxor %xmm1, %xmm1 964; SSE2-NEXT: movdqa %xmm0, %xmm2 965; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 966; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 967; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 968; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 969; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 970; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 971; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 972; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 973; SSE2-NEXT: packuswb %xmm2, %xmm0 974; SSE2-NEXT: movdqa %xmm0, %xmm1 975; SSE2-NEXT: psllw $4, %xmm1 976; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 977; SSE2-NEXT: psrlw $4, %xmm0 978; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 979; SSE2-NEXT: por %xmm1, %xmm0 980; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 981; SSE2-NEXT: pand %xmm0, %xmm1 982; SSE2-NEXT: psllw $2, %xmm1 983; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 984; SSE2-NEXT: psrlw $2, %xmm0 985; SSE2-NEXT: por %xmm1, %xmm0 986; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 987; SSE2-NEXT: pand %xmm0, %xmm1 988; SSE2-NEXT: paddb %xmm1, %xmm1 989; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 990; SSE2-NEXT: psrlw $1, %xmm0 991; SSE2-NEXT: por %xmm1, %xmm0 992; SSE2-NEXT: retq 993; 994; SSSE3-LABEL: test_bitreverse_v2i64: 995; SSSE3: # %bb.0: 996; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 997; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 998; SSSE3-NEXT: movdqa %xmm0, %xmm2 999; SSSE3-NEXT: pand %xmm1, %xmm2 1000; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1001; 
SSSE3-NEXT: pshufb %xmm2, %xmm3 1002; SSSE3-NEXT: psrlw $4, %xmm0 1003; SSSE3-NEXT: pand %xmm1, %xmm0 1004; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1005; SSSE3-NEXT: pshufb %xmm0, %xmm1 1006; SSSE3-NEXT: por %xmm3, %xmm1 1007; SSSE3-NEXT: movdqa %xmm1, %xmm0 1008; SSSE3-NEXT: retq 1009; 1010; AVX-LABEL: test_bitreverse_v2i64: 1011; AVX: # %bb.0: 1012; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1013; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1014; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 1015; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1016; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1017; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1018; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 1019; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1020; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 1021; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 1022; AVX-NEXT: retq 1023; 1024; XOP-LABEL: test_bitreverse_v2i64: 1025; XOP: # %bb.0: 1026; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 1027; XOP-NEXT: retq 1028; 1029; GFNISSE-LABEL: test_bitreverse_v2i64: 1030; GFNISSE: # %bb.0: 1031; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1032; GFNISSE-NEXT: gf2p8affineqb $0, {{.*}}(%rip), %xmm0 1033; GFNISSE-NEXT: retq 1034; 1035; GFNIAVX-LABEL: test_bitreverse_v2i64: 1036; GFNIAVX: # %bb.0: 1037; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1038; GFNIAVX-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 1039; GFNIAVX-NEXT: retq 1040; 1041; GFNIAVX2-LABEL: test_bitreverse_v2i64: 1042; GFNIAVX2: # %bb.0: 1043; GFNIAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1044; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 1045; GFNIAVX2-NEXT: retq 1046; 1047; GFNIAVX512F-LABEL: test_bitreverse_v2i64: 1048; GFNIAVX512F: # %bb.0: 1049; 
GFNIAVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1050; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 1051; GFNIAVX512F-NEXT: retq 1052; 1053; GFNIAVX512BW-LABEL: test_bitreverse_v2i64: 1054; GFNIAVX512BW: # %bb.0: 1055; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1056; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 1057; GFNIAVX512BW-NEXT: retq 1058 %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) 1059 ret <2 x i64> %b 1060} 1061 1062define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind { 1063; SSE2-LABEL: test_bitreverse_v32i8: 1064; SSE2: # %bb.0: 1065; SSE2-NEXT: movdqa %xmm1, %xmm2 1066; SSE2-NEXT: movdqa %xmm0, %xmm3 1067; SSE2-NEXT: psllw $4, %xmm3 1068; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1069; SSE2-NEXT: movdqa %xmm1, %xmm4 1070; SSE2-NEXT: pandn %xmm3, %xmm4 1071; SSE2-NEXT: psrlw $4, %xmm0 1072; SSE2-NEXT: pand %xmm1, %xmm0 1073; SSE2-NEXT: por %xmm4, %xmm0 1074; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1075; SSE2-NEXT: movdqa %xmm0, %xmm4 1076; SSE2-NEXT: pand %xmm3, %xmm4 1077; SSE2-NEXT: psllw $2, %xmm4 1078; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1079; SSE2-NEXT: pand %xmm5, %xmm0 1080; SSE2-NEXT: psrlw $2, %xmm0 1081; SSE2-NEXT: por %xmm4, %xmm0 1082; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1083; SSE2-NEXT: movdqa %xmm0, %xmm6 1084; SSE2-NEXT: pand %xmm4, %xmm6 1085; SSE2-NEXT: paddb %xmm6, %xmm6 1086; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1087; SSE2-NEXT: pand %xmm7, %xmm0 1088; SSE2-NEXT: psrlw $1, %xmm0 1089; SSE2-NEXT: por %xmm6, %xmm0 1090; SSE2-NEXT: movdqa %xmm2, %xmm6 1091; SSE2-NEXT: psllw $4, %xmm6 1092; SSE2-NEXT: psrlw $4, %xmm2 1093; SSE2-NEXT: pand 
%xmm1, %xmm2 1094; SSE2-NEXT: pandn %xmm6, %xmm1 1095; SSE2-NEXT: por %xmm2, %xmm1 1096; SSE2-NEXT: pand %xmm1, %xmm3 1097; SSE2-NEXT: psllw $2, %xmm3 1098; SSE2-NEXT: pand %xmm5, %xmm1 1099; SSE2-NEXT: psrlw $2, %xmm1 1100; SSE2-NEXT: por %xmm3, %xmm1 1101; SSE2-NEXT: pand %xmm1, %xmm4 1102; SSE2-NEXT: paddb %xmm4, %xmm4 1103; SSE2-NEXT: pand %xmm7, %xmm1 1104; SSE2-NEXT: psrlw $1, %xmm1 1105; SSE2-NEXT: por %xmm4, %xmm1 1106; SSE2-NEXT: retq 1107; 1108; SSSE3-LABEL: test_bitreverse_v32i8: 1109; SSSE3: # %bb.0: 1110; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1111; SSSE3-NEXT: movdqa %xmm0, %xmm2 1112; SSSE3-NEXT: pand %xmm4, %xmm2 1113; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1114; SSSE3-NEXT: movdqa %xmm5, %xmm6 1115; SSSE3-NEXT: pshufb %xmm2, %xmm6 1116; SSSE3-NEXT: psrlw $4, %xmm0 1117; SSSE3-NEXT: pand %xmm4, %xmm0 1118; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1119; SSSE3-NEXT: movdqa %xmm2, %xmm3 1120; SSSE3-NEXT: pshufb %xmm0, %xmm3 1121; SSSE3-NEXT: por %xmm6, %xmm3 1122; SSSE3-NEXT: movdqa %xmm1, %xmm0 1123; SSSE3-NEXT: pand %xmm4, %xmm0 1124; SSSE3-NEXT: pshufb %xmm0, %xmm5 1125; SSSE3-NEXT: psrlw $4, %xmm1 1126; SSSE3-NEXT: pand %xmm4, %xmm1 1127; SSSE3-NEXT: pshufb %xmm1, %xmm2 1128; SSSE3-NEXT: por %xmm5, %xmm2 1129; SSSE3-NEXT: movdqa %xmm3, %xmm0 1130; SSSE3-NEXT: movdqa %xmm2, %xmm1 1131; SSSE3-NEXT: retq 1132; 1133; AVX1-LABEL: test_bitreverse_v32i8: 1134; AVX1: # %bb.0: 1135; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1136; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1137; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 1138; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1139; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 1140; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1141; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1142; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = 
[0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1143; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 1144; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 1145; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 1146; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 1147; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1148; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1149; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 1150; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 1151; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1152; AVX1-NEXT: retq 1153; 1154; AVX2-LABEL: test_bitreverse_v32i8: 1155; AVX2: # %bb.0: 1156; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1157; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1158; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1159; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1160; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1161; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1162; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1163; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1164; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1165; AVX2-NEXT: retq 1166; 1167; AVX512-LABEL: test_bitreverse_v32i8: 1168; AVX512: # %bb.0: 1169; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1170; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1171; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1172; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1173; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1174; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1175; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1176; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1177; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1178; 
AVX512-NEXT: retq 1179; 1180; XOPAVX1-LABEL: test_bitreverse_v32i8: 1181; XOPAVX1: # %bb.0: 1182; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1183; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1184; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1185; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1186; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1187; XOPAVX1-NEXT: retq 1188; 1189; XOPAVX2-LABEL: test_bitreverse_v32i8: 1190; XOPAVX2: # %bb.0: 1191; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1192; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1193; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1194; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1195; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1196; XOPAVX2-NEXT: retq 1197; 1198; GFNISSE-LABEL: test_bitreverse_v32i8: 1199; GFNISSE: # %bb.0: 1200; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] 1201; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 1202; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 1203; GFNISSE-NEXT: retq 1204; 1205; GFNIAVX-LABEL: test_bitreverse_v32i8: 1206; GFNIAVX: # %bb.0: 1207; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1208; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] 1209; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm2, %xmm1, %xmm1 1210; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm2, %xmm0, %xmm0 1211; GFNIAVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1212; GFNIAVX-NEXT: retq 1213; 1214; GFNIAVX2-LABEL: test_bitreverse_v32i8: 1215; GFNIAVX2: # %bb.0: 1216; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1217; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1218; GFNIAVX2-NEXT: retq 1219; 1220; GFNIAVX512F-LABEL: test_bitreverse_v32i8: 1221; GFNIAVX512F: # %bb.0: 1222; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = 
[9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1223; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1224; GFNIAVX512F-NEXT: retq 1225; 1226; GFNIAVX512BW-LABEL: test_bitreverse_v32i8: 1227; GFNIAVX512BW: # %bb.0: 1228; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1229; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1230; GFNIAVX512BW-NEXT: retq 1231 %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) 1232 ret <32 x i8> %b 1233} 1234 1235define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { 1236; SSE2-LABEL: test_bitreverse_v16i16: 1237; SSE2: # %bb.0: 1238; SSE2-NEXT: movdqa %xmm1, %xmm2 1239; SSE2-NEXT: movdqa %xmm0, %xmm1 1240; SSE2-NEXT: psrlw $8, %xmm1 1241; SSE2-NEXT: psllw $8, %xmm0 1242; SSE2-NEXT: por %xmm1, %xmm0 1243; SSE2-NEXT: movdqa %xmm0, %xmm3 1244; SSE2-NEXT: psllw $4, %xmm3 1245; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1246; SSE2-NEXT: movdqa %xmm1, %xmm4 1247; SSE2-NEXT: pandn %xmm3, %xmm4 1248; SSE2-NEXT: psrlw $4, %xmm0 1249; SSE2-NEXT: pand %xmm1, %xmm0 1250; SSE2-NEXT: por %xmm4, %xmm0 1251; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1252; SSE2-NEXT: movdqa %xmm0, %xmm4 1253; SSE2-NEXT: pand %xmm3, %xmm4 1254; SSE2-NEXT: psllw $2, %xmm4 1255; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1256; SSE2-NEXT: pand %xmm5, %xmm0 1257; SSE2-NEXT: psrlw $2, %xmm0 1258; SSE2-NEXT: por %xmm4, %xmm0 1259; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1260; SSE2-NEXT: movdqa %xmm0, %xmm7 1261; SSE2-NEXT: pand %xmm4, %xmm7 1262; SSE2-NEXT: paddb %xmm7, %xmm7 1263; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1264; SSE2-NEXT: pand %xmm6, %xmm0 1265; SSE2-NEXT: psrlw $1, 
%xmm0 1266; SSE2-NEXT: por %xmm7, %xmm0 1267; SSE2-NEXT: movdqa %xmm2, %xmm7 1268; SSE2-NEXT: psrlw $8, %xmm7 1269; SSE2-NEXT: psllw $8, %xmm2 1270; SSE2-NEXT: por %xmm7, %xmm2 1271; SSE2-NEXT: movdqa %xmm2, %xmm7 1272; SSE2-NEXT: psllw $4, %xmm7 1273; SSE2-NEXT: psrlw $4, %xmm2 1274; SSE2-NEXT: pand %xmm1, %xmm2 1275; SSE2-NEXT: pandn %xmm7, %xmm1 1276; SSE2-NEXT: por %xmm2, %xmm1 1277; SSE2-NEXT: pand %xmm1, %xmm3 1278; SSE2-NEXT: psllw $2, %xmm3 1279; SSE2-NEXT: pand %xmm5, %xmm1 1280; SSE2-NEXT: psrlw $2, %xmm1 1281; SSE2-NEXT: por %xmm3, %xmm1 1282; SSE2-NEXT: pand %xmm1, %xmm4 1283; SSE2-NEXT: paddb %xmm4, %xmm4 1284; SSE2-NEXT: pand %xmm6, %xmm1 1285; SSE2-NEXT: psrlw $1, %xmm1 1286; SSE2-NEXT: por %xmm4, %xmm1 1287; SSE2-NEXT: retq 1288; 1289; SSSE3-LABEL: test_bitreverse_v16i16: 1290; SSSE3: # %bb.0: 1291; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1292; SSSE3-NEXT: pshufb %xmm4, %xmm0 1293; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1294; SSSE3-NEXT: movdqa %xmm0, %xmm2 1295; SSSE3-NEXT: pand %xmm5, %xmm2 1296; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1297; SSSE3-NEXT: movdqa %xmm6, %xmm7 1298; SSSE3-NEXT: pshufb %xmm2, %xmm7 1299; SSSE3-NEXT: psrlw $4, %xmm0 1300; SSSE3-NEXT: pand %xmm5, %xmm0 1301; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1302; SSSE3-NEXT: movdqa %xmm2, %xmm3 1303; SSSE3-NEXT: pshufb %xmm0, %xmm3 1304; SSSE3-NEXT: por %xmm7, %xmm3 1305; SSSE3-NEXT: pshufb %xmm4, %xmm1 1306; SSSE3-NEXT: movdqa %xmm1, %xmm0 1307; SSSE3-NEXT: pand %xmm5, %xmm0 1308; SSSE3-NEXT: pshufb %xmm0, %xmm6 1309; SSSE3-NEXT: psrlw $4, %xmm1 1310; SSSE3-NEXT: pand %xmm5, %xmm1 1311; SSSE3-NEXT: pshufb %xmm1, %xmm2 1312; SSSE3-NEXT: por %xmm6, %xmm2 1313; SSSE3-NEXT: movdqa %xmm3, %xmm0 1314; SSSE3-NEXT: movdqa %xmm2, %xmm1 1315; SSSE3-NEXT: retq 1316; 1317; AVX1-LABEL: test_bitreverse_v16i16: 1318; AVX1: # 
%bb.0: 1319; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1320; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1321; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1322; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1323; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1324; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1325; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1326; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1327; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1328; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1329; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1330; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1331; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1332; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1333; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1334; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1335; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1336; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1337; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1338; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1339; AVX1-NEXT: retq 1340; 1341; AVX2-LABEL: test_bitreverse_v16i16: 1342; AVX2: # %bb.0: 1343; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1344; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1345; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1346; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1347; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1348; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1349; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1350; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1351; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1352; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1353; AVX2-NEXT: retq 1354; 1355; 
AVX512-LABEL: test_bitreverse_v16i16: 1356; AVX512: # %bb.0: 1357; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1358; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1359; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1360; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1361; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1362; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1363; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1364; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1365; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1366; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1367; AVX512-NEXT: retq 1368; 1369; XOPAVX1-LABEL: test_bitreverse_v16i16: 1370; XOPAVX1: # %bb.0: 1371; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1372; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1373; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1374; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1375; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1376; XOPAVX1-NEXT: retq 1377; 1378; XOPAVX2-LABEL: test_bitreverse_v16i16: 1379; XOPAVX2: # %bb.0: 1380; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1381; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1382; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1383; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1384; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1385; XOPAVX2-NEXT: retq 1386; 1387; GFNISSE-LABEL: test_bitreverse_v16i16: 1388; GFNISSE: # %bb.0: 1389; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1390; GFNISSE-NEXT: pshufb %xmm2, %xmm0 1391; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = 
[9241421688590303745,9241421688590303745] 1392; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 1393; GFNISSE-NEXT: pshufb %xmm2, %xmm1 1394; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 1395; GFNISSE-NEXT: retq 1396; 1397; GFNIAVX-LABEL: test_bitreverse_v16i16: 1398; GFNIAVX: # %bb.0: 1399; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1400; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1401; GFNIAVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1402; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1403; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1 1404; GFNIAVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1405; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0 1406; GFNIAVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1407; GFNIAVX-NEXT: retq 1408; 1409; GFNIAVX2-LABEL: test_bitreverse_v16i16: 1410; GFNIAVX2: # %bb.0: 1411; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1412; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1413; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1414; GFNIAVX2-NEXT: retq 1415; 1416; GFNIAVX512F-LABEL: test_bitreverse_v16i16: 1417; GFNIAVX512F: # %bb.0: 1418; GFNIAVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1419; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1420; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1421; GFNIAVX512F-NEXT: retq 1422; 1423; GFNIAVX512BW-LABEL: test_bitreverse_v16i16: 1424; GFNIAVX512BW: # %bb.0: 1425; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1426; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = 
[9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1427; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1428; GFNIAVX512BW-NEXT: retq 1429 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) 1430 ret <16 x i16> %b 1431} 1432 1433define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { 1434; SSE2-LABEL: test_bitreverse_v8i32: 1435; SSE2: # %bb.0: 1436; SSE2-NEXT: movdqa %xmm1, %xmm2 1437; SSE2-NEXT: pxor %xmm4, %xmm4 1438; SSE2-NEXT: movdqa %xmm0, %xmm1 1439; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] 1440; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1441; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1442; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1443; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1444; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1445; SSE2-NEXT: packuswb %xmm1, %xmm0 1446; SSE2-NEXT: movdqa %xmm0, %xmm3 1447; SSE2-NEXT: psllw $4, %xmm3 1448; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1449; SSE2-NEXT: movdqa %xmm1, %xmm5 1450; SSE2-NEXT: pandn %xmm3, %xmm5 1451; SSE2-NEXT: psrlw $4, %xmm0 1452; SSE2-NEXT: pand %xmm1, %xmm0 1453; SSE2-NEXT: por %xmm5, %xmm0 1454; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1455; SSE2-NEXT: movdqa %xmm0, %xmm5 1456; SSE2-NEXT: pand %xmm3, %xmm5 1457; SSE2-NEXT: psllw $2, %xmm5 1458; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1459; SSE2-NEXT: pand %xmm8, %xmm0 1460; SSE2-NEXT: psrlw $2, %xmm0 1461; SSE2-NEXT: por %xmm5, %xmm0 1462; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1463; 
SSE2-NEXT: movdqa %xmm0, %xmm6 1464; SSE2-NEXT: pand %xmm5, %xmm6 1465; SSE2-NEXT: paddb %xmm6, %xmm6 1466; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1467; SSE2-NEXT: pand %xmm7, %xmm0 1468; SSE2-NEXT: psrlw $1, %xmm0 1469; SSE2-NEXT: por %xmm6, %xmm0 1470; SSE2-NEXT: movdqa %xmm2, %xmm6 1471; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] 1472; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1473; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1474; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 1475; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1476; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1477; SSE2-NEXT: packuswb %xmm6, %xmm2 1478; SSE2-NEXT: movdqa %xmm2, %xmm4 1479; SSE2-NEXT: psllw $4, %xmm4 1480; SSE2-NEXT: psrlw $4, %xmm2 1481; SSE2-NEXT: pand %xmm1, %xmm2 1482; SSE2-NEXT: pandn %xmm4, %xmm1 1483; SSE2-NEXT: por %xmm2, %xmm1 1484; SSE2-NEXT: pand %xmm1, %xmm3 1485; SSE2-NEXT: psllw $2, %xmm3 1486; SSE2-NEXT: pand %xmm8, %xmm1 1487; SSE2-NEXT: psrlw $2, %xmm1 1488; SSE2-NEXT: por %xmm3, %xmm1 1489; SSE2-NEXT: pand %xmm1, %xmm5 1490; SSE2-NEXT: paddb %xmm5, %xmm5 1491; SSE2-NEXT: pand %xmm7, %xmm1 1492; SSE2-NEXT: psrlw $1, %xmm1 1493; SSE2-NEXT: por %xmm5, %xmm1 1494; SSE2-NEXT: retq 1495; 1496; SSSE3-LABEL: test_bitreverse_v8i32: 1497; SSSE3: # %bb.0: 1498; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1499; SSSE3-NEXT: pshufb %xmm4, %xmm0 1500; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1501; SSSE3-NEXT: movdqa %xmm0, %xmm2 1502; SSSE3-NEXT: pand %xmm5, %xmm2 1503; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1504; SSSE3-NEXT: movdqa %xmm6, %xmm7 1505; SSSE3-NEXT: pshufb %xmm2, %xmm7 1506; SSSE3-NEXT: psrlw $4, %xmm0 1507; SSSE3-NEXT: pand %xmm5, %xmm0 1508; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1509; SSSE3-NEXT: movdqa %xmm2, %xmm3 1510; SSSE3-NEXT: pshufb %xmm0, %xmm3 1511; SSSE3-NEXT: por %xmm7, %xmm3 1512; SSSE3-NEXT: pshufb %xmm4, %xmm1 1513; SSSE3-NEXT: movdqa %xmm1, %xmm0 1514; SSSE3-NEXT: pand %xmm5, %xmm0 1515; SSSE3-NEXT: pshufb %xmm0, %xmm6 1516; SSSE3-NEXT: psrlw $4, %xmm1 1517; SSSE3-NEXT: pand %xmm5, %xmm1 1518; SSSE3-NEXT: pshufb %xmm1, %xmm2 1519; SSSE3-NEXT: por %xmm6, %xmm2 1520; SSSE3-NEXT: movdqa %xmm3, %xmm0 1521; SSSE3-NEXT: movdqa %xmm2, %xmm1 1522; SSSE3-NEXT: retq 1523; 1524; AVX1-LABEL: test_bitreverse_v8i32: 1525; AVX1: # %bb.0: 1526; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1527; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1528; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1529; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1530; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1531; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1532; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1533; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1534; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1535; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1536; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1537; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1538; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1539; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1540; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1541; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1542; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1543; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1544; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1545; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1546; AVX1-NEXT: retq 1547; 1548; AVX2-LABEL: test_bitreverse_v8i32: 1549; AVX2: # 
%bb.0: 1550; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1551; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1552; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1553; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1554; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1555; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1556; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1557; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1558; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1559; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1560; AVX2-NEXT: retq 1561; 1562; AVX512-LABEL: test_bitreverse_v8i32: 1563; AVX512: # %bb.0: 1564; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1565; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1566; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1567; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1568; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1569; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1570; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1571; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1572; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1573; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1574; AVX512-NEXT: retq 1575; 1576; XOPAVX1-LABEL: test_bitreverse_v8i32: 1577; XOPAVX1: # %bb.0: 1578; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1579; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1580; XOPAVX1-NEXT: vpperm 
%xmm2, %xmm1, %xmm0, %xmm1 1581; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1582; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1583; XOPAVX1-NEXT: retq 1584; 1585; XOPAVX2-LABEL: test_bitreverse_v8i32: 1586; XOPAVX2: # %bb.0: 1587; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1588; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1589; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1590; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1591; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1592; XOPAVX2-NEXT: retq 1593; 1594; GFNISSE-LABEL: test_bitreverse_v8i32: 1595; GFNISSE: # %bb.0: 1596; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1597; GFNISSE-NEXT: pshufb %xmm2, %xmm0 1598; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1599; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 1600; GFNISSE-NEXT: pshufb %xmm2, %xmm1 1601; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 1602; GFNISSE-NEXT: retq 1603; 1604; GFNIAVX-LABEL: test_bitreverse_v8i32: 1605; GFNIAVX: # %bb.0: 1606; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1607; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1608; GFNIAVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1609; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1610; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1 1611; GFNIAVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1612; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0 1613; GFNIAVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1614; GFNIAVX-NEXT: retq 1615; 1616; GFNIAVX2-LABEL: test_bitreverse_v8i32: 1617; GFNIAVX2: # %bb.0: 1618; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1619; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1620; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, 
%ymm0, %ymm0 1621; GFNIAVX2-NEXT: retq 1622; 1623; GFNIAVX512F-LABEL: test_bitreverse_v8i32: 1624; GFNIAVX512F: # %bb.0: 1625; GFNIAVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1626; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1627; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1628; GFNIAVX512F-NEXT: retq 1629; 1630; GFNIAVX512BW-LABEL: test_bitreverse_v8i32: 1631; GFNIAVX512BW: # %bb.0: 1632; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1633; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1634; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1635; GFNIAVX512BW-NEXT: retq 1636 %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) 1637 ret <8 x i32> %b 1638} 1639 1640define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { 1641; SSE2-LABEL: test_bitreverse_v4i64: 1642; SSE2: # %bb.0: 1643; SSE2-NEXT: movdqa %xmm1, %xmm2 1644; SSE2-NEXT: pxor %xmm4, %xmm4 1645; SSE2-NEXT: movdqa %xmm0, %xmm1 1646; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] 1647; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1648; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1649; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1650; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1651; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1652; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1653; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1654; 
SSE2-NEXT: packuswb %xmm1, %xmm0 1655; SSE2-NEXT: movdqa %xmm0, %xmm3 1656; SSE2-NEXT: psllw $4, %xmm3 1657; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1658; SSE2-NEXT: movdqa %xmm1, %xmm5 1659; SSE2-NEXT: pandn %xmm3, %xmm5 1660; SSE2-NEXT: psrlw $4, %xmm0 1661; SSE2-NEXT: pand %xmm1, %xmm0 1662; SSE2-NEXT: por %xmm5, %xmm0 1663; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1664; SSE2-NEXT: movdqa %xmm0, %xmm5 1665; SSE2-NEXT: pand %xmm3, %xmm5 1666; SSE2-NEXT: psllw $2, %xmm5 1667; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1668; SSE2-NEXT: pand %xmm8, %xmm0 1669; SSE2-NEXT: psrlw $2, %xmm0 1670; SSE2-NEXT: por %xmm5, %xmm0 1671; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1672; SSE2-NEXT: movdqa %xmm0, %xmm6 1673; SSE2-NEXT: pand %xmm5, %xmm6 1674; SSE2-NEXT: paddb %xmm6, %xmm6 1675; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1676; SSE2-NEXT: pand %xmm7, %xmm0 1677; SSE2-NEXT: psrlw $1, %xmm0 1678; SSE2-NEXT: por %xmm6, %xmm0 1679; SSE2-NEXT: movdqa %xmm2, %xmm6 1680; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] 1681; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] 1682; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1683; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1684; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 1685; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 1686; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1687; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1688; SSE2-NEXT: packuswb %xmm6, %xmm2 1689; SSE2-NEXT: movdqa %xmm2, 
%xmm4 1690; SSE2-NEXT: psllw $4, %xmm4 1691; SSE2-NEXT: psrlw $4, %xmm2 1692; SSE2-NEXT: pand %xmm1, %xmm2 1693; SSE2-NEXT: pandn %xmm4, %xmm1 1694; SSE2-NEXT: por %xmm2, %xmm1 1695; SSE2-NEXT: pand %xmm1, %xmm3 1696; SSE2-NEXT: psllw $2, %xmm3 1697; SSE2-NEXT: pand %xmm8, %xmm1 1698; SSE2-NEXT: psrlw $2, %xmm1 1699; SSE2-NEXT: por %xmm3, %xmm1 1700; SSE2-NEXT: pand %xmm1, %xmm5 1701; SSE2-NEXT: paddb %xmm5, %xmm5 1702; SSE2-NEXT: pand %xmm7, %xmm1 1703; SSE2-NEXT: psrlw $1, %xmm1 1704; SSE2-NEXT: por %xmm5, %xmm1 1705; SSE2-NEXT: retq 1706; 1707; SSSE3-LABEL: test_bitreverse_v4i64: 1708; SSSE3: # %bb.0: 1709; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1710; SSSE3-NEXT: pshufb %xmm4, %xmm0 1711; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1712; SSSE3-NEXT: movdqa %xmm0, %xmm2 1713; SSSE3-NEXT: pand %xmm5, %xmm2 1714; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1715; SSSE3-NEXT: movdqa %xmm6, %xmm7 1716; SSSE3-NEXT: pshufb %xmm2, %xmm7 1717; SSSE3-NEXT: psrlw $4, %xmm0 1718; SSSE3-NEXT: pand %xmm5, %xmm0 1719; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1720; SSSE3-NEXT: movdqa %xmm2, %xmm3 1721; SSSE3-NEXT: pshufb %xmm0, %xmm3 1722; SSSE3-NEXT: por %xmm7, %xmm3 1723; SSSE3-NEXT: pshufb %xmm4, %xmm1 1724; SSSE3-NEXT: movdqa %xmm1, %xmm0 1725; SSSE3-NEXT: pand %xmm5, %xmm0 1726; SSSE3-NEXT: pshufb %xmm0, %xmm6 1727; SSSE3-NEXT: psrlw $4, %xmm1 1728; SSSE3-NEXT: pand %xmm5, %xmm1 1729; SSSE3-NEXT: pshufb %xmm1, %xmm2 1730; SSSE3-NEXT: por %xmm6, %xmm2 1731; SSSE3-NEXT: movdqa %xmm3, %xmm0 1732; SSSE3-NEXT: movdqa %xmm2, %xmm1 1733; SSSE3-NEXT: retq 1734; 1735; AVX1-LABEL: test_bitreverse_v4i64: 1736; AVX1: # %bb.0: 1737; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1738; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1739; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1740; AVX1-NEXT: vmovdqa {{.*#+}} 
xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1741; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1742; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1743; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1744; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1745; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1746; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1747; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1748; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1749; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1750; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1751; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1752; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1753; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1754; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1755; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1756; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1757; AVX1-NEXT: retq 1758; 1759; AVX2-LABEL: test_bitreverse_v4i64: 1760; AVX2: # %bb.0: 1761; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1762; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1763; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1764; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1765; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1766; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1767; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1768; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1769; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1770; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1771; AVX2-NEXT: retq 1772; 1773; AVX512-LABEL: test_bitreverse_v4i64: 1774; AVX512: # %bb.0: 1775; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1776; AVX512-NEXT: 
vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1777; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1778; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1779; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1780; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1781; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1782; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1783; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1784; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1785; AVX512-NEXT: retq 1786; 1787; XOPAVX1-LABEL: test_bitreverse_v4i64: 1788; XOPAVX1: # %bb.0: 1789; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1790; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1791; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1792; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1793; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1794; XOPAVX1-NEXT: retq 1795; 1796; XOPAVX2-LABEL: test_bitreverse_v4i64: 1797; XOPAVX2: # %bb.0: 1798; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1799; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1800; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1801; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1802; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1803; XOPAVX2-NEXT: retq 1804; 1805; GFNISSE-LABEL: test_bitreverse_v4i64: 1806; GFNISSE: # %bb.0: 1807; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1808; GFNISSE-NEXT: pshufb %xmm2, %xmm0 1809; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1810; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 1811; GFNISSE-NEXT: pshufb %xmm2, %xmm1 1812; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 1813; GFNISSE-NEXT: retq 1814; 1815; GFNIAVX-LABEL: 
test_bitreverse_v4i64: 1816; GFNIAVX: # %bb.0: 1817; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1818; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1819; GFNIAVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1820; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1821; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1 1822; GFNIAVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1823; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0 1824; GFNIAVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1825; GFNIAVX-NEXT: retq 1826; 1827; GFNIAVX2-LABEL: test_bitreverse_v4i64: 1828; GFNIAVX2: # %bb.0: 1829; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1830; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1831; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1832; GFNIAVX2-NEXT: retq 1833; 1834; GFNIAVX512F-LABEL: test_bitreverse_v4i64: 1835; GFNIAVX512F: # %bb.0: 1836; GFNIAVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1837; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1838; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1839; GFNIAVX512F-NEXT: retq 1840; 1841; GFNIAVX512BW-LABEL: test_bitreverse_v4i64: 1842; GFNIAVX512BW: # %bb.0: 1843; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1844; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1845; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1846; GFNIAVX512BW-NEXT: retq 1847 %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) 1848 ret <4 x i64> %b 1849} 1850 
1851define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { 1852; SSE2-LABEL: test_bitreverse_v64i8: 1853; SSE2: # %bb.0: 1854; SSE2-NEXT: movdqa %xmm3, %xmm10 1855; SSE2-NEXT: movdqa %xmm0, %xmm5 1856; SSE2-NEXT: psllw $4, %xmm5 1857; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1858; SSE2-NEXT: movdqa %xmm3, %xmm6 1859; SSE2-NEXT: pandn %xmm5, %xmm6 1860; SSE2-NEXT: psrlw $4, %xmm0 1861; SSE2-NEXT: pand %xmm3, %xmm0 1862; SSE2-NEXT: por %xmm6, %xmm0 1863; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1864; SSE2-NEXT: movdqa %xmm0, %xmm6 1865; SSE2-NEXT: pand %xmm5, %xmm6 1866; SSE2-NEXT: psllw $2, %xmm6 1867; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1868; SSE2-NEXT: pand %xmm8, %xmm0 1869; SSE2-NEXT: psrlw $2, %xmm0 1870; SSE2-NEXT: por %xmm6, %xmm0 1871; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1872; SSE2-NEXT: movdqa %xmm0, %xmm7 1873; SSE2-NEXT: pand %xmm6, %xmm7 1874; SSE2-NEXT: paddb %xmm7, %xmm7 1875; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1876; SSE2-NEXT: pand %xmm9, %xmm0 1877; SSE2-NEXT: psrlw $1, %xmm0 1878; SSE2-NEXT: por %xmm7, %xmm0 1879; SSE2-NEXT: movdqa %xmm1, %xmm7 1880; SSE2-NEXT: psllw $4, %xmm7 1881; SSE2-NEXT: movdqa %xmm3, %xmm4 1882; SSE2-NEXT: pandn %xmm7, %xmm4 1883; SSE2-NEXT: psrlw $4, %xmm1 1884; SSE2-NEXT: pand %xmm3, %xmm1 1885; SSE2-NEXT: por %xmm4, %xmm1 1886; SSE2-NEXT: movdqa %xmm1, %xmm4 1887; SSE2-NEXT: pand %xmm5, %xmm4 1888; SSE2-NEXT: psllw $2, %xmm4 1889; SSE2-NEXT: pand %xmm8, %xmm1 1890; SSE2-NEXT: psrlw $2, %xmm1 1891; SSE2-NEXT: por %xmm4, %xmm1 1892; SSE2-NEXT: movdqa %xmm1, %xmm4 1893; SSE2-NEXT: pand %xmm6, %xmm4 1894; SSE2-NEXT: paddb %xmm4, %xmm4 1895; SSE2-NEXT: pand %xmm9, %xmm1 1896; SSE2-NEXT: psrlw $1, %xmm1 1897; SSE2-NEXT: por %xmm4, %xmm1 1898; SSE2-NEXT: movdqa 
%xmm2, %xmm4 1899; SSE2-NEXT: psllw $4, %xmm4 1900; SSE2-NEXT: movdqa %xmm3, %xmm7 1901; SSE2-NEXT: pandn %xmm4, %xmm7 1902; SSE2-NEXT: psrlw $4, %xmm2 1903; SSE2-NEXT: pand %xmm3, %xmm2 1904; SSE2-NEXT: por %xmm7, %xmm2 1905; SSE2-NEXT: movdqa %xmm2, %xmm4 1906; SSE2-NEXT: pand %xmm5, %xmm4 1907; SSE2-NEXT: psllw $2, %xmm4 1908; SSE2-NEXT: pand %xmm8, %xmm2 1909; SSE2-NEXT: psrlw $2, %xmm2 1910; SSE2-NEXT: por %xmm4, %xmm2 1911; SSE2-NEXT: movdqa %xmm2, %xmm4 1912; SSE2-NEXT: pand %xmm6, %xmm4 1913; SSE2-NEXT: paddb %xmm4, %xmm4 1914; SSE2-NEXT: pand %xmm9, %xmm2 1915; SSE2-NEXT: psrlw $1, %xmm2 1916; SSE2-NEXT: por %xmm4, %xmm2 1917; SSE2-NEXT: movdqa %xmm10, %xmm4 1918; SSE2-NEXT: psllw $4, %xmm4 1919; SSE2-NEXT: psrlw $4, %xmm10 1920; SSE2-NEXT: pand %xmm3, %xmm10 1921; SSE2-NEXT: pandn %xmm4, %xmm3 1922; SSE2-NEXT: por %xmm10, %xmm3 1923; SSE2-NEXT: pand %xmm3, %xmm5 1924; SSE2-NEXT: psllw $2, %xmm5 1925; SSE2-NEXT: pand %xmm8, %xmm3 1926; SSE2-NEXT: psrlw $2, %xmm3 1927; SSE2-NEXT: por %xmm5, %xmm3 1928; SSE2-NEXT: pand %xmm3, %xmm6 1929; SSE2-NEXT: paddb %xmm6, %xmm6 1930; SSE2-NEXT: pand %xmm9, %xmm3 1931; SSE2-NEXT: psrlw $1, %xmm3 1932; SSE2-NEXT: por %xmm6, %xmm3 1933; SSE2-NEXT: retq 1934; 1935; SSSE3-LABEL: test_bitreverse_v64i8: 1936; SSSE3: # %bb.0: 1937; SSSE3-NEXT: movdqa %xmm0, %xmm5 1938; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1939; SSSE3-NEXT: pand %xmm8, %xmm0 1940; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1941; SSSE3-NEXT: movdqa %xmm9, %xmm6 1942; SSSE3-NEXT: pshufb %xmm0, %xmm6 1943; SSSE3-NEXT: psrlw $4, %xmm5 1944; SSSE3-NEXT: pand %xmm8, %xmm5 1945; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1946; SSSE3-NEXT: movdqa %xmm4, %xmm0 1947; SSSE3-NEXT: pshufb %xmm5, %xmm0 1948; SSSE3-NEXT: por %xmm6, %xmm0 1949; SSSE3-NEXT: movdqa %xmm1, %xmm5 1950; SSSE3-NEXT: pand %xmm8, %xmm5 1951; SSSE3-NEXT: movdqa %xmm9, %xmm6 
1952; SSSE3-NEXT: pshufb %xmm5, %xmm6 1953; SSSE3-NEXT: psrlw $4, %xmm1 1954; SSSE3-NEXT: pand %xmm8, %xmm1 1955; SSSE3-NEXT: movdqa %xmm4, %xmm5 1956; SSSE3-NEXT: pshufb %xmm1, %xmm5 1957; SSSE3-NEXT: por %xmm6, %xmm5 1958; SSSE3-NEXT: movdqa %xmm2, %xmm1 1959; SSSE3-NEXT: pand %xmm8, %xmm1 1960; SSSE3-NEXT: movdqa %xmm9, %xmm7 1961; SSSE3-NEXT: pshufb %xmm1, %xmm7 1962; SSSE3-NEXT: psrlw $4, %xmm2 1963; SSSE3-NEXT: pand %xmm8, %xmm2 1964; SSSE3-NEXT: movdqa %xmm4, %xmm6 1965; SSSE3-NEXT: pshufb %xmm2, %xmm6 1966; SSSE3-NEXT: por %xmm7, %xmm6 1967; SSSE3-NEXT: movdqa %xmm3, %xmm1 1968; SSSE3-NEXT: pand %xmm8, %xmm1 1969; SSSE3-NEXT: pshufb %xmm1, %xmm9 1970; SSSE3-NEXT: psrlw $4, %xmm3 1971; SSSE3-NEXT: pand %xmm8, %xmm3 1972; SSSE3-NEXT: pshufb %xmm3, %xmm4 1973; SSSE3-NEXT: por %xmm9, %xmm4 1974; SSSE3-NEXT: movdqa %xmm5, %xmm1 1975; SSSE3-NEXT: movdqa %xmm6, %xmm2 1976; SSSE3-NEXT: movdqa %xmm4, %xmm3 1977; SSSE3-NEXT: retq 1978; 1979; AVX1-LABEL: test_bitreverse_v64i8: 1980; AVX1: # %bb.0: 1981; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1982; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1983; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 1984; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1985; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1986; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1987; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1988; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1989; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1990; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1991; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm4 1992; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1993; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1994; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1995; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1996; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 1997; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1998; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1999; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 2000; 
AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2001; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2002; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 2003; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 2004; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 2005; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 2006; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2007; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2008; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 2009; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 2010; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 2011; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2012; AVX1-NEXT: retq 2013; 2014; AVX2-LABEL: test_bitreverse_v64i8: 2015; AVX2: # %bb.0: 2016; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2017; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 2018; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2019; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2020; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2021; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2022; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2023; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 2024; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 2025; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 2026; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2027; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2028; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 2029; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1 2030; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 2031; AVX2-NEXT: retq 2032; 2033; AVX512F-LABEL: test_bitreverse_v64i8: 2034; AVX512F: # %bb.0: 2035; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2036; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2037; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 2038; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2039; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2040; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 2041; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm4 2042; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 2043; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2044; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 2045; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2046; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 2047; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2048; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 2049; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 2050; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2051; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0 2052; AVX512F-NEXT: retq 2053; 2054; AVX512BW-LABEL: test_bitreverse_v64i8: 2055; AVX512BW: # %bb.0: 2056; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2057; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2058; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2059; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2060; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2061; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2062; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2063; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2064; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2065; AVX512BW-NEXT: retq 2066; 2067; XOPAVX1-LABEL: test_bitreverse_v64i8: 2068; 
XOPAVX1: # %bb.0: 2069; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2070; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 2071; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2072; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2073; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2074; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2075; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2076; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2077; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2078; XOPAVX1-NEXT: retq 2079; 2080; XOPAVX2-LABEL: test_bitreverse_v64i8: 2081; XOPAVX2: # %bb.0: 2082; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2083; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 2084; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2085; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2086; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2087; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2088; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2089; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2090; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2091; XOPAVX2-NEXT: retq 2092; 2093; GFNISSE-LABEL: test_bitreverse_v64i8: 2094; GFNISSE: # %bb.0: 2095; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 2096; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 2097; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 2098; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 2099; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 2100; GFNISSE-NEXT: retq 2101; 2102; GFNIAVX-LABEL: test_bitreverse_v64i8: 2103; GFNIAVX: # %bb.0: 2104; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm2 2105; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 2106; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2 2107; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0 2108; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2109; GFNIAVX-NEXT: vextractf128 $1, 
%ymm1, %xmm2 2110; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2 2111; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1 2112; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2113; GFNIAVX-NEXT: retq 2114; 2115; GFNIAVX2-LABEL: test_bitreverse_v64i8: 2116; GFNIAVX2: # %bb.0: 2117; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2118; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 2119; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 2120; GFNIAVX2-NEXT: retq 2121; 2122; GFNIAVX512F-LABEL: test_bitreverse_v64i8: 2123; GFNIAVX512F: # %bb.0: 2124; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2125; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2126; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 2127; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 2128; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2129; GFNIAVX512F-NEXT: retq 2130; 2131; GFNIAVX512BW-LABEL: test_bitreverse_v64i8: 2132; GFNIAVX512BW: # %bb.0: 2133; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0 2134; GFNIAVX512BW-NEXT: retq 2135 %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) 2136 ret <64 x i8> %b 2137} 2138 2139define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { 2140; SSE2-LABEL: test_bitreverse_v32i16: 2141; SSE2: # %bb.0: 2142; SSE2-NEXT: movdqa %xmm3, %xmm4 2143; SSE2-NEXT: movdqa %xmm0, %xmm3 2144; SSE2-NEXT: psrlw $8, %xmm3 2145; SSE2-NEXT: psllw $8, %xmm0 2146; SSE2-NEXT: por %xmm3, %xmm0 2147; SSE2-NEXT: movdqa %xmm0, %xmm5 2148; SSE2-NEXT: psllw $4, %xmm5 2149; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2150; SSE2-NEXT: movdqa %xmm3, %xmm6 2151; SSE2-NEXT: pandn %xmm5, %xmm6 2152; SSE2-NEXT: psrlw $4, %xmm0 2153; SSE2-NEXT: pand %xmm3, %xmm0 2154; SSE2-NEXT: por %xmm6, %xmm0 2155; 
SSE2-NEXT: movdqa {{.*#+}} xmm10 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2156; SSE2-NEXT: movdqa %xmm0, %xmm6 2157; SSE2-NEXT: pand %xmm10, %xmm6 2158; SSE2-NEXT: psllw $2, %xmm6 2159; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 2160; SSE2-NEXT: pand %xmm8, %xmm0 2161; SSE2-NEXT: psrlw $2, %xmm0 2162; SSE2-NEXT: por %xmm6, %xmm0 2163; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2164; SSE2-NEXT: movdqa %xmm0, %xmm7 2165; SSE2-NEXT: pand %xmm6, %xmm7 2166; SSE2-NEXT: paddb %xmm7, %xmm7 2167; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 2168; SSE2-NEXT: pand %xmm9, %xmm0 2169; SSE2-NEXT: psrlw $1, %xmm0 2170; SSE2-NEXT: por %xmm7, %xmm0 2171; SSE2-NEXT: movdqa %xmm1, %xmm7 2172; SSE2-NEXT: psrlw $8, %xmm7 2173; SSE2-NEXT: psllw $8, %xmm1 2174; SSE2-NEXT: por %xmm7, %xmm1 2175; SSE2-NEXT: movdqa %xmm1, %xmm7 2176; SSE2-NEXT: psllw $4, %xmm7 2177; SSE2-NEXT: movdqa %xmm3, %xmm5 2178; SSE2-NEXT: pandn %xmm7, %xmm5 2179; SSE2-NEXT: psrlw $4, %xmm1 2180; SSE2-NEXT: pand %xmm3, %xmm1 2181; SSE2-NEXT: por %xmm5, %xmm1 2182; SSE2-NEXT: movdqa %xmm1, %xmm5 2183; SSE2-NEXT: pand %xmm10, %xmm5 2184; SSE2-NEXT: psllw $2, %xmm5 2185; SSE2-NEXT: pand %xmm8, %xmm1 2186; SSE2-NEXT: psrlw $2, %xmm1 2187; SSE2-NEXT: por %xmm5, %xmm1 2188; SSE2-NEXT: movdqa %xmm1, %xmm5 2189; SSE2-NEXT: pand %xmm6, %xmm5 2190; SSE2-NEXT: paddb %xmm5, %xmm5 2191; SSE2-NEXT: pand %xmm9, %xmm1 2192; SSE2-NEXT: psrlw $1, %xmm1 2193; SSE2-NEXT: por %xmm5, %xmm1 2194; SSE2-NEXT: movdqa %xmm2, %xmm5 2195; SSE2-NEXT: psrlw $8, %xmm5 2196; SSE2-NEXT: psllw $8, %xmm2 2197; SSE2-NEXT: por %xmm5, %xmm2 2198; SSE2-NEXT: movdqa %xmm2, %xmm5 2199; SSE2-NEXT: psllw $4, %xmm5 2200; SSE2-NEXT: movdqa %xmm3, %xmm7 2201; SSE2-NEXT: pandn %xmm5, %xmm7 2202; SSE2-NEXT: psrlw $4, %xmm2 2203; SSE2-NEXT: pand %xmm3, %xmm2 2204; SSE2-NEXT: por %xmm7, %xmm2 2205; 
SSE2-NEXT: movdqa %xmm2, %xmm5 2206; SSE2-NEXT: pand %xmm10, %xmm5 2207; SSE2-NEXT: psllw $2, %xmm5 2208; SSE2-NEXT: pand %xmm8, %xmm2 2209; SSE2-NEXT: psrlw $2, %xmm2 2210; SSE2-NEXT: por %xmm5, %xmm2 2211; SSE2-NEXT: movdqa %xmm2, %xmm5 2212; SSE2-NEXT: pand %xmm6, %xmm5 2213; SSE2-NEXT: paddb %xmm5, %xmm5 2214; SSE2-NEXT: pand %xmm9, %xmm2 2215; SSE2-NEXT: psrlw $1, %xmm2 2216; SSE2-NEXT: por %xmm5, %xmm2 2217; SSE2-NEXT: movdqa %xmm4, %xmm5 2218; SSE2-NEXT: psrlw $8, %xmm5 2219; SSE2-NEXT: psllw $8, %xmm4 2220; SSE2-NEXT: por %xmm5, %xmm4 2221; SSE2-NEXT: movdqa %xmm4, %xmm5 2222; SSE2-NEXT: psllw $4, %xmm5 2223; SSE2-NEXT: psrlw $4, %xmm4 2224; SSE2-NEXT: pand %xmm3, %xmm4 2225; SSE2-NEXT: pandn %xmm5, %xmm3 2226; SSE2-NEXT: por %xmm4, %xmm3 2227; SSE2-NEXT: pand %xmm3, %xmm10 2228; SSE2-NEXT: psllw $2, %xmm10 2229; SSE2-NEXT: pand %xmm8, %xmm3 2230; SSE2-NEXT: psrlw $2, %xmm3 2231; SSE2-NEXT: por %xmm10, %xmm3 2232; SSE2-NEXT: pand %xmm3, %xmm6 2233; SSE2-NEXT: paddb %xmm6, %xmm6 2234; SSE2-NEXT: pand %xmm9, %xmm3 2235; SSE2-NEXT: psrlw $1, %xmm3 2236; SSE2-NEXT: por %xmm6, %xmm3 2237; SSE2-NEXT: retq 2238; 2239; SSSE3-LABEL: test_bitreverse_v32i16: 2240; SSSE3: # %bb.0: 2241; SSSE3-NEXT: movdqa %xmm1, %xmm5 2242; SSSE3-NEXT: movdqa %xmm0, %xmm1 2243; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2244; SSSE3-NEXT: pshufb %xmm8, %xmm1 2245; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2246; SSSE3-NEXT: movdqa %xmm1, %xmm0 2247; SSSE3-NEXT: pand %xmm9, %xmm0 2248; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2249; SSSE3-NEXT: movdqa %xmm7, %xmm6 2250; SSSE3-NEXT: pshufb %xmm0, %xmm6 2251; SSSE3-NEXT: psrlw $4, %xmm1 2252; SSSE3-NEXT: pand %xmm9, %xmm1 2253; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2254; SSSE3-NEXT: movdqa %xmm4, %xmm0 2255; SSSE3-NEXT: pshufb %xmm1, %xmm0 2256; SSSE3-NEXT: por %xmm6, %xmm0 2257; 
SSSE3-NEXT: pshufb %xmm8, %xmm5 2258; SSSE3-NEXT: movdqa %xmm5, %xmm1 2259; SSSE3-NEXT: pand %xmm9, %xmm1 2260; SSSE3-NEXT: movdqa %xmm7, %xmm6 2261; SSSE3-NEXT: pshufb %xmm1, %xmm6 2262; SSSE3-NEXT: psrlw $4, %xmm5 2263; SSSE3-NEXT: pand %xmm9, %xmm5 2264; SSSE3-NEXT: movdqa %xmm4, %xmm1 2265; SSSE3-NEXT: pshufb %xmm5, %xmm1 2266; SSSE3-NEXT: por %xmm6, %xmm1 2267; SSSE3-NEXT: pshufb %xmm8, %xmm2 2268; SSSE3-NEXT: movdqa %xmm2, %xmm5 2269; SSSE3-NEXT: pand %xmm9, %xmm5 2270; SSSE3-NEXT: movdqa %xmm7, %xmm6 2271; SSSE3-NEXT: pshufb %xmm5, %xmm6 2272; SSSE3-NEXT: psrlw $4, %xmm2 2273; SSSE3-NEXT: pand %xmm9, %xmm2 2274; SSSE3-NEXT: movdqa %xmm4, %xmm5 2275; SSSE3-NEXT: pshufb %xmm2, %xmm5 2276; SSSE3-NEXT: por %xmm6, %xmm5 2277; SSSE3-NEXT: pshufb %xmm8, %xmm3 2278; SSSE3-NEXT: movdqa %xmm3, %xmm2 2279; SSSE3-NEXT: pand %xmm9, %xmm2 2280; SSSE3-NEXT: pshufb %xmm2, %xmm7 2281; SSSE3-NEXT: psrlw $4, %xmm3 2282; SSSE3-NEXT: pand %xmm9, %xmm3 2283; SSSE3-NEXT: pshufb %xmm3, %xmm4 2284; SSSE3-NEXT: por %xmm7, %xmm4 2285; SSSE3-NEXT: movdqa %xmm5, %xmm2 2286; SSSE3-NEXT: movdqa %xmm4, %xmm3 2287; SSSE3-NEXT: retq 2288; 2289; AVX1-LABEL: test_bitreverse_v32i16: 2290; AVX1: # %bb.0: 2291; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2292; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2293; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2294; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2295; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2296; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2297; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2298; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2299; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2300; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2301; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2302; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2303; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2304; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2305; 
AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2306; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2307; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2308; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2309; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2310; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2311; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2312; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2313; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2314; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2315; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2316; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2317; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2318; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2319; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2320; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2321; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2322; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2323; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2324; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2325; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2326; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2327; AVX1-NEXT: retq 2328; 2329; AVX2-LABEL: test_bitreverse_v32i16: 2330; AVX2: # %bb.0: 2331; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2332; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2333; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2334; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2335; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2336; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2337; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2338; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2339; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2340; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2341; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2342; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2343; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2344; 
AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2345; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2346; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2347; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2348; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2349; AVX2-NEXT: retq 2350; 2351; AVX512F-LABEL: test_bitreverse_v32i16: 2352; AVX512F: # %bb.0: 2353; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2354; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2355; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2356; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2357; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 2358; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2359; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2360; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2361; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 2362; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2363; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 2364; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2365; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 2366; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2367; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 2368; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2369; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 2370; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 2371; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2372; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 2373; AVX512F-NEXT: retq 2374; 2375; AVX512BW-LABEL: test_bitreverse_v32i16: 2376; AVX512BW: # %bb.0: 2377; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 2378; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2379; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2380; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2381; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2382; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2383; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2384; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2385; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2386; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2387; AVX512BW-NEXT: retq 2388; 2389; XOPAVX1-LABEL: test_bitreverse_v32i16: 2390; XOPAVX1: # %bb.0: 2391; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2392; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 2393; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2394; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2395; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2396; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2397; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2398; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2399; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2400; XOPAVX1-NEXT: retq 2401; 2402; XOPAVX2-LABEL: test_bitreverse_v32i16: 2403; XOPAVX2: # %bb.0: 2404; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2405; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 2406; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2407; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2408; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2409; 
XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2410; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2411; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2412; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2413; XOPAVX2-NEXT: retq 2414; 2415; GFNISSE-LABEL: test_bitreverse_v32i16: 2416; GFNISSE: # %bb.0: 2417; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2418; GFNISSE-NEXT: pshufb %xmm4, %xmm0 2419; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] 2420; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 2421; GFNISSE-NEXT: pshufb %xmm4, %xmm1 2422; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 2423; GFNISSE-NEXT: pshufb %xmm4, %xmm2 2424; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 2425; GFNISSE-NEXT: pshufb %xmm4, %xmm3 2426; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 2427; GFNISSE-NEXT: retq 2428; 2429; GFNIAVX-LABEL: test_bitreverse_v32i16: 2430; GFNIAVX: # %bb.0: 2431; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm2 2432; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2433; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2434; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 2435; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2436; GFNIAVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2437; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0 2438; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2439; GFNIAVX-NEXT: vextractf128 $1, %ymm1, %xmm2 2440; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2441; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2442; GFNIAVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2443; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1 2444; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2445; GFNIAVX-NEXT: retq 2446; 2447; GFNIAVX2-LABEL: test_bitreverse_v32i16: 2448; GFNIAVX2: # %bb.0: 2449; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2450; GFNIAVX2-NEXT: 
vpshufb %ymm2, %ymm0, %ymm0 2451; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2452; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2453; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2454; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2455; GFNIAVX2-NEXT: retq 2456; 2457; GFNIAVX512F-LABEL: test_bitreverse_v32i16: 2458; GFNIAVX512F: # %bb.0: 2459; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2460; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2461; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2462; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2463; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2464; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2465; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2466; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2467; GFNIAVX512F-NEXT: retq 2468; 2469; GFNIAVX512BW-LABEL: test_bitreverse_v32i16: 2470; GFNIAVX512BW: # %bb.0: 2471; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 2472; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0 2473; GFNIAVX512BW-NEXT: retq 2474 %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) 2475 ret <32 x i16> %b 2476} 2477 2478define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { 2479; SSE2-LABEL: test_bitreverse_v16i32: 2480; SSE2: # %bb.0: 2481; SSE2-NEXT: movdqa %xmm3, %xmm11 2482; SSE2-NEXT: pxor %xmm10, %xmm10 2483; SSE2-NEXT: movdqa %xmm0, %xmm3 2484; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] 2485; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 2486; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 2487; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 2488; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 2489; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 2490; SSE2-NEXT: packuswb %xmm3, %xmm0 2491; SSE2-NEXT: movdqa %xmm0, %xmm5 2492; SSE2-NEXT: psllw $4, %xmm5 2493; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2494; SSE2-NEXT: movdqa %xmm3, %xmm7 2495; SSE2-NEXT: pandn %xmm5, %xmm7 2496; SSE2-NEXT: psrlw $4, %xmm0 2497; SSE2-NEXT: pand %xmm3, %xmm0 2498; SSE2-NEXT: por %xmm7, %xmm0 2499; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2500; SSE2-NEXT: movdqa %xmm0, %xmm7 2501; SSE2-NEXT: pand %xmm5, %xmm7 2502; SSE2-NEXT: psllw $2, %xmm7 2503; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 2504; SSE2-NEXT: pand %xmm8, %xmm0 2505; SSE2-NEXT: psrlw $2, %xmm0 2506; SSE2-NEXT: por %xmm7, %xmm0 2507; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2508; SSE2-NEXT: movdqa %xmm0, %xmm6 2509; SSE2-NEXT: pand %xmm7, %xmm6 2510; SSE2-NEXT: paddb %xmm6, %xmm6 2511; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 2512; SSE2-NEXT: pand %xmm9, %xmm0 2513; SSE2-NEXT: psrlw $1, %xmm0 2514; SSE2-NEXT: por %xmm6, %xmm0 2515; SSE2-NEXT: movdqa %xmm1, %xmm6 2516; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] 2517; 
SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 2518; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 2519; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] 2520; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2521; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2522; SSE2-NEXT: packuswb %xmm6, %xmm1 2523; SSE2-NEXT: movdqa %xmm1, %xmm6 2524; SSE2-NEXT: psllw $4, %xmm6 2525; SSE2-NEXT: movdqa %xmm3, %xmm4 2526; SSE2-NEXT: pandn %xmm6, %xmm4 2527; SSE2-NEXT: psrlw $4, %xmm1 2528; SSE2-NEXT: pand %xmm3, %xmm1 2529; SSE2-NEXT: por %xmm4, %xmm1 2530; SSE2-NEXT: movdqa %xmm1, %xmm4 2531; SSE2-NEXT: pand %xmm5, %xmm4 2532; SSE2-NEXT: psllw $2, %xmm4 2533; SSE2-NEXT: pand %xmm8, %xmm1 2534; SSE2-NEXT: psrlw $2, %xmm1 2535; SSE2-NEXT: por %xmm4, %xmm1 2536; SSE2-NEXT: movdqa %xmm1, %xmm4 2537; SSE2-NEXT: pand %xmm7, %xmm4 2538; SSE2-NEXT: paddb %xmm4, %xmm4 2539; SSE2-NEXT: pand %xmm9, %xmm1 2540; SSE2-NEXT: psrlw $1, %xmm1 2541; SSE2-NEXT: por %xmm4, %xmm1 2542; SSE2-NEXT: movdqa %xmm2, %xmm4 2543; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] 2544; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2545; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2546; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] 2547; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 2548; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 2549; SSE2-NEXT: packuswb %xmm4, %xmm2 2550; SSE2-NEXT: movdqa %xmm2, %xmm4 2551; SSE2-NEXT: psllw $4, %xmm4 2552; SSE2-NEXT: movdqa %xmm3, %xmm6 2553; SSE2-NEXT: pandn %xmm4, %xmm6 2554; SSE2-NEXT: psrlw $4, %xmm2 
2555; SSE2-NEXT: pand %xmm3, %xmm2 2556; SSE2-NEXT: por %xmm6, %xmm2 2557; SSE2-NEXT: movdqa %xmm2, %xmm4 2558; SSE2-NEXT: pand %xmm5, %xmm4 2559; SSE2-NEXT: psllw $2, %xmm4 2560; SSE2-NEXT: pand %xmm8, %xmm2 2561; SSE2-NEXT: psrlw $2, %xmm2 2562; SSE2-NEXT: por %xmm4, %xmm2 2563; SSE2-NEXT: movdqa %xmm2, %xmm4 2564; SSE2-NEXT: pand %xmm7, %xmm4 2565; SSE2-NEXT: paddb %xmm4, %xmm4 2566; SSE2-NEXT: pand %xmm9, %xmm2 2567; SSE2-NEXT: psrlw $1, %xmm2 2568; SSE2-NEXT: por %xmm4, %xmm2 2569; SSE2-NEXT: movdqa %xmm11, %xmm4 2570; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] 2571; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2572; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2573; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] 2574; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[3,2,1,0,4,5,6,7] 2575; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 2576; SSE2-NEXT: packuswb %xmm4, %xmm6 2577; SSE2-NEXT: movdqa %xmm6, %xmm4 2578; SSE2-NEXT: psllw $4, %xmm4 2579; SSE2-NEXT: psrlw $4, %xmm6 2580; SSE2-NEXT: pand %xmm3, %xmm6 2581; SSE2-NEXT: pandn %xmm4, %xmm3 2582; SSE2-NEXT: por %xmm6, %xmm3 2583; SSE2-NEXT: pand %xmm3, %xmm5 2584; SSE2-NEXT: psllw $2, %xmm5 2585; SSE2-NEXT: pand %xmm8, %xmm3 2586; SSE2-NEXT: psrlw $2, %xmm3 2587; SSE2-NEXT: por %xmm5, %xmm3 2588; SSE2-NEXT: pand %xmm3, %xmm7 2589; SSE2-NEXT: paddb %xmm7, %xmm7 2590; SSE2-NEXT: pand %xmm9, %xmm3 2591; SSE2-NEXT: psrlw $1, %xmm3 2592; SSE2-NEXT: por %xmm7, %xmm3 2593; SSE2-NEXT: retq 2594; 2595; SSSE3-LABEL: test_bitreverse_v16i32: 2596; SSSE3: # %bb.0: 2597; SSSE3-NEXT: movdqa %xmm1, %xmm5 2598; SSSE3-NEXT: movdqa %xmm0, %xmm1 2599; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 
2600; SSSE3-NEXT: pshufb %xmm8, %xmm1 2601; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2602; SSSE3-NEXT: movdqa %xmm1, %xmm0 2603; SSSE3-NEXT: pand %xmm9, %xmm0 2604; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2605; SSSE3-NEXT: movdqa %xmm7, %xmm6 2606; SSSE3-NEXT: pshufb %xmm0, %xmm6 2607; SSSE3-NEXT: psrlw $4, %xmm1 2608; SSSE3-NEXT: pand %xmm9, %xmm1 2609; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2610; SSSE3-NEXT: movdqa %xmm4, %xmm0 2611; SSSE3-NEXT: pshufb %xmm1, %xmm0 2612; SSSE3-NEXT: por %xmm6, %xmm0 2613; SSSE3-NEXT: pshufb %xmm8, %xmm5 2614; SSSE3-NEXT: movdqa %xmm5, %xmm1 2615; SSSE3-NEXT: pand %xmm9, %xmm1 2616; SSSE3-NEXT: movdqa %xmm7, %xmm6 2617; SSSE3-NEXT: pshufb %xmm1, %xmm6 2618; SSSE3-NEXT: psrlw $4, %xmm5 2619; SSSE3-NEXT: pand %xmm9, %xmm5 2620; SSSE3-NEXT: movdqa %xmm4, %xmm1 2621; SSSE3-NEXT: pshufb %xmm5, %xmm1 2622; SSSE3-NEXT: por %xmm6, %xmm1 2623; SSSE3-NEXT: pshufb %xmm8, %xmm2 2624; SSSE3-NEXT: movdqa %xmm2, %xmm5 2625; SSSE3-NEXT: pand %xmm9, %xmm5 2626; SSSE3-NEXT: movdqa %xmm7, %xmm6 2627; SSSE3-NEXT: pshufb %xmm5, %xmm6 2628; SSSE3-NEXT: psrlw $4, %xmm2 2629; SSSE3-NEXT: pand %xmm9, %xmm2 2630; SSSE3-NEXT: movdqa %xmm4, %xmm5 2631; SSSE3-NEXT: pshufb %xmm2, %xmm5 2632; SSSE3-NEXT: por %xmm6, %xmm5 2633; SSSE3-NEXT: pshufb %xmm8, %xmm3 2634; SSSE3-NEXT: movdqa %xmm3, %xmm2 2635; SSSE3-NEXT: pand %xmm9, %xmm2 2636; SSSE3-NEXT: pshufb %xmm2, %xmm7 2637; SSSE3-NEXT: psrlw $4, %xmm3 2638; SSSE3-NEXT: pand %xmm9, %xmm3 2639; SSSE3-NEXT: pshufb %xmm3, %xmm4 2640; SSSE3-NEXT: por %xmm7, %xmm4 2641; SSSE3-NEXT: movdqa %xmm5, %xmm2 2642; SSSE3-NEXT: movdqa %xmm4, %xmm3 2643; SSSE3-NEXT: retq 2644; 2645; AVX1-LABEL: test_bitreverse_v16i32: 2646; AVX1: # %bb.0: 2647; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2648; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2649; AVX1-NEXT: vpshufb %xmm3, 
%xmm2, %xmm2 2650; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2651; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2652; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2653; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2654; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2655; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2656; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2657; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2658; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2659; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2660; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2661; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2662; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2663; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2664; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2665; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2666; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2667; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2668; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2669; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2670; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2671; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2672; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2673; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2674; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2675; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2676; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2677; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2678; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2679; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2680; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2681; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2682; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2683; AVX1-NEXT: retq 2684; 2685; AVX2-LABEL: test_bitreverse_v16i32: 2686; AVX2: # %bb.0: 2687; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2688; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2689; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2690; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2691; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2692; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2693; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2694; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2695; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2696; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2697; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2698; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2699; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2700; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2701; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2702; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2703; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2704; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2705; AVX2-NEXT: retq 2706; 2707; AVX512F-LABEL: test_bitreverse_v16i32: 2708; AVX512F: # %bb.0: 2709; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2710; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2711; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2712; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2713; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 2714; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2715; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2716; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2717; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 2718; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2719; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 2720; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2721; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 2722; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = 
[0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2723; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 2724; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2725; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 2726; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 2727; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2728; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 2729; AVX512F-NEXT: retq 2730; 2731; AVX512BW-LABEL: test_bitreverse_v16i32: 2732; AVX512BW: # %bb.0: 2733; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 2734; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2735; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2736; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2737; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2738; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2739; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2740; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2741; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2742; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2743; AVX512BW-NEXT: retq 2744; 2745; XOPAVX1-LABEL: test_bitreverse_v16i32: 2746; XOPAVX1: # %bb.0: 2747; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2748; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2749; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, 
%xmm2 2750; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2751; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2752; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2753; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2754; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2755; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2756; XOPAVX1-NEXT: retq 2757; 2758; XOPAVX2-LABEL: test_bitreverse_v16i32: 2759; XOPAVX2: # %bb.0: 2760; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2761; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2762; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2763; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2764; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2765; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2766; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2767; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2768; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2769; XOPAVX2-NEXT: retq 2770; 2771; GFNISSE-LABEL: test_bitreverse_v16i32: 2772; GFNISSE: # %bb.0: 2773; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2774; GFNISSE-NEXT: pshufb %xmm4, %xmm0 2775; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] 2776; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 2777; GFNISSE-NEXT: pshufb %xmm4, %xmm1 2778; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 2779; GFNISSE-NEXT: pshufb %xmm4, %xmm2 2780; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 2781; GFNISSE-NEXT: pshufb %xmm4, %xmm3 2782; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 2783; GFNISSE-NEXT: retq 2784; 2785; GFNIAVX-LABEL: test_bitreverse_v16i32: 2786; GFNIAVX: # %bb.0: 2787; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm2 2788; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2789; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2790; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 2791; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, 
%xmm2, %xmm2 2792; GFNIAVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2793; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0 2794; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2795; GFNIAVX-NEXT: vextractf128 $1, %ymm1, %xmm2 2796; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2797; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2798; GFNIAVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2799; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1 2800; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2801; GFNIAVX-NEXT: retq 2802; 2803; GFNIAVX2-LABEL: test_bitreverse_v16i32: 2804; GFNIAVX2: # %bb.0: 2805; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2806; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2807; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2808; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2809; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2810; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2811; GFNIAVX2-NEXT: retq 2812; 2813; GFNIAVX512F-LABEL: test_bitreverse_v16i32: 2814; GFNIAVX512F: # %bb.0: 2815; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2816; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2817; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2818; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2819; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2820; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2821; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2822; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2823; GFNIAVX512F-NEXT: retq 2824; 2825; GFNIAVX512BW-LABEL: test_bitreverse_v16i32: 2826; GFNIAVX512BW: # %bb.0: 2827; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 2828; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0 2829; GFNIAVX512BW-NEXT: retq 2830 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) 2831 ret <16 x i32> %b 2832} 2833 2834define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { 2835; SSE2-LABEL: test_bitreverse_v8i64: 2836; SSE2: # %bb.0: 2837; SSE2-NEXT: movdqa %xmm3, %xmm11 2838; SSE2-NEXT: pxor %xmm10, %xmm10 2839; SSE2-NEXT: movdqa %xmm0, %xmm3 2840; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] 2841; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 2842; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 2843; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 2844; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 2845; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2846; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 2847; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 2848; SSE2-NEXT: packuswb %xmm3, %xmm0 2849; SSE2-NEXT: movdqa %xmm0, %xmm5 2850; SSE2-NEXT: psllw $4, %xmm5 2851; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2852; SSE2-NEXT: movdqa %xmm3, %xmm7 2853; SSE2-NEXT: pandn %xmm5, %xmm7 2854; SSE2-NEXT: psrlw $4, %xmm0 2855; SSE2-NEXT: pand %xmm3, %xmm0 2856; SSE2-NEXT: por %xmm7, %xmm0 2857; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2858; SSE2-NEXT: movdqa %xmm0, %xmm7 2859; SSE2-NEXT: pand %xmm5, %xmm7 2860; SSE2-NEXT: psllw $2, %xmm7 2861; SSE2-NEXT: movdqa {{.*#+}} xmm8 = 
[204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 2862; SSE2-NEXT: pand %xmm8, %xmm0 2863; SSE2-NEXT: psrlw $2, %xmm0 2864; SSE2-NEXT: por %xmm7, %xmm0 2865; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2866; SSE2-NEXT: movdqa %xmm0, %xmm6 2867; SSE2-NEXT: pand %xmm7, %xmm6 2868; SSE2-NEXT: paddb %xmm6, %xmm6 2869; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 2870; SSE2-NEXT: pand %xmm9, %xmm0 2871; SSE2-NEXT: psrlw $1, %xmm0 2872; SSE2-NEXT: por %xmm6, %xmm0 2873; SSE2-NEXT: movdqa %xmm1, %xmm6 2874; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] 2875; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] 2876; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 2877; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 2878; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] 2879; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2880; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2881; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2882; SSE2-NEXT: packuswb %xmm6, %xmm1 2883; SSE2-NEXT: movdqa %xmm1, %xmm6 2884; SSE2-NEXT: psllw $4, %xmm6 2885; SSE2-NEXT: movdqa %xmm3, %xmm4 2886; SSE2-NEXT: pandn %xmm6, %xmm4 2887; SSE2-NEXT: psrlw $4, %xmm1 2888; SSE2-NEXT: pand %xmm3, %xmm1 2889; SSE2-NEXT: por %xmm4, %xmm1 2890; SSE2-NEXT: movdqa %xmm1, %xmm4 2891; SSE2-NEXT: pand %xmm5, %xmm4 2892; SSE2-NEXT: psllw $2, %xmm4 2893; SSE2-NEXT: pand %xmm8, %xmm1 2894; SSE2-NEXT: psrlw $2, %xmm1 2895; SSE2-NEXT: por %xmm4, %xmm1 2896; SSE2-NEXT: movdqa %xmm1, %xmm4 2897; SSE2-NEXT: pand %xmm7, %xmm4 2898; SSE2-NEXT: paddb %xmm4, %xmm4 2899; SSE2-NEXT: pand %xmm9, %xmm1 2900; SSE2-NEXT: psrlw 
$1, %xmm1 2901; SSE2-NEXT: por %xmm4, %xmm1 2902; SSE2-NEXT: movdqa %xmm2, %xmm4 2903; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] 2904; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 2905; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2906; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2907; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] 2908; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 2909; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 2910; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 2911; SSE2-NEXT: packuswb %xmm4, %xmm2 2912; SSE2-NEXT: movdqa %xmm2, %xmm4 2913; SSE2-NEXT: psllw $4, %xmm4 2914; SSE2-NEXT: movdqa %xmm3, %xmm6 2915; SSE2-NEXT: pandn %xmm4, %xmm6 2916; SSE2-NEXT: psrlw $4, %xmm2 2917; SSE2-NEXT: pand %xmm3, %xmm2 2918; SSE2-NEXT: por %xmm6, %xmm2 2919; SSE2-NEXT: movdqa %xmm2, %xmm4 2920; SSE2-NEXT: pand %xmm5, %xmm4 2921; SSE2-NEXT: psllw $2, %xmm4 2922; SSE2-NEXT: pand %xmm8, %xmm2 2923; SSE2-NEXT: psrlw $2, %xmm2 2924; SSE2-NEXT: por %xmm4, %xmm2 2925; SSE2-NEXT: movdqa %xmm2, %xmm4 2926; SSE2-NEXT: pand %xmm7, %xmm4 2927; SSE2-NEXT: paddb %xmm4, %xmm4 2928; SSE2-NEXT: pand %xmm9, %xmm2 2929; SSE2-NEXT: psrlw $1, %xmm2 2930; SSE2-NEXT: por %xmm4, %xmm2 2931; SSE2-NEXT: movdqa %xmm11, %xmm4 2932; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] 2933; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 2934; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2935; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2936; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = 
xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] 2937; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[2,3,0,1] 2938; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 2939; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 2940; SSE2-NEXT: packuswb %xmm4, %xmm6 2941; SSE2-NEXT: movdqa %xmm6, %xmm4 2942; SSE2-NEXT: psllw $4, %xmm4 2943; SSE2-NEXT: psrlw $4, %xmm6 2944; SSE2-NEXT: pand %xmm3, %xmm6 2945; SSE2-NEXT: pandn %xmm4, %xmm3 2946; SSE2-NEXT: por %xmm6, %xmm3 2947; SSE2-NEXT: pand %xmm3, %xmm5 2948; SSE2-NEXT: psllw $2, %xmm5 2949; SSE2-NEXT: pand %xmm8, %xmm3 2950; SSE2-NEXT: psrlw $2, %xmm3 2951; SSE2-NEXT: por %xmm5, %xmm3 2952; SSE2-NEXT: pand %xmm3, %xmm7 2953; SSE2-NEXT: paddb %xmm7, %xmm7 2954; SSE2-NEXT: pand %xmm9, %xmm3 2955; SSE2-NEXT: psrlw $1, %xmm3 2956; SSE2-NEXT: por %xmm7, %xmm3 2957; SSE2-NEXT: retq 2958; 2959; SSSE3-LABEL: test_bitreverse_v8i64: 2960; SSSE3: # %bb.0: 2961; SSSE3-NEXT: movdqa %xmm1, %xmm5 2962; SSSE3-NEXT: movdqa %xmm0, %xmm1 2963; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2964; SSSE3-NEXT: pshufb %xmm8, %xmm1 2965; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2966; SSSE3-NEXT: movdqa %xmm1, %xmm0 2967; SSSE3-NEXT: pand %xmm9, %xmm0 2968; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2969; SSSE3-NEXT: movdqa %xmm7, %xmm6 2970; SSSE3-NEXT: pshufb %xmm0, %xmm6 2971; SSSE3-NEXT: psrlw $4, %xmm1 2972; SSSE3-NEXT: pand %xmm9, %xmm1 2973; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2974; SSSE3-NEXT: movdqa %xmm4, %xmm0 2975; SSSE3-NEXT: pshufb %xmm1, %xmm0 2976; SSSE3-NEXT: por %xmm6, %xmm0 2977; SSSE3-NEXT: pshufb %xmm8, %xmm5 2978; SSSE3-NEXT: movdqa %xmm5, %xmm1 2979; SSSE3-NEXT: pand %xmm9, %xmm1 2980; SSSE3-NEXT: movdqa %xmm7, %xmm6 2981; SSSE3-NEXT: pshufb %xmm1, %xmm6 2982; 
SSSE3-NEXT: psrlw $4, %xmm5 2983; SSSE3-NEXT: pand %xmm9, %xmm5 2984; SSSE3-NEXT: movdqa %xmm4, %xmm1 2985; SSSE3-NEXT: pshufb %xmm5, %xmm1 2986; SSSE3-NEXT: por %xmm6, %xmm1 2987; SSSE3-NEXT: pshufb %xmm8, %xmm2 2988; SSSE3-NEXT: movdqa %xmm2, %xmm5 2989; SSSE3-NEXT: pand %xmm9, %xmm5 2990; SSSE3-NEXT: movdqa %xmm7, %xmm6 2991; SSSE3-NEXT: pshufb %xmm5, %xmm6 2992; SSSE3-NEXT: psrlw $4, %xmm2 2993; SSSE3-NEXT: pand %xmm9, %xmm2 2994; SSSE3-NEXT: movdqa %xmm4, %xmm5 2995; SSSE3-NEXT: pshufb %xmm2, %xmm5 2996; SSSE3-NEXT: por %xmm6, %xmm5 2997; SSSE3-NEXT: pshufb %xmm8, %xmm3 2998; SSSE3-NEXT: movdqa %xmm3, %xmm2 2999; SSSE3-NEXT: pand %xmm9, %xmm2 3000; SSSE3-NEXT: pshufb %xmm2, %xmm7 3001; SSSE3-NEXT: psrlw $4, %xmm3 3002; SSSE3-NEXT: pand %xmm9, %xmm3 3003; SSSE3-NEXT: pshufb %xmm3, %xmm4 3004; SSSE3-NEXT: por %xmm7, %xmm4 3005; SSSE3-NEXT: movdqa %xmm5, %xmm2 3006; SSSE3-NEXT: movdqa %xmm4, %xmm3 3007; SSSE3-NEXT: retq 3008; 3009; AVX1-LABEL: test_bitreverse_v8i64: 3010; AVX1: # %bb.0: 3011; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3012; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3013; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3014; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3015; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 3016; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3017; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3018; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 3019; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 3020; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3021; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 3022; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 3023; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 3024; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 3025; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3026; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 3027; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 3028; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 3029; AVX1-NEXT: vpor %xmm0, 
%xmm5, %xmm0 3030; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3031; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3032; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3033; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 3034; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3035; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 3036; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 3037; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 3038; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 3039; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3040; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 3041; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 3042; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 3043; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 3044; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 3045; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 3046; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3047; AVX1-NEXT: retq 3048; 3049; AVX2-LABEL: test_bitreverse_v8i64: 3050; AVX2: # %bb.0: 3051; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3052; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3053; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3054; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 3055; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3056; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 3057; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 3058; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 3059; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3060; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 3061; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 3062; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3063; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 3064; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 3065; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 3066; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 3067; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 3068; AVX2-NEXT: vpor %ymm1, %ymm2, 
%ymm1 3069; AVX2-NEXT: retq 3070; 3071; AVX512F-LABEL: test_bitreverse_v8i64: 3072; AVX512F: # %bb.0: 3073; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 3074; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3075; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3076; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3077; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 3078; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3079; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 3080; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3081; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 3082; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 3083; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 3084; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 3085; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 3086; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3087; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 3088; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 3089; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 3090; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 3091; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 3092; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 3093; AVX512F-NEXT: retq 3094; 3095; AVX512BW-LABEL: test_bitreverse_v8i64: 3096; AVX512BW: # %bb.0: 3097; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] 3098; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3099; 
AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 3100; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3101; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 3102; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 3103; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 3104; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3105; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 3106; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 3107; AVX512BW-NEXT: retq 3108; 3109; XOPAVX1-LABEL: test_bitreverse_v8i64: 3110; XOPAVX1: # %bb.0: 3111; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3112; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 3113; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3114; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 3115; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3116; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3117; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3118; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 3119; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3120; XOPAVX1-NEXT: retq 3121; 3122; XOPAVX2-LABEL: test_bitreverse_v8i64: 3123; XOPAVX2: # %bb.0: 3124; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 3125; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 3126; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3127; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 3128; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 3129; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 3130; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3131; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 3132; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 
3133; XOPAVX2-NEXT: retq 3134; 3135; GFNISSE-LABEL: test_bitreverse_v8i64: 3136; GFNISSE: # %bb.0: 3137; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3138; GFNISSE-NEXT: pshufb %xmm4, %xmm0 3139; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] 3140; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 3141; GFNISSE-NEXT: pshufb %xmm4, %xmm1 3142; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 3143; GFNISSE-NEXT: pshufb %xmm4, %xmm2 3144; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 3145; GFNISSE-NEXT: pshufb %xmm4, %xmm3 3146; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 3147; GFNISSE-NEXT: retq 3148; 3149; GFNIAVX-LABEL: test_bitreverse_v8i64: 3150; GFNIAVX: # %bb.0: 3151; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm2 3152; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3153; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3154; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 3155; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 3156; GFNIAVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 3157; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0 3158; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3159; GFNIAVX-NEXT: vextractf128 $1, %ymm1, %xmm2 3160; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3161; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 3162; GFNIAVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3163; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1 3164; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3165; GFNIAVX-NEXT: retq 3166; 3167; GFNIAVX2-LABEL: test_bitreverse_v8i64: 3168; GFNIAVX2: # %bb.0: 3169; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3170; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3171; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 3172; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, 
%ymm0 3173; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3174; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 3175; GFNIAVX2-NEXT: retq 3176; 3177; GFNIAVX512F-LABEL: test_bitreverse_v8i64: 3178; GFNIAVX512F: # %bb.0: 3179; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 3180; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3181; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3182; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 3183; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 3184; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3185; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 3186; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 3187; GFNIAVX512F-NEXT: retq 3188; 3189; GFNIAVX512BW-LABEL: test_bitreverse_v8i64: 3190; GFNIAVX512BW: # %bb.0: 3191; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] 3192; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0 3193; GFNIAVX512BW-NEXT: retq 3194 %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) 3195 ret <8 x i64> %b 3196} 3197 3198; 3199; Constant Folding 3200; 3201 3202define i32 @fold_bitreverse_i32() nounwind { 3203; ALL-LABEL: fold_bitreverse_i32: 3204; ALL: # %bb.0: 3205; ALL-NEXT: movl $16711935, %eax # imm = 0xFF00FF 3206; ALL-NEXT: retq 3207 %b = call i32 @llvm.bitreverse.i32(i32 4278255360) 3208 ret i32 %b 3209} 3210 3211define <16 x i8> @fold_bitreverse_v16i8() nounwind { 3212; SSE-LABEL: fold_bitreverse_v16i8: 3213; SSE: # %bb.0: 3214; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3215; SSE-NEXT: retq 3216; 3217; AVX-LABEL: fold_bitreverse_v16i8: 3218; AVX: # %bb.0: 3219; 
AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3220; AVX-NEXT: retq 3221; 3222; XOP-LABEL: fold_bitreverse_v16i8: 3223; XOP: # %bb.0: 3224; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3225; XOP-NEXT: retq 3226; 3227; GFNISSE-LABEL: fold_bitreverse_v16i8: 3228; GFNISSE: # %bb.0: 3229; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3230; GFNISSE-NEXT: retq 3231; 3232; GFNIAVX-LABEL: fold_bitreverse_v16i8: 3233; GFNIAVX: # %bb.0: 3234; GFNIAVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3235; GFNIAVX-NEXT: retq 3236; 3237; GFNIAVX2-LABEL: fold_bitreverse_v16i8: 3238; GFNIAVX2: # %bb.0: 3239; GFNIAVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3240; GFNIAVX2-NEXT: retq 3241; 3242; GFNIAVX512F-LABEL: fold_bitreverse_v16i8: 3243; GFNIAVX512F: # %bb.0: 3244; GFNIAVX512F-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3245; GFNIAVX512F-NEXT: retq 3246; 3247; GFNIAVX512BW-LABEL: fold_bitreverse_v16i8: 3248; GFNIAVX512BW: # %bb.0: 3249; GFNIAVX512BW-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3250; GFNIAVX512BW-NEXT: retq 3251 %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>) 3252 ret <16 x i8> %b 3253} 3254 3255define <16 x i16> @fold_bitreverse_v16i16() nounwind { 3256; SSE-LABEL: fold_bitreverse_v16i16: 3257; SSE: # %bb.0: 3258; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959] 3259; SSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863] 3260; SSE-NEXT: retq 3261; 3262; AVX-LABEL: fold_bitreverse_v16i16: 3263; AVX: # %bb.0: 3264; AVX-NEXT: vmovaps {{.*#+}} ymm0 = 
[0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 3265; AVX-NEXT: retq 3266; 3267; XOP-LABEL: fold_bitreverse_v16i16: 3268; XOP: # %bb.0: 3269; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 3270; XOP-NEXT: retq 3271; 3272; GFNISSE-LABEL: fold_bitreverse_v16i16: 3273; GFNISSE: # %bb.0: 3274; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959] 3275; GFNISSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863] 3276; GFNISSE-NEXT: retq 3277; 3278; GFNIAVX-LABEL: fold_bitreverse_v16i16: 3279; GFNIAVX: # %bb.0: 3280; GFNIAVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 3281; GFNIAVX-NEXT: retq 3282; 3283; GFNIAVX2-LABEL: fold_bitreverse_v16i16: 3284; GFNIAVX2: # %bb.0: 3285; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 3286; GFNIAVX2-NEXT: retq 3287; 3288; GFNIAVX512F-LABEL: fold_bitreverse_v16i16: 3289; GFNIAVX512F: # %bb.0: 3290; GFNIAVX512F-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 3291; GFNIAVX512F-NEXT: retq 3292; 3293; GFNIAVX512BW-LABEL: fold_bitreverse_v16i16: 3294; GFNIAVX512BW: # %bb.0: 3295; GFNIAVX512BW-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 3296; GFNIAVX512BW-NEXT: retq 3297 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>) 3298 ret <16 x i16> %b 3299} 3300 3301define <16 x i32> @fold_bitreverse_v16i32() nounwind { 3302; SSE-LABEL: fold_bitreverse_v16i32: 3303; SSE: # %bb.0: 3304; SSE-NEXT: movaps {{.*#+}} xmm0 = 
[0,4294967295,1073741824,3221225471] 3305; SSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559] 3306; SSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015] 3307; SSE-NEXT: movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103] 3308; SSE-NEXT: retq 3309; 3310; AVX1-LABEL: fold_bitreverse_v16i32: 3311; AVX1: # %bb.0: 3312; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 3313; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3314; AVX1-NEXT: retq 3315; 3316; AVX2-LABEL: fold_bitreverse_v16i32: 3317; AVX2: # %bb.0: 3318; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 3319; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3320; AVX2-NEXT: retq 3321; 3322; AVX512-LABEL: fold_bitreverse_v16i32: 3323; AVX512: # %bb.0: 3324; AVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3325; AVX512-NEXT: retq 3326; 3327; XOP-LABEL: fold_bitreverse_v16i32: 3328; XOP: # %bb.0: 3329; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 3330; XOP-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3331; XOP-NEXT: retq 3332; 3333; GFNISSE-LABEL: fold_bitreverse_v16i32: 3334; GFNISSE: # %bb.0: 3335; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471] 3336; GFNISSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559] 3337; GFNISSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015] 3338; GFNISSE-NEXT: movaps 
{{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103] 3339; GFNISSE-NEXT: retq 3340; 3341; GFNIAVX-LABEL: fold_bitreverse_v16i32: 3342; GFNIAVX: # %bb.0: 3343; GFNIAVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 3344; GFNIAVX-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3345; GFNIAVX-NEXT: retq 3346; 3347; GFNIAVX2-LABEL: fold_bitreverse_v16i32: 3348; GFNIAVX2: # %bb.0: 3349; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 3350; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3351; GFNIAVX2-NEXT: retq 3352; 3353; GFNIAVX512F-LABEL: fold_bitreverse_v16i32: 3354; GFNIAVX512F: # %bb.0: 3355; GFNIAVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3356; GFNIAVX512F-NEXT: retq 3357; 3358; GFNIAVX512BW-LABEL: fold_bitreverse_v16i32: 3359; GFNIAVX512BW: # %bb.0: 3360; GFNIAVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3361; GFNIAVX512BW-NEXT: retq 3362 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>) 3363 ret <16 x i32> %b 3364} 3365 3366declare i8 @llvm.bitreverse.i8(i8) readnone 3367declare i16 @llvm.bitreverse.i16(i16) readnone 3368declare i32 @llvm.bitreverse.i32(i32) readnone 3369declare i64 @llvm.bitreverse.i64(i64) readnone 3370 3371declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone 3372declare <8 x i16> 
@llvm.bitreverse.v8i16(<8 x i16>) readnone 3373declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone 3374declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone 3375 3376declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone 3377declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone 3378declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone 3379declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone 3380 3381declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone 3382declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone 3383declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone 3384declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone 3385