; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNISSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512BW

; Make sure we don't crash with avx512bw and xop
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw

; Scalar i8 bit reversal. Every checked configuration without XOP expects the
; same general-purpose-register sequence (rotate by 4, then 2-bit and 1-bit
; swaps under the masks 51 and 85). The XOP configurations instead expect a
; vmovd / vpperm / vmovd round trip through xmm0.
define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    rolb $4, %dil
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andb $51, %al
; SSE-NEXT:    shlb $2, %al
; SSE-NEXT:    shrb $2, %dil
; SSE-NEXT:    andb $51, %dil
; SSE-NEXT:    orb %al, %dil
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andb $85, %al
; SSE-NEXT:    addb %al, %al
; SSE-NEXT:    shrb %dil
; SSE-NEXT:    andb $85, %dil
; SSE-NEXT:    addl %edi, %eax
; SSE-NEXT:    # kill: def $al killed $al killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    rolb $4, %dil
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andb $51, %al
; AVX-NEXT:    shlb $2, %al
; AVX-NEXT:    shrb $2, %dil
; AVX-NEXT:    andb $51, %dil
; AVX-NEXT:    orb %al, %dil
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andb $85, %al
; AVX-NEXT:    addb %al, %al
; AVX-NEXT:    shrb %dil
; AVX-NEXT:    andb $85, %dil
; AVX-NEXT:    addl %edi, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    # kill: def $al killed $al killed $eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNISSE-NEXT:    rolb $4, %dil
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andb $51, %al
; GFNISSE-NEXT:    shlb $2, %al
; GFNISSE-NEXT:    shrb $2, %dil
; GFNISSE-NEXT:    andb $51, %dil
; GFNISSE-NEXT:    orb %al, %dil
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andb $85, %al
; GFNISSE-NEXT:    addb %al, %al
; GFNISSE-NEXT:    shrb %dil
; GFNISSE-NEXT:    andb $85, %dil
; GFNISSE-NEXT:    addl %edi, %eax
; GFNISSE-NEXT:    # kill: def $al killed $al killed $eax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX-NEXT:    rolb $4, %dil
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andb $51, %al
; GFNIAVX-NEXT:    shlb $2, %al
; GFNIAVX-NEXT:    shrb $2, %dil
; GFNIAVX-NEXT:    andb $51, %dil
; GFNIAVX-NEXT:    orb %al, %dil
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andb $85, %al
; GFNIAVX-NEXT:    addb %al, %al
; GFNIAVX-NEXT:    shrb %dil
; GFNIAVX-NEXT:    andb $85, %dil
; GFNIAVX-NEXT:    addl %edi, %eax
; GFNIAVX-NEXT:    # kill: def $al killed $al killed $eax
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_i8:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX2-NEXT:    rolb $4, %dil
; GFNIAVX2-NEXT:    movl %edi, %eax
; GFNIAVX2-NEXT:    andb $51, %al
; GFNIAVX2-NEXT:    shlb $2, %al
; GFNIAVX2-NEXT:    shrb $2, %dil
; GFNIAVX2-NEXT:    andb $51, %dil
; GFNIAVX2-NEXT:    orb %al, %dil
; GFNIAVX2-NEXT:    movl %edi, %eax
; GFNIAVX2-NEXT:    andb $85, %al
; GFNIAVX2-NEXT:    addb %al, %al
; GFNIAVX2-NEXT:    shrb %dil
; GFNIAVX2-NEXT:    andb $85, %dil
; GFNIAVX2-NEXT:    addl %edi, %eax
; GFNIAVX2-NEXT:    # kill: def $al killed $al killed $eax
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i8:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX512F-NEXT:    rolb $4, %dil
; GFNIAVX512F-NEXT:    movl %edi, %eax
; GFNIAVX512F-NEXT:    andb $51, %al
; GFNIAVX512F-NEXT:    shlb $2, %al
; GFNIAVX512F-NEXT:    shrb $2, %dil
; GFNIAVX512F-NEXT:    andb $51, %dil
; GFNIAVX512F-NEXT:    orb %al, %dil
; GFNIAVX512F-NEXT:    movl %edi, %eax
; GFNIAVX512F-NEXT:    andb $85, %al
; GFNIAVX512F-NEXT:    addb %al, %al
; GFNIAVX512F-NEXT:    shrb %dil
; GFNIAVX512F-NEXT:    andb $85, %dil
; GFNIAVX512F-NEXT:    addl %edi, %eax
; GFNIAVX512F-NEXT:    # kill: def $al killed $al killed $eax
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i8:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX512BW-NEXT:    rolb $4, %dil
; GFNIAVX512BW-NEXT:    movl %edi, %eax
; GFNIAVX512BW-NEXT:    andb $51, %al
; GFNIAVX512BW-NEXT:    shlb $2, %al
; GFNIAVX512BW-NEXT:    shrb $2, %dil
; GFNIAVX512BW-NEXT:    andb $51, %dil
; GFNIAVX512BW-NEXT:    orb %al, %dil
; GFNIAVX512BW-NEXT:    movl %edi, %eax
; GFNIAVX512BW-NEXT:    andb $85, %al
; GFNIAVX512BW-NEXT:    addb %al, %al
; GFNIAVX512BW-NEXT:    shrb %dil
; GFNIAVX512BW-NEXT:    andb $85, %dil
; GFNIAVX512BW-NEXT:    addl %edi, %eax
; GFNIAVX512BW-NEXT:    # kill: def $al killed $al killed $eax
; GFNIAVX512BW-NEXT:    retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}

; Scalar i16 bit reversal. The configurations without XOP expect a rolw by 8
; followed by 4-, 2- and 1-bit swaps under the 0xF0F, 0x3333 and 0x5555 masks,
; performed in 32-bit registers. The XOP configurations expect vpperm.
define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    rolw $8, %di
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $3855, %eax # imm = 0xF0F
; SSE-NEXT:    shll $4, %eax
; SSE-NEXT:    shrl $4, %edi
; SSE-NEXT:    andl $3855, %edi # imm = 0xF0F
; SSE-NEXT:    orl %eax, %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $13107, %eax # imm = 0x3333
; SSE-NEXT:    shrl $2, %edi
; SSE-NEXT:    andl $13107, %edi # imm = 0x3333
; SSE-NEXT:    leal (%rdi,%rax,4), %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $21845, %ecx # imm = 0x5555
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    andl $21845, %eax # imm = 0x5555
; SSE-NEXT:    leal (%rax,%rcx,2), %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    rolw $8, %di
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $3855, %eax # imm = 0xF0F
; AVX-NEXT:    shll $4, %eax
; AVX-NEXT:    shrl $4, %edi
; AVX-NEXT:    andl $3855, %edi # imm = 0xF0F
; AVX-NEXT:    orl %eax, %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $13107, %eax # imm = 0x3333
; AVX-NEXT:    shrl $2, %edi
; AVX-NEXT:    andl $13107, %edi # imm = 0x3333
; AVX-NEXT:    leal (%rdi,%rax,4), %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $21845, %ecx # imm = 0x5555
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    andl $21845, %eax # imm = 0x5555
; AVX-NEXT:    leal (%rax,%rcx,2), %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    # kill: def $ax killed $ax killed $eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNISSE-NEXT:    rolw $8, %di
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andl $3855, %eax # imm = 0xF0F
; GFNISSE-NEXT:    shll $4, %eax
; GFNISSE-NEXT:    shrl $4, %edi
; GFNISSE-NEXT:    andl $3855, %edi # imm = 0xF0F
; GFNISSE-NEXT:    orl %eax, %edi
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andl $13107, %eax # imm = 0x3333
; GFNISSE-NEXT:    shrl $2, %edi
; GFNISSE-NEXT:    andl $13107, %edi # imm = 0x3333
; GFNISSE-NEXT:    leal (%rdi,%rax,4), %eax
; GFNISSE-NEXT:    movl %eax, %ecx
; GFNISSE-NEXT:    andl $21845, %ecx # imm = 0x5555
; GFNISSE-NEXT:    shrl %eax
; GFNISSE-NEXT:    andl $21845, %eax # imm = 0x5555
; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
; GFNISSE-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX-NEXT:    rolw $8, %di
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andl $3855, %eax # imm = 0xF0F
; GFNIAVX-NEXT:    shll $4, %eax
; GFNIAVX-NEXT:    shrl $4, %edi
; GFNIAVX-NEXT:    andl $3855, %edi # imm = 0xF0F
; GFNIAVX-NEXT:    orl %eax, %edi
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andl $13107, %eax # imm = 0x3333
; GFNIAVX-NEXT:    shrl $2, %edi
; GFNIAVX-NEXT:    andl $13107, %edi # imm = 0x3333
; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX-NEXT:    movl %eax, %ecx
; GFNIAVX-NEXT:    andl $21845, %ecx # imm = 0x5555
; GFNIAVX-NEXT:    shrl %eax
; GFNIAVX-NEXT:    andl $21845, %eax # imm = 0x5555
; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_i16:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX2-NEXT:    rolw $8, %di
; GFNIAVX2-NEXT:    movl %edi, %eax
; GFNIAVX2-NEXT:    andl $3855, %eax # imm = 0xF0F
; GFNIAVX2-NEXT:    shll $4, %eax
; GFNIAVX2-NEXT:    shrl $4, %edi
; GFNIAVX2-NEXT:    andl $3855, %edi # imm = 0xF0F
; GFNIAVX2-NEXT:    orl %eax, %edi
; GFNIAVX2-NEXT:    movl %edi, %eax
; GFNIAVX2-NEXT:    andl $13107, %eax # imm = 0x3333
; GFNIAVX2-NEXT:    shrl $2, %edi
; GFNIAVX2-NEXT:    andl $13107, %edi # imm = 0x3333
; GFNIAVX2-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX2-NEXT:    movl %eax, %ecx
; GFNIAVX2-NEXT:    andl $21845, %ecx # imm = 0x5555
; GFNIAVX2-NEXT:    shrl %eax
; GFNIAVX2-NEXT:    andl $21845, %eax # imm = 0x5555
; GFNIAVX2-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i16:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX512F-NEXT:    rolw $8, %di
; GFNIAVX512F-NEXT:    movl %edi, %eax
; GFNIAVX512F-NEXT:    andl $3855, %eax # imm = 0xF0F
; GFNIAVX512F-NEXT:    shll $4, %eax
; GFNIAVX512F-NEXT:    shrl $4, %edi
; GFNIAVX512F-NEXT:    andl $3855, %edi # imm = 0xF0F
; GFNIAVX512F-NEXT:    orl %eax, %edi
; GFNIAVX512F-NEXT:    movl %edi, %eax
; GFNIAVX512F-NEXT:    andl $13107, %eax # imm = 0x3333
; GFNIAVX512F-NEXT:    shrl $2, %edi
; GFNIAVX512F-NEXT:    andl $13107, %edi # imm = 0x3333
; GFNIAVX512F-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX512F-NEXT:    movl %eax, %ecx
; GFNIAVX512F-NEXT:    andl $21845, %ecx # imm = 0x5555
; GFNIAVX512F-NEXT:    shrl %eax
; GFNIAVX512F-NEXT:    andl $21845, %eax # imm = 0x5555
; GFNIAVX512F-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX512F-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i16:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX512BW-NEXT:    rolw $8, %di
; GFNIAVX512BW-NEXT:    movl %edi, %eax
; GFNIAVX512BW-NEXT:    andl $3855, %eax # imm = 0xF0F
; GFNIAVX512BW-NEXT:    shll $4, %eax
; GFNIAVX512BW-NEXT:    shrl $4, %edi
; GFNIAVX512BW-NEXT:    andl $3855, %edi # imm = 0xF0F
; GFNIAVX512BW-NEXT:    orl %eax, %edi
; GFNIAVX512BW-NEXT:    movl %edi, %eax
; GFNIAVX512BW-NEXT:    andl $13107, %eax # imm = 0x3333
; GFNIAVX512BW-NEXT:    shrl $2, %edi
; GFNIAVX512BW-NEXT:    andl $13107, %edi # imm = 0x3333
; GFNIAVX512BW-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX512BW-NEXT:    movl %eax, %ecx
; GFNIAVX512BW-NEXT:    andl $21845, %ecx # imm = 0x5555
; GFNIAVX512BW-NEXT:    shrl %eax
; GFNIAVX512BW-NEXT:    andl $21845, %eax # imm = 0x5555
; GFNIAVX512BW-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNIAVX512BW-NEXT:    retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}

; Scalar i32 bit reversal. The configurations without XOP expect bswapl followed
; by 4-, 2- and 1-bit swaps under the 0xF0F0F0F, 0x33333333 and 0x55555555
; masks. The XOP configurations expect vpperm.
define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    bswapl %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; SSE-NEXT:    shll $4, %eax
; SSE-NEXT:    shrl $4, %edi
; SSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; SSE-NEXT:    orl %eax, %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
; SSE-NEXT:    shrl $2, %edi
; SSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
; SSE-NEXT:    leal (%rdi,%rax,4), %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; SSE-NEXT:    leal (%rax,%rcx,2), %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    bswapl %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; AVX-NEXT:    shll $4, %eax
; AVX-NEXT:    shrl $4, %edi
; AVX-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; AVX-NEXT:    orl %eax, %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
; AVX-NEXT:    shrl $2, %edi
; AVX-NEXT:    andl $858993459, %edi # imm = 0x33333333
; AVX-NEXT:    leal (%rdi,%rax,4), %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; AVX-NEXT:    leal (%rax,%rcx,2), %eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNISSE-NEXT:    bswapl %edi
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; GFNISSE-NEXT:    shll $4, %eax
; GFNISSE-NEXT:    shrl $4, %edi
; GFNISSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; GFNISSE-NEXT:    orl %eax, %edi
; GFNISSE-NEXT:    movl %edi, %eax
; GFNISSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
; GFNISSE-NEXT:    shrl $2, %edi
; GFNISSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
; GFNISSE-NEXT:    leal (%rdi,%rax,4), %eax
; GFNISSE-NEXT:    movl %eax, %ecx
; GFNISSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; GFNISSE-NEXT:    shrl %eax
; GFNISSE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; GFNISSE-NEXT:    leal (%rax,%rcx,2), %eax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i32:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX-NEXT:    bswapl %edi
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX-NEXT:    shll $4, %eax
; GFNIAVX-NEXT:    shrl $4, %edi
; GFNIAVX-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; GFNIAVX-NEXT:    orl %eax, %edi
; GFNIAVX-NEXT:    movl %edi, %eax
; GFNIAVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
; GFNIAVX-NEXT:    shrl $2, %edi
; GFNIAVX-NEXT:    andl $858993459, %edi # imm = 0x33333333
; GFNIAVX-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX-NEXT:    movl %eax, %ecx
; GFNIAVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; GFNIAVX-NEXT:    shrl %eax
; GFNIAVX-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; GFNIAVX-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_i32:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX2-NEXT:    bswapl %edi
; GFNIAVX2-NEXT:    movl %edi, %eax
; GFNIAVX2-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX2-NEXT:    shll $4, %eax
; GFNIAVX2-NEXT:    shrl $4, %edi
; GFNIAVX2-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; GFNIAVX2-NEXT:    orl %eax, %edi
; GFNIAVX2-NEXT:    movl %edi, %eax
; GFNIAVX2-NEXT:    andl $858993459, %eax # imm = 0x33333333
; GFNIAVX2-NEXT:    shrl $2, %edi
; GFNIAVX2-NEXT:    andl $858993459, %edi # imm = 0x33333333
; GFNIAVX2-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX2-NEXT:    movl %eax, %ecx
; GFNIAVX2-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; GFNIAVX2-NEXT:    shrl %eax
; GFNIAVX2-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; GFNIAVX2-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i32:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX512F-NEXT:    bswapl %edi
; GFNIAVX512F-NEXT:    movl %edi, %eax
; GFNIAVX512F-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX512F-NEXT:    shll $4, %eax
; GFNIAVX512F-NEXT:    shrl $4, %edi
; GFNIAVX512F-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; GFNIAVX512F-NEXT:    orl %eax, %edi
; GFNIAVX512F-NEXT:    movl %edi, %eax
; GFNIAVX512F-NEXT:    andl $858993459, %eax # imm = 0x33333333
; GFNIAVX512F-NEXT:    shrl $2, %edi
; GFNIAVX512F-NEXT:    andl $858993459, %edi # imm = 0x33333333
; GFNIAVX512F-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX512F-NEXT:    movl %eax, %ecx
; GFNIAVX512F-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; GFNIAVX512F-NEXT:    shrl %eax
; GFNIAVX512F-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; GFNIAVX512F-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i32:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    # kill: def $edi killed $edi def $rdi
; GFNIAVX512BW-NEXT:    bswapl %edi
; GFNIAVX512BW-NEXT:    movl %edi, %eax
; GFNIAVX512BW-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX512BW-NEXT:    shll $4, %eax
; GFNIAVX512BW-NEXT:    shrl $4, %edi
; GFNIAVX512BW-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; GFNIAVX512BW-NEXT:    orl %eax, %edi
; GFNIAVX512BW-NEXT:    movl %edi, %eax
; GFNIAVX512BW-NEXT:    andl $858993459, %eax # imm = 0x33333333
; GFNIAVX512BW-NEXT:    shrl $2, %edi
; GFNIAVX512BW-NEXT:    andl $858993459, %edi # imm = 0x33333333
; GFNIAVX512BW-NEXT:    leal (%rdi,%rax,4), %eax
; GFNIAVX512BW-NEXT:    movl %eax, %ecx
; GFNIAVX512BW-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; GFNIAVX512BW-NEXT:    shrl %eax
; GFNIAVX512BW-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; GFNIAVX512BW-NEXT:    leal (%rax,%rcx,2), %eax
; GFNIAVX512BW-NEXT:    retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}

; Scalar i64 bit reversal. The configurations without XOP expect bswapq followed
; by 4-, 2- and 1-bit swaps whose 64-bit masks are loaded with movabsq. The XOP
; configurations expect a vmovq / vpperm / vmovq round trip.
define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    bswapq %rdi
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    shrq $4, %rax
; SSE-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; SSE-NEXT:    andq %rcx, %rax
; SSE-NEXT:    andq %rcx, %rdi
; SSE-NEXT:    shlq $4, %rdi
; SSE-NEXT:    orq %rax, %rdi
; SSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    andq %rax, %rcx
; SSE-NEXT:    shrq $2, %rdi
; SSE-NEXT:    andq %rax, %rdi
; SSE-NEXT:    leaq (%rdi,%rcx,4), %rax
; SSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE-NEXT:    movq %rax, %rdx
; SSE-NEXT:    andq %rcx, %rdx
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    andq %rcx, %rax
; SSE-NEXT:    leaq (%rax,%rdx,2), %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    bswapq %rdi
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    shrq $4, %rax
; AVX-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; AVX-NEXT:    andq %rcx, %rax
; AVX-NEXT:    andq %rcx, %rdi
; AVX-NEXT:    shlq $4, %rdi
; AVX-NEXT:    orq %rax, %rdi
; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    andq %rax, %rcx
; AVX-NEXT:    shrq $2, %rdi
; AVX-NEXT:    andq %rax, %rdi
; AVX-NEXT:    leaq (%rdi,%rcx,4), %rax
; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    andq %rcx, %rdx
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    andq %rcx, %rax
; AVX-NEXT:    leaq (%rax,%rdx,2), %rax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovq %xmm0, %rax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i64:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    bswapq %rdi
; GFNISSE-NEXT:    movq %rdi, %rax
; GFNISSE-NEXT:    shrq $4, %rax
; GFNISSE-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; GFNISSE-NEXT:    andq %rcx, %rax
; GFNISSE-NEXT:    andq %rcx, %rdi
; GFNISSE-NEXT:    shlq $4, %rdi
; GFNISSE-NEXT:    orq %rax, %rdi
; GFNISSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNISSE-NEXT:    movq %rdi, %rcx
; GFNISSE-NEXT:    andq %rax, %rcx
; GFNISSE-NEXT:    shrq $2, %rdi
; GFNISSE-NEXT:    andq %rax, %rdi
; GFNISSE-NEXT:    leaq (%rdi,%rcx,4), %rax
; GFNISSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNISSE-NEXT:    movq %rax, %rdx
; GFNISSE-NEXT:    andq %rcx, %rdx
; GFNISSE-NEXT:    shrq %rax
; GFNISSE-NEXT:    andq %rcx, %rax
; GFNISSE-NEXT:    leaq (%rax,%rdx,2), %rax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i64:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    bswapq %rdi
; GFNIAVX-NEXT:    movq %rdi, %rax
; GFNIAVX-NEXT:    shrq $4, %rax
; GFNIAVX-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX-NEXT:    andq %rcx, %rax
; GFNIAVX-NEXT:    andq %rcx, %rdi
; GFNIAVX-NEXT:    shlq $4, %rdi
; GFNIAVX-NEXT:    orq %rax, %rdi
; GFNIAVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNIAVX-NEXT:    movq %rdi, %rcx
; GFNIAVX-NEXT:    andq %rax, %rcx
; GFNIAVX-NEXT:    shrq $2, %rdi
; GFNIAVX-NEXT:    andq %rax, %rdi
; GFNIAVX-NEXT:    leaq (%rdi,%rcx,4), %rax
; GFNIAVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNIAVX-NEXT:    movq %rax, %rdx
; GFNIAVX-NEXT:    andq %rcx, %rdx
; GFNIAVX-NEXT:    shrq %rax
; GFNIAVX-NEXT:    andq %rcx, %rax
; GFNIAVX-NEXT:    leaq (%rax,%rdx,2), %rax
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_i64:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    bswapq %rdi
; GFNIAVX2-NEXT:    movq %rdi, %rax
; GFNIAVX2-NEXT:    shrq $4, %rax
; GFNIAVX2-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX2-NEXT:    andq %rcx, %rax
; GFNIAVX2-NEXT:    andq %rcx, %rdi
; GFNIAVX2-NEXT:    shlq $4, %rdi
; GFNIAVX2-NEXT:    orq %rax, %rdi
; GFNIAVX2-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNIAVX2-NEXT:    movq %rdi, %rcx
; GFNIAVX2-NEXT:    andq %rax, %rcx
; GFNIAVX2-NEXT:    shrq $2, %rdi
; GFNIAVX2-NEXT:    andq %rax, %rdi
; GFNIAVX2-NEXT:    leaq (%rdi,%rcx,4), %rax
; GFNIAVX2-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNIAVX2-NEXT:    movq %rax, %rdx
; GFNIAVX2-NEXT:    andq %rcx, %rdx
; GFNIAVX2-NEXT:    shrq %rax
; GFNIAVX2-NEXT:    andq %rcx, %rax
; GFNIAVX2-NEXT:    leaq (%rax,%rdx,2), %rax
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i64:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    bswapq %rdi
; GFNIAVX512F-NEXT:    movq %rdi, %rax
; GFNIAVX512F-NEXT:    shrq $4, %rax
; GFNIAVX512F-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX512F-NEXT:    andq %rcx, %rax
; GFNIAVX512F-NEXT:    andq %rcx, %rdi
; GFNIAVX512F-NEXT:    shlq $4, %rdi
; GFNIAVX512F-NEXT:    orq %rax, %rdi
; GFNIAVX512F-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNIAVX512F-NEXT:    movq %rdi, %rcx
; GFNIAVX512F-NEXT:    andq %rax, %rcx
; GFNIAVX512F-NEXT:    shrq $2, %rdi
; GFNIAVX512F-NEXT:    andq %rax, %rdi
; GFNIAVX512F-NEXT:    leaq (%rdi,%rcx,4), %rax
; GFNIAVX512F-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNIAVX512F-NEXT:    movq %rax, %rdx
; GFNIAVX512F-NEXT:    andq %rcx, %rdx
; GFNIAVX512F-NEXT:    shrq %rax
; GFNIAVX512F-NEXT:    andq %rcx, %rax
; GFNIAVX512F-NEXT:    leaq (%rax,%rdx,2), %rax
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i64:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    bswapq %rdi
; GFNIAVX512BW-NEXT:    movq %rdi, %rax
; GFNIAVX512BW-NEXT:    shrq $4, %rax
; GFNIAVX512BW-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX512BW-NEXT:    andq %rcx, %rax
; GFNIAVX512BW-NEXT:    andq %rcx, %rdi
; GFNIAVX512BW-NEXT:    shlq $4, %rdi
; GFNIAVX512BW-NEXT:    orq %rax, %rdi
; GFNIAVX512BW-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNIAVX512BW-NEXT:    movq %rdi, %rcx
; GFNIAVX512BW-NEXT:    andq %rax, %rcx
; GFNIAVX512BW-NEXT:    shrq $2, %rdi
; GFNIAVX512BW-NEXT:    andq %rax, %rdi
; GFNIAVX512BW-NEXT:    leaq (%rdi,%rcx,4), %rax
; GFNIAVX512BW-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNIAVX512BW-NEXT:    movq %rax, %rdx
; GFNIAVX512BW-NEXT:    andq %rcx, %rdx
; GFNIAVX512BW-NEXT:    shrq %rax
; GFNIAVX512BW-NEXT:    andq %rcx, %rax
; GFNIAVX512BW-NEXT:    leaq (%rax,%rdx,2), %rax
; GFNIAVX512BW-NEXT:    retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}

; <16 x i8> bit reversal. The sse2 run expects shift-and-mask swaps, the ssse3
; and AVX runs expect two pshufb nibble-table lookups ORed together, the XOP
; runs expect one vpperm, and the GFNI runs expect one (v)gf2p8affineqb.
define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v16i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v16i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v16i8:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v16i8:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v16i8:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX512BW-NEXT:    retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
  ret <16 x i8> %b
}

; <8 x i16> bit reversal. Apart from the XOP runs (a single vpperm), each
; expected sequence first swaps the two bytes of every 16-bit element (psrlw and
; psllw by 8 for sse2, a pshufb byte shuffle otherwise) and then repeats the
; per-byte sequence expected for test_bitreverse_v16i8.
define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v8i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v8i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v8i16:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v8i16:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v8i16:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX512BW-NEXT:    retq
  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  ret <8 x i16> %b
}

define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:
vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0 931; XOP-NEXT: retq 932; 933; GFNISSE-LABEL: test_bitreverse_v4i32: 934; GFNISSE: # %bb.0: 935; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 936; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 937; GFNISSE-NEXT: retq 938; 939; GFNIAVX-LABEL: test_bitreverse_v4i32: 940; GFNIAVX: # %bb.0: 941; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 942; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 943; GFNIAVX-NEXT: retq 944; 945; GFNIAVX2-LABEL: test_bitreverse_v4i32: 946; GFNIAVX2: # %bb.0: 947; GFNIAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 948; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 949; GFNIAVX2-NEXT: retq 950; 951; GFNIAVX512F-LABEL: test_bitreverse_v4i32: 952; GFNIAVX512F: # %bb.0: 953; GFNIAVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 954; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 955; GFNIAVX512F-NEXT: retq 956; 957; GFNIAVX512BW-LABEL: test_bitreverse_v4i32: 958; GFNIAVX512BW: # %bb.0: 959; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 960; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 961; GFNIAVX512BW-NEXT: retq 962 %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) 963 ret <4 x i32> %b 964} 965 966define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { 967; SSE2-LABEL: test_bitreverse_v2i64: 968; SSE2: # %bb.0: 969; SSE2-NEXT: pxor %xmm1, %xmm1 970; SSE2-NEXT: movdqa %xmm0, %xmm2 971; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 972; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 973; SSE2-NEXT: pshuflw 
{{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 974; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 975; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 976; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 977; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 978; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 979; SSE2-NEXT: packuswb %xmm2, %xmm0 980; SSE2-NEXT: movdqa %xmm0, %xmm1 981; SSE2-NEXT: psllw $4, %xmm1 982; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 983; SSE2-NEXT: psrlw $4, %xmm0 984; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 985; SSE2-NEXT: por %xmm1, %xmm0 986; SSE2-NEXT: movdqa %xmm0, %xmm1 987; SSE2-NEXT: psrlw $2, %xmm1 988; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 989; SSE2-NEXT: pand %xmm2, %xmm1 990; SSE2-NEXT: pand %xmm2, %xmm0 991; SSE2-NEXT: psllw $2, %xmm0 992; SSE2-NEXT: por %xmm1, %xmm0 993; SSE2-NEXT: movdqa %xmm0, %xmm1 994; SSE2-NEXT: psrlw $1, %xmm1 995; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 996; SSE2-NEXT: pand %xmm2, %xmm1 997; SSE2-NEXT: pand %xmm2, %xmm0 998; SSE2-NEXT: paddb %xmm0, %xmm0 999; SSE2-NEXT: por %xmm1, %xmm0 1000; SSE2-NEXT: retq 1001; 1002; SSSE3-LABEL: test_bitreverse_v2i64: 1003; SSSE3: # %bb.0: 1004; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1005; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1006; SSSE3-NEXT: movdqa %xmm0, %xmm2 1007; SSSE3-NEXT: pand %xmm1, %xmm2 1008; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1009; SSSE3-NEXT: pshufb %xmm2, %xmm3 1010; SSSE3-NEXT: psrlw $4, %xmm0 1011; SSSE3-NEXT: pand %xmm1, %xmm0 1012; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1013; SSSE3-NEXT: pshufb %xmm0, %xmm1 1014; 
SSSE3-NEXT: por %xmm3, %xmm1 1015; SSSE3-NEXT: movdqa %xmm1, %xmm0 1016; SSSE3-NEXT: retq 1017; 1018; AVX-LABEL: test_bitreverse_v2i64: 1019; AVX: # %bb.0: 1020; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1021; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1022; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 1023; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1024; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1025; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1026; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 1027; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1028; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 1029; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 1030; AVX-NEXT: retq 1031; 1032; XOP-LABEL: test_bitreverse_v2i64: 1033; XOP: # %bb.0: 1034; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0 1035; XOP-NEXT: retq 1036; 1037; GFNISSE-LABEL: test_bitreverse_v2i64: 1038; GFNISSE: # %bb.0: 1039; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1040; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1041; GFNISSE-NEXT: retq 1042; 1043; GFNIAVX-LABEL: test_bitreverse_v2i64: 1044; GFNIAVX: # %bb.0: 1045; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1046; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1047; GFNIAVX-NEXT: retq 1048; 1049; GFNIAVX2-LABEL: test_bitreverse_v2i64: 1050; GFNIAVX2: # %bb.0: 1051; GFNIAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1052; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1053; GFNIAVX2-NEXT: retq 1054; 1055; GFNIAVX512F-LABEL: test_bitreverse_v2i64: 1056; GFNIAVX512F: # %bb.0: 1057; GFNIAVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1058; GFNIAVX512F-NEXT: vgf2p8affineqb $0, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1059; GFNIAVX512F-NEXT: retq 1060; 1061; GFNIAVX512BW-LABEL: test_bitreverse_v2i64: 1062; GFNIAVX512BW: # %bb.0: 1063; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1064; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1065; GFNIAVX512BW-NEXT: retq 1066 %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) 1067 ret <2 x i64> %b 1068} 1069 1070define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind { 1071; SSE2-LABEL: test_bitreverse_v32i8: 1072; SSE2: # %bb.0: 1073; SSE2-NEXT: movdqa %xmm1, %xmm2 1074; SSE2-NEXT: movdqa %xmm0, %xmm3 1075; SSE2-NEXT: psllw $4, %xmm3 1076; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1077; SSE2-NEXT: movdqa %xmm1, %xmm4 1078; SSE2-NEXT: pandn %xmm3, %xmm4 1079; SSE2-NEXT: psrlw $4, %xmm0 1080; SSE2-NEXT: pand %xmm1, %xmm0 1081; SSE2-NEXT: por %xmm4, %xmm0 1082; SSE2-NEXT: movdqa %xmm0, %xmm4 1083; SSE2-NEXT: psrlw $2, %xmm4 1084; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1085; SSE2-NEXT: pand %xmm3, %xmm4 1086; SSE2-NEXT: pand %xmm3, %xmm0 1087; SSE2-NEXT: psllw $2, %xmm0 1088; SSE2-NEXT: por %xmm4, %xmm0 1089; SSE2-NEXT: movdqa %xmm0, %xmm5 1090; SSE2-NEXT: psrlw $1, %xmm5 1091; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1092; SSE2-NEXT: pand %xmm4, %xmm5 1093; SSE2-NEXT: pand %xmm4, %xmm0 1094; SSE2-NEXT: paddb %xmm0, %xmm0 1095; SSE2-NEXT: por %xmm5, %xmm0 1096; SSE2-NEXT: movdqa %xmm2, %xmm5 1097; SSE2-NEXT: psllw $4, %xmm5 1098; SSE2-NEXT: psrlw $4, %xmm2 1099; SSE2-NEXT: pand %xmm1, %xmm2 1100; SSE2-NEXT: pandn %xmm5, %xmm1 1101; SSE2-NEXT: por %xmm2, %xmm1 1102; SSE2-NEXT: movdqa %xmm1, %xmm2 1103; SSE2-NEXT: psrlw $2, %xmm2 1104; SSE2-NEXT: pand %xmm3, %xmm2 1105; SSE2-NEXT: pand %xmm3, %xmm1 1106; SSE2-NEXT: psllw $2, %xmm1 1107; SSE2-NEXT: por %xmm2, %xmm1 1108; SSE2-NEXT: 
movdqa %xmm1, %xmm2 1109; SSE2-NEXT: psrlw $1, %xmm2 1110; SSE2-NEXT: pand %xmm4, %xmm2 1111; SSE2-NEXT: pand %xmm4, %xmm1 1112; SSE2-NEXT: paddb %xmm1, %xmm1 1113; SSE2-NEXT: por %xmm2, %xmm1 1114; SSE2-NEXT: retq 1115; 1116; SSSE3-LABEL: test_bitreverse_v32i8: 1117; SSSE3: # %bb.0: 1118; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1119; SSSE3-NEXT: movdqa %xmm0, %xmm2 1120; SSSE3-NEXT: pand %xmm4, %xmm2 1121; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1122; SSSE3-NEXT: movdqa %xmm5, %xmm6 1123; SSSE3-NEXT: pshufb %xmm2, %xmm6 1124; SSSE3-NEXT: psrlw $4, %xmm0 1125; SSSE3-NEXT: pand %xmm4, %xmm0 1126; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1127; SSSE3-NEXT: movdqa %xmm2, %xmm3 1128; SSSE3-NEXT: pshufb %xmm0, %xmm3 1129; SSSE3-NEXT: por %xmm6, %xmm3 1130; SSSE3-NEXT: movdqa %xmm1, %xmm0 1131; SSSE3-NEXT: pand %xmm4, %xmm0 1132; SSSE3-NEXT: pshufb %xmm0, %xmm5 1133; SSSE3-NEXT: psrlw $4, %xmm1 1134; SSSE3-NEXT: pand %xmm4, %xmm1 1135; SSSE3-NEXT: pshufb %xmm1, %xmm2 1136; SSSE3-NEXT: por %xmm5, %xmm2 1137; SSSE3-NEXT: movdqa %xmm3, %xmm0 1138; SSSE3-NEXT: movdqa %xmm2, %xmm1 1139; SSSE3-NEXT: retq 1140; 1141; AVX1-LABEL: test_bitreverse_v32i8: 1142; AVX1: # %bb.0: 1143; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1144; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1145; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 1146; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1147; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 1148; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1149; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1150; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1151; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 1152; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 1153; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 1154; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 1155; AVX1-NEXT: vpsrlw 
$4, %xmm0, %xmm0 1156; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1157; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 1158; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 1159; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1160; AVX1-NEXT: retq 1161; 1162; AVX2-LABEL: test_bitreverse_v32i8: 1163; AVX2: # %bb.0: 1164; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1165; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1166; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1167; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1168; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1169; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1170; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1171; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1172; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1173; AVX2-NEXT: retq 1174; 1175; AVX512-LABEL: test_bitreverse_v32i8: 1176; AVX512: # %bb.0: 1177; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1178; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1179; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1180; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1181; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1182; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1183; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1184; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1185; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1186; AVX512-NEXT: retq 1187; 1188; XOPAVX1-LABEL: test_bitreverse_v32i8: 1189; XOPAVX1: # %bb.0: 1190; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1191; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1192; 
XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1193; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1194; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1195; XOPAVX1-NEXT: retq 1196; 1197; XOPAVX2-LABEL: test_bitreverse_v32i8: 1198; XOPAVX2: # %bb.0: 1199; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1200; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1201; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1202; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1203; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1204; XOPAVX2-NEXT: retq 1205; 1206; GFNISSE-LABEL: test_bitreverse_v32i8: 1207; GFNISSE: # %bb.0: 1208; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] 1209; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 1210; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 1211; GFNISSE-NEXT: retq 1212; 1213; GFNIAVX-LABEL: test_bitreverse_v32i8: 1214; GFNIAVX: # %bb.0: 1215; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1216; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] 1217; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm2, %xmm1, %xmm1 1218; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm2, %xmm0, %xmm0 1219; GFNIAVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1220; GFNIAVX-NEXT: retq 1221; 1222; GFNIAVX2-LABEL: test_bitreverse_v32i8: 1223; GFNIAVX2: # %bb.0: 1224; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1225; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1226; GFNIAVX2-NEXT: retq 1227; 1228; GFNIAVX512F-LABEL: test_bitreverse_v32i8: 1229; GFNIAVX512F: # %bb.0: 1230; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1231; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1232; GFNIAVX512F-NEXT: retq 1233; 1234; GFNIAVX512BW-LABEL: test_bitreverse_v32i8: 1235; GFNIAVX512BW: # %bb.0: 1236; GFNIAVX512BW-NEXT: 
vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1237; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1238; GFNIAVX512BW-NEXT: retq 1239 %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) 1240 ret <32 x i8> %b 1241} 1242 1243define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { 1244; SSE2-LABEL: test_bitreverse_v16i16: 1245; SSE2: # %bb.0: 1246; SSE2-NEXT: movdqa %xmm1, %xmm2 1247; SSE2-NEXT: movdqa %xmm0, %xmm1 1248; SSE2-NEXT: psrlw $8, %xmm1 1249; SSE2-NEXT: psllw $8, %xmm0 1250; SSE2-NEXT: por %xmm1, %xmm0 1251; SSE2-NEXT: movdqa %xmm0, %xmm3 1252; SSE2-NEXT: psllw $4, %xmm3 1253; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1254; SSE2-NEXT: movdqa %xmm1, %xmm4 1255; SSE2-NEXT: pandn %xmm3, %xmm4 1256; SSE2-NEXT: psrlw $4, %xmm0 1257; SSE2-NEXT: pand %xmm1, %xmm0 1258; SSE2-NEXT: por %xmm4, %xmm0 1259; SSE2-NEXT: movdqa %xmm0, %xmm4 1260; SSE2-NEXT: psrlw $2, %xmm4 1261; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1262; SSE2-NEXT: pand %xmm3, %xmm4 1263; SSE2-NEXT: pand %xmm3, %xmm0 1264; SSE2-NEXT: psllw $2, %xmm0 1265; SSE2-NEXT: por %xmm4, %xmm0 1266; SSE2-NEXT: movdqa %xmm0, %xmm5 1267; SSE2-NEXT: psrlw $1, %xmm5 1268; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1269; SSE2-NEXT: pand %xmm4, %xmm5 1270; SSE2-NEXT: pand %xmm4, %xmm0 1271; SSE2-NEXT: paddb %xmm0, %xmm0 1272; SSE2-NEXT: por %xmm5, %xmm0 1273; SSE2-NEXT: movdqa %xmm2, %xmm5 1274; SSE2-NEXT: psrlw $8, %xmm5 1275; SSE2-NEXT: psllw $8, %xmm2 1276; SSE2-NEXT: por %xmm5, %xmm2 1277; SSE2-NEXT: movdqa %xmm2, %xmm5 1278; SSE2-NEXT: psllw $4, %xmm5 1279; SSE2-NEXT: psrlw $4, %xmm2 1280; SSE2-NEXT: pand %xmm1, %xmm2 1281; SSE2-NEXT: pandn %xmm5, %xmm1 1282; SSE2-NEXT: por %xmm2, %xmm1 1283; SSE2-NEXT: movdqa %xmm1, %xmm2 1284; SSE2-NEXT: psrlw $2, %xmm2 1285; SSE2-NEXT: pand %xmm3, %xmm2 1286; 
SSE2-NEXT: pand %xmm3, %xmm1 1287; SSE2-NEXT: psllw $2, %xmm1 1288; SSE2-NEXT: por %xmm2, %xmm1 1289; SSE2-NEXT: movdqa %xmm1, %xmm2 1290; SSE2-NEXT: psrlw $1, %xmm2 1291; SSE2-NEXT: pand %xmm4, %xmm2 1292; SSE2-NEXT: pand %xmm4, %xmm1 1293; SSE2-NEXT: paddb %xmm1, %xmm1 1294; SSE2-NEXT: por %xmm2, %xmm1 1295; SSE2-NEXT: retq 1296; 1297; SSSE3-LABEL: test_bitreverse_v16i16: 1298; SSSE3: # %bb.0: 1299; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1300; SSSE3-NEXT: pshufb %xmm4, %xmm0 1301; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1302; SSSE3-NEXT: movdqa %xmm0, %xmm2 1303; SSSE3-NEXT: pand %xmm5, %xmm2 1304; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1305; SSSE3-NEXT: movdqa %xmm6, %xmm7 1306; SSSE3-NEXT: pshufb %xmm2, %xmm7 1307; SSSE3-NEXT: psrlw $4, %xmm0 1308; SSSE3-NEXT: pand %xmm5, %xmm0 1309; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1310; SSSE3-NEXT: movdqa %xmm2, %xmm3 1311; SSSE3-NEXT: pshufb %xmm0, %xmm3 1312; SSSE3-NEXT: por %xmm7, %xmm3 1313; SSSE3-NEXT: pshufb %xmm4, %xmm1 1314; SSSE3-NEXT: movdqa %xmm1, %xmm0 1315; SSSE3-NEXT: pand %xmm5, %xmm0 1316; SSSE3-NEXT: pshufb %xmm0, %xmm6 1317; SSSE3-NEXT: psrlw $4, %xmm1 1318; SSSE3-NEXT: pand %xmm5, %xmm1 1319; SSSE3-NEXT: pshufb %xmm1, %xmm2 1320; SSSE3-NEXT: por %xmm6, %xmm2 1321; SSSE3-NEXT: movdqa %xmm3, %xmm0 1322; SSSE3-NEXT: movdqa %xmm2, %xmm1 1323; SSSE3-NEXT: retq 1324; 1325; AVX1-LABEL: test_bitreverse_v16i16: 1326; AVX1: # %bb.0: 1327; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1328; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1329; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1330; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1331; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1332; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1333; 
AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1334; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1335; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1336; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1337; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1338; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1339; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1340; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1341; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1342; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1343; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1344; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1345; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1346; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1347; AVX1-NEXT: retq 1348; 1349; AVX2-LABEL: test_bitreverse_v16i16: 1350; AVX2: # %bb.0: 1351; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1352; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1353; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1354; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1355; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1356; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1357; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1358; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1359; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1360; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1361; AVX2-NEXT: retq 1362; 1363; AVX512-LABEL: test_bitreverse_v16i16: 1364; AVX512: # %bb.0: 1365; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1366; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1367; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1368; AVX512-NEXT: vmovdqa {{.*#+}} 
ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1369; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1370; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1371; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1372; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1373; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1374; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1375; AVX512-NEXT: retq 1376; 1377; XOPAVX1-LABEL: test_bitreverse_v16i16: 1378; XOPAVX1: # %bb.0: 1379; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1380; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1381; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1382; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1383; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1384; XOPAVX1-NEXT: retq 1385; 1386; XOPAVX2-LABEL: test_bitreverse_v16i16: 1387; XOPAVX2: # %bb.0: 1388; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1389; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1390; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1391; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1392; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1393; XOPAVX2-NEXT: retq 1394; 1395; GFNISSE-LABEL: test_bitreverse_v16i16: 1396; GFNISSE: # %bb.0: 1397; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1398; GFNISSE-NEXT: pshufb %xmm2, %xmm0 1399; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1400; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 1401; GFNISSE-NEXT: pshufb %xmm2, %xmm1 1402; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 1403; GFNISSE-NEXT: retq 1404; 1405; GFNIAVX-LABEL: test_bitreverse_v16i16: 1406; GFNIAVX: # %bb.0: 1407; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1408; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1409; GFNIAVX-NEXT: vpshufb 
%xmm2, %xmm1, %xmm1 1410; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1411; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1 1412; GFNIAVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1413; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0 1414; GFNIAVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1415; GFNIAVX-NEXT: retq 1416; 1417; GFNIAVX2-LABEL: test_bitreverse_v16i16: 1418; GFNIAVX2: # %bb.0: 1419; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1420; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1421; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1422; GFNIAVX2-NEXT: retq 1423; 1424; GFNIAVX512F-LABEL: test_bitreverse_v16i16: 1425; GFNIAVX512F: # %bb.0: 1426; GFNIAVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1427; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1428; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1429; GFNIAVX512F-NEXT: retq 1430; 1431; GFNIAVX512BW-LABEL: test_bitreverse_v16i16: 1432; GFNIAVX512BW: # %bb.0: 1433; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1434; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1435; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1436; GFNIAVX512BW-NEXT: retq 1437 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) 1438 ret <16 x i16> %b 1439} 1440 1441define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { 1442; SSE2-LABEL: test_bitreverse_v8i32: 1443; SSE2: # %bb.0: 1444; SSE2-NEXT: movdqa %xmm1, %xmm2 1445; SSE2-NEXT: pxor %xmm3, %xmm3 1446; 
SSE2-NEXT: movdqa %xmm0, %xmm1 1447; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 1448; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1449; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1450; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 1451; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1452; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1453; SSE2-NEXT: packuswb %xmm1, %xmm0 1454; SSE2-NEXT: movdqa %xmm0, %xmm4 1455; SSE2-NEXT: psllw $4, %xmm4 1456; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1457; SSE2-NEXT: movdqa %xmm1, %xmm5 1458; SSE2-NEXT: pandn %xmm4, %xmm5 1459; SSE2-NEXT: psrlw $4, %xmm0 1460; SSE2-NEXT: pand %xmm1, %xmm0 1461; SSE2-NEXT: por %xmm5, %xmm0 1462; SSE2-NEXT: movdqa %xmm0, %xmm5 1463; SSE2-NEXT: psrlw $2, %xmm5 1464; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1465; SSE2-NEXT: pand %xmm4, %xmm5 1466; SSE2-NEXT: pand %xmm4, %xmm0 1467; SSE2-NEXT: psllw $2, %xmm0 1468; SSE2-NEXT: por %xmm5, %xmm0 1469; SSE2-NEXT: movdqa %xmm0, %xmm6 1470; SSE2-NEXT: psrlw $1, %xmm6 1471; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1472; SSE2-NEXT: pand %xmm5, %xmm6 1473; SSE2-NEXT: pand %xmm5, %xmm0 1474; SSE2-NEXT: paddb %xmm0, %xmm0 1475; SSE2-NEXT: por %xmm6, %xmm0 1476; SSE2-NEXT: movdqa %xmm2, %xmm6 1477; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] 1478; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1479; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1480; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 1481; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1482; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1483; SSE2-NEXT: packuswb %xmm6, %xmm2 1484; SSE2-NEXT: movdqa %xmm2, %xmm3 1485; SSE2-NEXT: psllw $4, %xmm3 1486; SSE2-NEXT: psrlw $4, %xmm2 1487; SSE2-NEXT: pand %xmm1, %xmm2 1488; SSE2-NEXT: pandn %xmm3, %xmm1 1489; SSE2-NEXT: por %xmm2, %xmm1 1490; SSE2-NEXT: movdqa %xmm1, %xmm2 1491; SSE2-NEXT: psrlw $2, %xmm2 1492; SSE2-NEXT: pand %xmm4, %xmm2 1493; SSE2-NEXT: pand %xmm4, %xmm1 1494; SSE2-NEXT: psllw $2, %xmm1 1495; SSE2-NEXT: por %xmm2, %xmm1 1496; SSE2-NEXT: movdqa %xmm1, %xmm2 1497; SSE2-NEXT: psrlw $1, %xmm2 1498; SSE2-NEXT: pand %xmm5, %xmm2 1499; SSE2-NEXT: pand %xmm5, %xmm1 1500; SSE2-NEXT: paddb %xmm1, %xmm1 1501; SSE2-NEXT: por %xmm2, %xmm1 1502; SSE2-NEXT: retq 1503; 1504; SSSE3-LABEL: test_bitreverse_v8i32: 1505; SSSE3: # %bb.0: 1506; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1507; SSSE3-NEXT: pshufb %xmm4, %xmm0 1508; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1509; SSSE3-NEXT: movdqa %xmm0, %xmm2 1510; SSSE3-NEXT: pand %xmm5, %xmm2 1511; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1512; SSSE3-NEXT: movdqa %xmm6, %xmm7 1513; SSSE3-NEXT: pshufb %xmm2, %xmm7 1514; SSSE3-NEXT: psrlw $4, %xmm0 1515; SSSE3-NEXT: pand %xmm5, %xmm0 1516; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1517; SSSE3-NEXT: movdqa %xmm2, %xmm3 1518; SSSE3-NEXT: pshufb %xmm0, %xmm3 1519; SSSE3-NEXT: por %xmm7, %xmm3 1520; SSSE3-NEXT: pshufb %xmm4, %xmm1 1521; SSSE3-NEXT: movdqa %xmm1, %xmm0 1522; SSSE3-NEXT: pand %xmm5, %xmm0 1523; SSSE3-NEXT: pshufb %xmm0, %xmm6 1524; SSSE3-NEXT: psrlw $4, %xmm1 1525; SSSE3-NEXT: pand %xmm5, %xmm1 1526; SSSE3-NEXT: pshufb %xmm1, %xmm2 1527; 
SSSE3-NEXT: por %xmm6, %xmm2 1528; SSSE3-NEXT: movdqa %xmm3, %xmm0 1529; SSSE3-NEXT: movdqa %xmm2, %xmm1 1530; SSSE3-NEXT: retq 1531; 1532; AVX1-LABEL: test_bitreverse_v8i32: 1533; AVX1: # %bb.0: 1534; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1535; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1536; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1537; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1538; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1539; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1540; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1541; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1542; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1543; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1544; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1545; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1546; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1547; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1548; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1549; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1550; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1551; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1552; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1553; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1554; AVX1-NEXT: retq 1555; 1556; AVX2-LABEL: test_bitreverse_v8i32: 1557; AVX2: # %bb.0: 1558; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1559; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1560; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1561; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1562; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1563; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1564; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1565; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1566; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1567; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1568; AVX2-NEXT: retq 1569; 1570; AVX512-LABEL: test_bitreverse_v8i32: 1571; AVX512: # %bb.0: 1572; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1573; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1574; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1575; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1576; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1577; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1578; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1579; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1580; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1581; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1582; AVX512-NEXT: retq 1583; 1584; XOPAVX1-LABEL: test_bitreverse_v8i32: 1585; XOPAVX1: # %bb.0: 1586; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1587; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1588; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1589; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1590; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1591; XOPAVX1-NEXT: retq 1592; 1593; XOPAVX2-LABEL: test_bitreverse_v8i32: 1594; XOPAVX2: # %bb.0: 1595; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1596; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1597; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1598; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1599; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1600; XOPAVX2-NEXT: retq 1601; 1602; GFNISSE-LABEL: test_bitreverse_v8i32: 1603; GFNISSE: # %bb.0: 
1604; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1605; GFNISSE-NEXT: pshufb %xmm2, %xmm0 1606; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1607; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 1608; GFNISSE-NEXT: pshufb %xmm2, %xmm1 1609; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 1610; GFNISSE-NEXT: retq 1611; 1612; GFNIAVX-LABEL: test_bitreverse_v8i32: 1613; GFNIAVX: # %bb.0: 1614; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1615; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1616; GFNIAVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1617; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1618; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1 1619; GFNIAVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1620; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0 1621; GFNIAVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1622; GFNIAVX-NEXT: retq 1623; 1624; GFNIAVX2-LABEL: test_bitreverse_v8i32: 1625; GFNIAVX2: # %bb.0: 1626; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1627; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1628; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1629; GFNIAVX2-NEXT: retq 1630; 1631; GFNIAVX512F-LABEL: test_bitreverse_v8i32: 1632; GFNIAVX512F: # %bb.0: 1633; GFNIAVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1634; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1635; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1636; GFNIAVX512F-NEXT: retq 1637; 1638; GFNIAVX512BW-LABEL: test_bitreverse_v8i32: 1639; GFNIAVX512BW: # %bb.0: 1640; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1641; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1642; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1643; GFNIAVX512BW-NEXT: retq 1644 %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) 1645 ret <8 x i32> %b 1646} 1647 1648define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { 1649; SSE2-LABEL: test_bitreverse_v4i64: 1650; SSE2: # %bb.0: 1651; SSE2-NEXT: movdqa %xmm1, %xmm2 1652; SSE2-NEXT: pxor %xmm3, %xmm3 1653; SSE2-NEXT: movdqa %xmm0, %xmm1 1654; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 1655; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1656; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1657; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1658; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 1659; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1660; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1661; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1662; SSE2-NEXT: packuswb %xmm1, %xmm0 1663; SSE2-NEXT: movdqa %xmm0, %xmm4 1664; SSE2-NEXT: psllw $4, %xmm4 1665; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1666; SSE2-NEXT: movdqa %xmm1, %xmm5 1667; SSE2-NEXT: pandn %xmm4, %xmm5 1668; SSE2-NEXT: psrlw $4, %xmm0 1669; SSE2-NEXT: pand %xmm1, %xmm0 1670; SSE2-NEXT: por %xmm5, %xmm0 1671; SSE2-NEXT: movdqa %xmm0, %xmm5 1672; SSE2-NEXT: psrlw $2, %xmm5 1673; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1674; SSE2-NEXT: pand %xmm4, %xmm5 1675; SSE2-NEXT: pand %xmm4, %xmm0 1676; SSE2-NEXT: psllw 
$2, %xmm0 1677; SSE2-NEXT: por %xmm5, %xmm0 1678; SSE2-NEXT: movdqa %xmm0, %xmm6 1679; SSE2-NEXT: psrlw $1, %xmm6 1680; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1681; SSE2-NEXT: pand %xmm5, %xmm6 1682; SSE2-NEXT: pand %xmm5, %xmm0 1683; SSE2-NEXT: paddb %xmm0, %xmm0 1684; SSE2-NEXT: por %xmm6, %xmm0 1685; SSE2-NEXT: movdqa %xmm2, %xmm6 1686; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] 1687; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] 1688; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1689; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1690; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 1691; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 1692; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1693; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1694; SSE2-NEXT: packuswb %xmm6, %xmm2 1695; SSE2-NEXT: movdqa %xmm2, %xmm3 1696; SSE2-NEXT: psllw $4, %xmm3 1697; SSE2-NEXT: psrlw $4, %xmm2 1698; SSE2-NEXT: pand %xmm1, %xmm2 1699; SSE2-NEXT: pandn %xmm3, %xmm1 1700; SSE2-NEXT: por %xmm2, %xmm1 1701; SSE2-NEXT: movdqa %xmm1, %xmm2 1702; SSE2-NEXT: psrlw $2, %xmm2 1703; SSE2-NEXT: pand %xmm4, %xmm2 1704; SSE2-NEXT: pand %xmm4, %xmm1 1705; SSE2-NEXT: psllw $2, %xmm1 1706; SSE2-NEXT: por %xmm2, %xmm1 1707; SSE2-NEXT: movdqa %xmm1, %xmm2 1708; SSE2-NEXT: psrlw $1, %xmm2 1709; SSE2-NEXT: pand %xmm5, %xmm2 1710; SSE2-NEXT: pand %xmm5, %xmm1 1711; SSE2-NEXT: paddb %xmm1, %xmm1 1712; SSE2-NEXT: por %xmm2, %xmm1 1713; SSE2-NEXT: retq 1714; 1715; SSSE3-LABEL: test_bitreverse_v4i64: 1716; SSSE3: # %bb.0: 1717; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1718; SSSE3-NEXT: pshufb %xmm4, %xmm0 1719; SSSE3-NEXT: movdqa 
{{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1720; SSSE3-NEXT: movdqa %xmm0, %xmm2 1721; SSSE3-NEXT: pand %xmm5, %xmm2 1722; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1723; SSSE3-NEXT: movdqa %xmm6, %xmm7 1724; SSSE3-NEXT: pshufb %xmm2, %xmm7 1725; SSSE3-NEXT: psrlw $4, %xmm0 1726; SSSE3-NEXT: pand %xmm5, %xmm0 1727; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1728; SSSE3-NEXT: movdqa %xmm2, %xmm3 1729; SSSE3-NEXT: pshufb %xmm0, %xmm3 1730; SSSE3-NEXT: por %xmm7, %xmm3 1731; SSSE3-NEXT: pshufb %xmm4, %xmm1 1732; SSSE3-NEXT: movdqa %xmm1, %xmm0 1733; SSSE3-NEXT: pand %xmm5, %xmm0 1734; SSSE3-NEXT: pshufb %xmm0, %xmm6 1735; SSSE3-NEXT: psrlw $4, %xmm1 1736; SSSE3-NEXT: pand %xmm5, %xmm1 1737; SSSE3-NEXT: pshufb %xmm1, %xmm2 1738; SSSE3-NEXT: por %xmm6, %xmm2 1739; SSSE3-NEXT: movdqa %xmm3, %xmm0 1740; SSSE3-NEXT: movdqa %xmm2, %xmm1 1741; SSSE3-NEXT: retq 1742; 1743; AVX1-LABEL: test_bitreverse_v4i64: 1744; AVX1: # %bb.0: 1745; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1746; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1747; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1748; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1749; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1750; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1751; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1752; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1753; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1754; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1755; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1756; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1757; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1758; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1759; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1760; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1761; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1762; AVX1-NEXT: vpshufb %xmm0, %xmm6, 
%xmm0 1763; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1764; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1765; AVX1-NEXT: retq 1766; 1767; AVX2-LABEL: test_bitreverse_v4i64: 1768; AVX2: # %bb.0: 1769; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1770; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1771; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1772; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1773; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1774; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1775; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1776; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1777; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1778; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1779; AVX2-NEXT: retq 1780; 1781; AVX512-LABEL: test_bitreverse_v4i64: 1782; AVX512: # %bb.0: 1783; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1784; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1785; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1786; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1787; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1788; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1789; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1790; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1791; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1792; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1793; AVX512-NEXT: retq 1794; 1795; XOPAVX1-LABEL: test_bitreverse_v4i64: 1796; 
XOPAVX1: # %bb.0: 1797; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1798; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1799; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1800; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1801; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1802; XOPAVX1-NEXT: retq 1803; 1804; XOPAVX2-LABEL: test_bitreverse_v4i64: 1805; XOPAVX2: # %bb.0: 1806; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1807; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1808; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1809; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1810; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1811; XOPAVX2-NEXT: retq 1812; 1813; GFNISSE-LABEL: test_bitreverse_v4i64: 1814; GFNISSE: # %bb.0: 1815; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1816; GFNISSE-NEXT: pshufb %xmm2, %xmm0 1817; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1818; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 1819; GFNISSE-NEXT: pshufb %xmm2, %xmm1 1820; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 1821; GFNISSE-NEXT: retq 1822; 1823; GFNIAVX-LABEL: test_bitreverse_v4i64: 1824; GFNIAVX: # %bb.0: 1825; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1826; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1827; GFNIAVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1828; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1829; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1 1830; GFNIAVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1831; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0 1832; GFNIAVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1833; GFNIAVX-NEXT: retq 1834; 1835; GFNIAVX2-LABEL: test_bitreverse_v4i64: 1836; GFNIAVX2: # %bb.0: 1837; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1838; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1839; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1840; GFNIAVX2-NEXT: retq 1841; 1842; GFNIAVX512F-LABEL: test_bitreverse_v4i64: 1843; GFNIAVX512F: # %bb.0: 1844; GFNIAVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1845; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1846; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1847; GFNIAVX512F-NEXT: retq 1848; 1849; GFNIAVX512BW-LABEL: test_bitreverse_v4i64: 1850; GFNIAVX512BW: # %bb.0: 1851; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1852; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1853; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1854; GFNIAVX512BW-NEXT: retq 1855 %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) 1856 ret <4 x i64> %b 1857} 1858 1859define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { 1860; SSE2-LABEL: test_bitreverse_v64i8: 1861; SSE2: # %bb.0: 1862; SSE2-NEXT: movdqa %xmm3, %xmm4 1863; SSE2-NEXT: movdqa %xmm0, %xmm5 1864; SSE2-NEXT: psllw $4, %xmm5 1865; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1866; SSE2-NEXT: movdqa %xmm3, %xmm6 1867; SSE2-NEXT: pandn %xmm5, %xmm6 1868; SSE2-NEXT: psrlw $4, %xmm0 1869; SSE2-NEXT: pand %xmm3, %xmm0 1870; SSE2-NEXT: por %xmm6, %xmm0 1871; SSE2-NEXT: movdqa %xmm0, %xmm6 1872; SSE2-NEXT: psrlw $2, %xmm6 1873; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1874; SSE2-NEXT: pand 
%xmm8, %xmm6 1875; SSE2-NEXT: pand %xmm8, %xmm0 1876; SSE2-NEXT: psllw $2, %xmm0 1877; SSE2-NEXT: por %xmm6, %xmm0 1878; SSE2-NEXT: movdqa %xmm0, %xmm7 1879; SSE2-NEXT: psrlw $1, %xmm7 1880; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1881; SSE2-NEXT: pand %xmm6, %xmm7 1882; SSE2-NEXT: pand %xmm6, %xmm0 1883; SSE2-NEXT: paddb %xmm0, %xmm0 1884; SSE2-NEXT: por %xmm7, %xmm0 1885; SSE2-NEXT: movdqa %xmm1, %xmm7 1886; SSE2-NEXT: psllw $4, %xmm7 1887; SSE2-NEXT: movdqa %xmm3, %xmm5 1888; SSE2-NEXT: pandn %xmm7, %xmm5 1889; SSE2-NEXT: psrlw $4, %xmm1 1890; SSE2-NEXT: pand %xmm3, %xmm1 1891; SSE2-NEXT: por %xmm5, %xmm1 1892; SSE2-NEXT: movdqa %xmm1, %xmm5 1893; SSE2-NEXT: psrlw $2, %xmm5 1894; SSE2-NEXT: pand %xmm8, %xmm5 1895; SSE2-NEXT: pand %xmm8, %xmm1 1896; SSE2-NEXT: psllw $2, %xmm1 1897; SSE2-NEXT: por %xmm5, %xmm1 1898; SSE2-NEXT: movdqa %xmm1, %xmm5 1899; SSE2-NEXT: psrlw $1, %xmm5 1900; SSE2-NEXT: pand %xmm6, %xmm5 1901; SSE2-NEXT: pand %xmm6, %xmm1 1902; SSE2-NEXT: paddb %xmm1, %xmm1 1903; SSE2-NEXT: por %xmm5, %xmm1 1904; SSE2-NEXT: movdqa %xmm2, %xmm5 1905; SSE2-NEXT: psllw $4, %xmm5 1906; SSE2-NEXT: movdqa %xmm3, %xmm7 1907; SSE2-NEXT: pandn %xmm5, %xmm7 1908; SSE2-NEXT: psrlw $4, %xmm2 1909; SSE2-NEXT: pand %xmm3, %xmm2 1910; SSE2-NEXT: por %xmm7, %xmm2 1911; SSE2-NEXT: movdqa %xmm2, %xmm5 1912; SSE2-NEXT: psrlw $2, %xmm5 1913; SSE2-NEXT: pand %xmm8, %xmm5 1914; SSE2-NEXT: pand %xmm8, %xmm2 1915; SSE2-NEXT: psllw $2, %xmm2 1916; SSE2-NEXT: por %xmm5, %xmm2 1917; SSE2-NEXT: movdqa %xmm2, %xmm5 1918; SSE2-NEXT: psrlw $1, %xmm5 1919; SSE2-NEXT: pand %xmm6, %xmm5 1920; SSE2-NEXT: pand %xmm6, %xmm2 1921; SSE2-NEXT: paddb %xmm2, %xmm2 1922; SSE2-NEXT: por %xmm5, %xmm2 1923; SSE2-NEXT: movdqa %xmm4, %xmm5 1924; SSE2-NEXT: psllw $4, %xmm5 1925; SSE2-NEXT: psrlw $4, %xmm4 1926; SSE2-NEXT: pand %xmm3, %xmm4 1927; SSE2-NEXT: pandn %xmm5, %xmm3 1928; SSE2-NEXT: por %xmm4, %xmm3 1929; SSE2-NEXT: movdqa %xmm3, %xmm4 1930; 
SSE2-NEXT: psrlw $2, %xmm4 1931; SSE2-NEXT: pand %xmm8, %xmm4 1932; SSE2-NEXT: pand %xmm8, %xmm3 1933; SSE2-NEXT: psllw $2, %xmm3 1934; SSE2-NEXT: por %xmm4, %xmm3 1935; SSE2-NEXT: movdqa %xmm3, %xmm4 1936; SSE2-NEXT: psrlw $1, %xmm4 1937; SSE2-NEXT: pand %xmm6, %xmm4 1938; SSE2-NEXT: pand %xmm6, %xmm3 1939; SSE2-NEXT: paddb %xmm3, %xmm3 1940; SSE2-NEXT: por %xmm4, %xmm3 1941; SSE2-NEXT: retq 1942; 1943; SSSE3-LABEL: test_bitreverse_v64i8: 1944; SSSE3: # %bb.0: 1945; SSSE3-NEXT: movdqa %xmm0, %xmm5 1946; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1947; SSSE3-NEXT: pand %xmm8, %xmm0 1948; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1949; SSSE3-NEXT: movdqa %xmm9, %xmm6 1950; SSSE3-NEXT: pshufb %xmm0, %xmm6 1951; SSSE3-NEXT: psrlw $4, %xmm5 1952; SSSE3-NEXT: pand %xmm8, %xmm5 1953; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1954; SSSE3-NEXT: movdqa %xmm4, %xmm0 1955; SSSE3-NEXT: pshufb %xmm5, %xmm0 1956; SSSE3-NEXT: por %xmm6, %xmm0 1957; SSSE3-NEXT: movdqa %xmm1, %xmm5 1958; SSSE3-NEXT: pand %xmm8, %xmm5 1959; SSSE3-NEXT: movdqa %xmm9, %xmm6 1960; SSSE3-NEXT: pshufb %xmm5, %xmm6 1961; SSSE3-NEXT: psrlw $4, %xmm1 1962; SSSE3-NEXT: pand %xmm8, %xmm1 1963; SSSE3-NEXT: movdqa %xmm4, %xmm5 1964; SSSE3-NEXT: pshufb %xmm1, %xmm5 1965; SSSE3-NEXT: por %xmm6, %xmm5 1966; SSSE3-NEXT: movdqa %xmm2, %xmm1 1967; SSSE3-NEXT: pand %xmm8, %xmm1 1968; SSSE3-NEXT: movdqa %xmm9, %xmm7 1969; SSSE3-NEXT: pshufb %xmm1, %xmm7 1970; SSSE3-NEXT: psrlw $4, %xmm2 1971; SSSE3-NEXT: pand %xmm8, %xmm2 1972; SSSE3-NEXT: movdqa %xmm4, %xmm6 1973; SSSE3-NEXT: pshufb %xmm2, %xmm6 1974; SSSE3-NEXT: por %xmm7, %xmm6 1975; SSSE3-NEXT: movdqa %xmm3, %xmm1 1976; SSSE3-NEXT: pand %xmm8, %xmm1 1977; SSSE3-NEXT: pshufb %xmm1, %xmm9 1978; SSSE3-NEXT: psrlw $4, %xmm3 1979; SSSE3-NEXT: pand %xmm8, %xmm3 1980; SSSE3-NEXT: pshufb %xmm3, %xmm4 1981; SSSE3-NEXT: por %xmm9, %xmm4 1982; 
SSSE3-NEXT: movdqa %xmm5, %xmm1 1983; SSSE3-NEXT: movdqa %xmm6, %xmm2 1984; SSSE3-NEXT: movdqa %xmm4, %xmm3 1985; SSSE3-NEXT: retq 1986; 1987; AVX1-LABEL: test_bitreverse_v64i8: 1988; AVX1: # %bb.0: 1989; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1990; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1991; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 1992; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1993; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1994; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1995; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1996; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1997; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1998; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1999; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm4 2000; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2001; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2002; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 2003; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 2004; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 2005; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2006; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2007; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 2008; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2009; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2010; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 2011; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 2012; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 2013; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 2014; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2015; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2016; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 2017; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 2018; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 2019; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2020; AVX1-NEXT: retq 2021; 2022; AVX2-LABEL: test_bitreverse_v64i8: 2023; AVX2: # %bb.0: 2024; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2025; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 2026; AVX2-NEXT: 
vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2027; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2028; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2029; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2030; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2031; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 2032; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 2033; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 2034; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2035; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2036; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 2037; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1 2038; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 2039; AVX2-NEXT: retq 2040; 2041; AVX512F-LABEL: test_bitreverse_v64i8: 2042; AVX512F: # %bb.0: 2043; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2044; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2045; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 2046; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2047; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2048; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 2049; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm4 2050; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 2051; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2052; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 2053; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2054; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 2055; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2056; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 2057; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 2058; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2059; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0 2060; AVX512F-NEXT: retq 2061; 2062; AVX512BW-LABEL: 
test_bitreverse_v64i8: 2063; AVX512BW: # %bb.0: 2064; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2065; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2066; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2067; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2068; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2069; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2070; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2071; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2072; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2073; AVX512BW-NEXT: retq 2074; 2075; XOPAVX1-LABEL: test_bitreverse_v64i8: 2076; XOPAVX1: # %bb.0: 2077; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2078; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 2079; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2080; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2081; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2082; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2083; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2084; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2085; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2086; XOPAVX1-NEXT: retq 2087; 2088; XOPAVX2-LABEL: test_bitreverse_v64i8: 2089; XOPAVX2: # %bb.0: 2090; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2091; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 2092; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2093; XOPAVX2-NEXT: vpperm 
%xmm3, %xmm0, %xmm0, %xmm0 2094; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2095; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2096; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2097; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2098; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2099; XOPAVX2-NEXT: retq 2100; 2101; GFNISSE-LABEL: test_bitreverse_v64i8: 2102; GFNISSE: # %bb.0: 2103; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 2104; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 2105; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 2106; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 2107; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 2108; GFNISSE-NEXT: retq 2109; 2110; GFNIAVX-LABEL: test_bitreverse_v64i8: 2111; GFNIAVX: # %bb.0: 2112; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm2 2113; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 2114; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2 2115; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0 2116; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2117; GFNIAVX-NEXT: vextractf128 $1, %ymm1, %xmm2 2118; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2 2119; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1 2120; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2121; GFNIAVX-NEXT: retq 2122; 2123; GFNIAVX2-LABEL: test_bitreverse_v64i8: 2124; GFNIAVX2: # %bb.0: 2125; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2126; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 2127; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 2128; GFNIAVX2-NEXT: retq 2129; 2130; GFNIAVX512F-LABEL: test_bitreverse_v64i8: 2131; GFNIAVX512F: # %bb.0: 2132; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2133; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2134; GFNIAVX512F-NEXT: 
vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 2135; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 2136; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2137; GFNIAVX512F-NEXT: retq 2138; 2139; GFNIAVX512BW-LABEL: test_bitreverse_v64i8: 2140; GFNIAVX512BW: # %bb.0: 2141; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 2142; GFNIAVX512BW-NEXT: retq 2143 %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) 2144 ret <64 x i8> %b 2145} 2146 2147define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { 2148; SSE2-LABEL: test_bitreverse_v32i16: 2149; SSE2: # %bb.0: 2150; SSE2-NEXT: movdqa %xmm3, %xmm4 2151; SSE2-NEXT: movdqa %xmm0, %xmm3 2152; SSE2-NEXT: psrlw $8, %xmm3 2153; SSE2-NEXT: psllw $8, %xmm0 2154; SSE2-NEXT: por %xmm3, %xmm0 2155; SSE2-NEXT: movdqa %xmm0, %xmm5 2156; SSE2-NEXT: psllw $4, %xmm5 2157; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2158; SSE2-NEXT: movdqa %xmm3, %xmm6 2159; SSE2-NEXT: pandn %xmm5, %xmm6 2160; SSE2-NEXT: psrlw $4, %xmm0 2161; SSE2-NEXT: pand %xmm3, %xmm0 2162; SSE2-NEXT: por %xmm6, %xmm0 2163; SSE2-NEXT: movdqa %xmm0, %xmm6 2164; SSE2-NEXT: psrlw $2, %xmm6 2165; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2166; SSE2-NEXT: pand %xmm8, %xmm6 2167; SSE2-NEXT: pand %xmm8, %xmm0 2168; SSE2-NEXT: psllw $2, %xmm0 2169; SSE2-NEXT: por %xmm6, %xmm0 2170; SSE2-NEXT: movdqa %xmm0, %xmm7 2171; SSE2-NEXT: psrlw $1, %xmm7 2172; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2173; SSE2-NEXT: pand %xmm6, %xmm7 2174; SSE2-NEXT: pand %xmm6, %xmm0 2175; SSE2-NEXT: paddb %xmm0, %xmm0 2176; SSE2-NEXT: por %xmm7, %xmm0 2177; SSE2-NEXT: movdqa %xmm1, %xmm7 2178; SSE2-NEXT: psrlw $8, %xmm7 2179; SSE2-NEXT: psllw $8, %xmm1 2180; SSE2-NEXT: por %xmm7, %xmm1 2181; SSE2-NEXT: movdqa %xmm1, %xmm7 2182; SSE2-NEXT: psllw $4, %xmm7 2183; SSE2-NEXT: movdqa %xmm3, %xmm5 2184; SSE2-NEXT: 
pandn %xmm7, %xmm5 2185; SSE2-NEXT: psrlw $4, %xmm1 2186; SSE2-NEXT: pand %xmm3, %xmm1 2187; SSE2-NEXT: por %xmm5, %xmm1 2188; SSE2-NEXT: movdqa %xmm1, %xmm5 2189; SSE2-NEXT: psrlw $2, %xmm5 2190; SSE2-NEXT: pand %xmm8, %xmm5 2191; SSE2-NEXT: pand %xmm8, %xmm1 2192; SSE2-NEXT: psllw $2, %xmm1 2193; SSE2-NEXT: por %xmm5, %xmm1 2194; SSE2-NEXT: movdqa %xmm1, %xmm5 2195; SSE2-NEXT: psrlw $1, %xmm5 2196; SSE2-NEXT: pand %xmm6, %xmm5 2197; SSE2-NEXT: pand %xmm6, %xmm1 2198; SSE2-NEXT: paddb %xmm1, %xmm1 2199; SSE2-NEXT: por %xmm5, %xmm1 2200; SSE2-NEXT: movdqa %xmm2, %xmm5 2201; SSE2-NEXT: psrlw $8, %xmm5 2202; SSE2-NEXT: psllw $8, %xmm2 2203; SSE2-NEXT: por %xmm5, %xmm2 2204; SSE2-NEXT: movdqa %xmm2, %xmm5 2205; SSE2-NEXT: psllw $4, %xmm5 2206; SSE2-NEXT: movdqa %xmm3, %xmm7 2207; SSE2-NEXT: pandn %xmm5, %xmm7 2208; SSE2-NEXT: psrlw $4, %xmm2 2209; SSE2-NEXT: pand %xmm3, %xmm2 2210; SSE2-NEXT: por %xmm7, %xmm2 2211; SSE2-NEXT: movdqa %xmm2, %xmm5 2212; SSE2-NEXT: psrlw $2, %xmm5 2213; SSE2-NEXT: pand %xmm8, %xmm5 2214; SSE2-NEXT: pand %xmm8, %xmm2 2215; SSE2-NEXT: psllw $2, %xmm2 2216; SSE2-NEXT: por %xmm5, %xmm2 2217; SSE2-NEXT: movdqa %xmm2, %xmm5 2218; SSE2-NEXT: psrlw $1, %xmm5 2219; SSE2-NEXT: pand %xmm6, %xmm5 2220; SSE2-NEXT: pand %xmm6, %xmm2 2221; SSE2-NEXT: paddb %xmm2, %xmm2 2222; SSE2-NEXT: por %xmm5, %xmm2 2223; SSE2-NEXT: movdqa %xmm4, %xmm5 2224; SSE2-NEXT: psrlw $8, %xmm5 2225; SSE2-NEXT: psllw $8, %xmm4 2226; SSE2-NEXT: por %xmm5, %xmm4 2227; SSE2-NEXT: movdqa %xmm4, %xmm5 2228; SSE2-NEXT: psllw $4, %xmm5 2229; SSE2-NEXT: psrlw $4, %xmm4 2230; SSE2-NEXT: pand %xmm3, %xmm4 2231; SSE2-NEXT: pandn %xmm5, %xmm3 2232; SSE2-NEXT: por %xmm4, %xmm3 2233; SSE2-NEXT: movdqa %xmm3, %xmm4 2234; SSE2-NEXT: psrlw $2, %xmm4 2235; SSE2-NEXT: pand %xmm8, %xmm4 2236; SSE2-NEXT: pand %xmm8, %xmm3 2237; SSE2-NEXT: psllw $2, %xmm3 2238; SSE2-NEXT: por %xmm4, %xmm3 2239; SSE2-NEXT: movdqa %xmm3, %xmm4 2240; SSE2-NEXT: psrlw $1, %xmm4 2241; SSE2-NEXT: pand %xmm6, %xmm4 2242; 
SSE2-NEXT: pand %xmm6, %xmm3 2243; SSE2-NEXT: paddb %xmm3, %xmm3 2244; SSE2-NEXT: por %xmm4, %xmm3 2245; SSE2-NEXT: retq 2246; 2247; SSSE3-LABEL: test_bitreverse_v32i16: 2248; SSSE3: # %bb.0: 2249; SSSE3-NEXT: movdqa %xmm1, %xmm5 2250; SSSE3-NEXT: movdqa %xmm0, %xmm1 2251; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2252; SSSE3-NEXT: pshufb %xmm8, %xmm1 2253; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2254; SSSE3-NEXT: movdqa %xmm1, %xmm0 2255; SSSE3-NEXT: pand %xmm9, %xmm0 2256; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2257; SSSE3-NEXT: movdqa %xmm7, %xmm6 2258; SSSE3-NEXT: pshufb %xmm0, %xmm6 2259; SSSE3-NEXT: psrlw $4, %xmm1 2260; SSSE3-NEXT: pand %xmm9, %xmm1 2261; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2262; SSSE3-NEXT: movdqa %xmm4, %xmm0 2263; SSSE3-NEXT: pshufb %xmm1, %xmm0 2264; SSSE3-NEXT: por %xmm6, %xmm0 2265; SSSE3-NEXT: pshufb %xmm8, %xmm5 2266; SSSE3-NEXT: movdqa %xmm5, %xmm1 2267; SSSE3-NEXT: pand %xmm9, %xmm1 2268; SSSE3-NEXT: movdqa %xmm7, %xmm6 2269; SSSE3-NEXT: pshufb %xmm1, %xmm6 2270; SSSE3-NEXT: psrlw $4, %xmm5 2271; SSSE3-NEXT: pand %xmm9, %xmm5 2272; SSSE3-NEXT: movdqa %xmm4, %xmm1 2273; SSSE3-NEXT: pshufb %xmm5, %xmm1 2274; SSSE3-NEXT: por %xmm6, %xmm1 2275; SSSE3-NEXT: pshufb %xmm8, %xmm2 2276; SSSE3-NEXT: movdqa %xmm2, %xmm5 2277; SSSE3-NEXT: pand %xmm9, %xmm5 2278; SSSE3-NEXT: movdqa %xmm7, %xmm6 2279; SSSE3-NEXT: pshufb %xmm5, %xmm6 2280; SSSE3-NEXT: psrlw $4, %xmm2 2281; SSSE3-NEXT: pand %xmm9, %xmm2 2282; SSSE3-NEXT: movdqa %xmm4, %xmm5 2283; SSSE3-NEXT: pshufb %xmm2, %xmm5 2284; SSSE3-NEXT: por %xmm6, %xmm5 2285; SSSE3-NEXT: pshufb %xmm8, %xmm3 2286; SSSE3-NEXT: movdqa %xmm3, %xmm2 2287; SSSE3-NEXT: pand %xmm9, %xmm2 2288; SSSE3-NEXT: pshufb %xmm2, %xmm7 2289; SSSE3-NEXT: psrlw $4, %xmm3 2290; SSSE3-NEXT: pand %xmm9, %xmm3 2291; SSSE3-NEXT: pshufb %xmm3, %xmm4 2292; 
SSSE3-NEXT: por %xmm7, %xmm4 2293; SSSE3-NEXT: movdqa %xmm5, %xmm2 2294; SSSE3-NEXT: movdqa %xmm4, %xmm3 2295; SSSE3-NEXT: retq 2296; 2297; AVX1-LABEL: test_bitreverse_v32i16: 2298; AVX1: # %bb.0: 2299; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2300; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2301; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2302; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2303; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2304; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2305; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2306; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2307; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2308; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2309; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2310; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2311; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2312; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2313; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2314; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2315; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2316; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2317; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2318; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2319; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2320; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2321; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2322; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2323; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2324; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2325; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2326; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2327; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2328; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2329; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2330; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2331; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2332; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2333; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2334; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2335; AVX1-NEXT: retq 2336; 2337; 
AVX2-LABEL: test_bitreverse_v32i16: 2338; AVX2: # %bb.0: 2339; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2340; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2341; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2342; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2343; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2344; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2345; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2346; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2347; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2348; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2349; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2350; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2351; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2352; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2353; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2354; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2355; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2356; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2357; AVX2-NEXT: retq 2358; 2359; AVX512F-LABEL: test_bitreverse_v32i16: 2360; AVX512F: # %bb.0: 2361; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2362; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2363; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2364; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2365; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 2366; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2367; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2368; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2369; AVX512F-NEXT: vpand %ymm3, %ymm0, 
%ymm2 2370; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2371; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 2372; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2373; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 2374; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2375; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 2376; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2377; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 2378; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 2379; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2380; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 2381; AVX512F-NEXT: retq 2382; 2383; AVX512BW-LABEL: test_bitreverse_v32i16: 2384; AVX512BW: # %bb.0: 2385; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 2386; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2387; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2388; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2389; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2390; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2391; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2392; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2393; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2394; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2395; AVX512BW-NEXT: retq 2396; 2397; 
XOPAVX1-LABEL: test_bitreverse_v32i16: 2398; XOPAVX1: # %bb.0: 2399; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2400; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 2401; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2402; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2403; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2404; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2405; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2406; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2407; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2408; XOPAVX1-NEXT: retq 2409; 2410; XOPAVX2-LABEL: test_bitreverse_v32i16: 2411; XOPAVX2: # %bb.0: 2412; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2413; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 2414; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2415; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2416; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2417; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2418; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2419; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2420; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2421; XOPAVX2-NEXT: retq 2422; 2423; GFNISSE-LABEL: test_bitreverse_v32i16: 2424; GFNISSE: # %bb.0: 2425; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2426; GFNISSE-NEXT: pshufb %xmm4, %xmm0 2427; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] 2428; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 2429; GFNISSE-NEXT: pshufb %xmm4, %xmm1 2430; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 2431; GFNISSE-NEXT: pshufb %xmm4, %xmm2 2432; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 2433; GFNISSE-NEXT: pshufb %xmm4, %xmm3 2434; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 2435; GFNISSE-NEXT: retq 2436; 2437; GFNIAVX-LABEL: test_bitreverse_v32i16: 2438; GFNIAVX: # %bb.0: 2439; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm2 2440; 
GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2441; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2442; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 2443; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2444; GFNIAVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2445; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0 2446; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2447; GFNIAVX-NEXT: vextractf128 $1, %ymm1, %xmm2 2448; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2449; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2450; GFNIAVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2451; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1 2452; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2453; GFNIAVX-NEXT: retq 2454; 2455; GFNIAVX2-LABEL: test_bitreverse_v32i16: 2456; GFNIAVX2: # %bb.0: 2457; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2458; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2459; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2460; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2461; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2462; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2463; GFNIAVX2-NEXT: retq 2464; 2465; GFNIAVX512F-LABEL: test_bitreverse_v32i16: 2466; GFNIAVX512F: # %bb.0: 2467; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2468; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2469; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2470; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2471; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2472; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2473; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2474; 
GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2475; GFNIAVX512F-NEXT: retq 2476; 2477; GFNIAVX512BW-LABEL: test_bitreverse_v32i16: 2478; GFNIAVX512BW: # %bb.0: 2479; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 2480; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 2481; GFNIAVX512BW-NEXT: retq 2482 %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) 2483 ret <32 x i16> %b 2484} 2485; Lowering test for @llvm.bitreverse.v16i32 on a <16 x i32> argument, checked once per subtarget prefix below. SSE2 byte-swaps with unpack/pshuflw/pshufhw/packuswb and then swaps nibbles, bit pairs and adjacent bits with shifts and masks, SSSE3 and AVX variants use pshufb with nibble lookup tables, XOP uses vpperm, GFNI uses gf2p8affineqb after a byte-swapping pshufb. The assertions are autogenerated (see the NOTE at the top of the file), so regenerate them with utils/update_llc_test_checks.py instead of editing by hand. 2486define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { 2487; SSE2-LABEL: test_bitreverse_v16i32: 2488; SSE2: # %bb.0: 2489; SSE2-NEXT: movdqa %xmm3, %xmm4 2490; SSE2-NEXT: pxor %xmm8, %xmm8 2491; SSE2-NEXT: movdqa %xmm0, %xmm3 2492; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] 2493; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 2494; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 2495; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] 2496; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 2497; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 2498; SSE2-NEXT: packuswb %xmm3, %xmm0 2499; SSE2-NEXT: movdqa %xmm0, %xmm6 2500; SSE2-NEXT: psllw $4, %xmm6 2501; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2502; SSE2-NEXT: movdqa %xmm3, %xmm7 2503; SSE2-NEXT: pandn %xmm6, %xmm7 2504; SSE2-NEXT: psrlw $4, %xmm0 2505; SSE2-NEXT: pand %xmm3, %xmm0 2506; SSE2-NEXT: por %xmm7, %xmm0 2507; SSE2-NEXT: movdqa %xmm0, %xmm7 2508; SSE2-NEXT: psrlw $2, %xmm7 2509; SSE2-NEXT: movdqa {{.*#+}} xmm9 = 
[51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2510; SSE2-NEXT: pand %xmm9, %xmm7 2511; SSE2-NEXT: pand %xmm9, %xmm0 2512; SSE2-NEXT: psllw $2, %xmm0 2513; SSE2-NEXT: por %xmm7, %xmm0 2514; SSE2-NEXT: movdqa %xmm0, %xmm5 2515; SSE2-NEXT: psrlw $1, %xmm5 2516; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2517; SSE2-NEXT: pand %xmm7, %xmm5 2518; SSE2-NEXT: pand %xmm7, %xmm0 2519; SSE2-NEXT: paddb %xmm0, %xmm0 2520; SSE2-NEXT: por %xmm5, %xmm0 2521; SSE2-NEXT: movdqa %xmm1, %xmm5 2522; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] 2523; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] 2524; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] 2525; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 2526; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2527; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2528; SSE2-NEXT: packuswb %xmm5, %xmm1 2529; SSE2-NEXT: movdqa %xmm1, %xmm5 2530; SSE2-NEXT: psllw $4, %xmm5 2531; SSE2-NEXT: movdqa %xmm3, %xmm6 2532; SSE2-NEXT: pandn %xmm5, %xmm6 2533; SSE2-NEXT: psrlw $4, %xmm1 2534; SSE2-NEXT: pand %xmm3, %xmm1 2535; SSE2-NEXT: por %xmm6, %xmm1 2536; SSE2-NEXT: movdqa %xmm1, %xmm5 2537; SSE2-NEXT: psrlw $2, %xmm5 2538; SSE2-NEXT: pand %xmm9, %xmm5 2539; SSE2-NEXT: pand %xmm9, %xmm1 2540; SSE2-NEXT: psllw $2, %xmm1 2541; SSE2-NEXT: por %xmm5, %xmm1 2542; SSE2-NEXT: movdqa %xmm1, %xmm5 2543; SSE2-NEXT: psrlw $1, %xmm5 2544; SSE2-NEXT: pand %xmm7, %xmm5 2545; SSE2-NEXT: pand %xmm7, %xmm1 2546; SSE2-NEXT: paddb %xmm1, %xmm1 2547; SSE2-NEXT: por %xmm5, %xmm1 2548; SSE2-NEXT: movdqa %xmm2, %xmm5 2549; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = 
xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] 2550; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] 2551; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] 2552; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] 2553; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 2554; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 2555; SSE2-NEXT: packuswb %xmm5, %xmm2 2556; SSE2-NEXT: movdqa %xmm2, %xmm5 2557; SSE2-NEXT: psllw $4, %xmm5 2558; SSE2-NEXT: movdqa %xmm3, %xmm6 2559; SSE2-NEXT: pandn %xmm5, %xmm6 2560; SSE2-NEXT: psrlw $4, %xmm2 2561; SSE2-NEXT: pand %xmm3, %xmm2 2562; SSE2-NEXT: por %xmm6, %xmm2 2563; SSE2-NEXT: movdqa %xmm2, %xmm5 2564; SSE2-NEXT: psrlw $2, %xmm5 2565; SSE2-NEXT: pand %xmm9, %xmm5 2566; SSE2-NEXT: pand %xmm9, %xmm2 2567; SSE2-NEXT: psllw $2, %xmm2 2568; SSE2-NEXT: por %xmm5, %xmm2 2569; SSE2-NEXT: movdqa %xmm2, %xmm5 2570; SSE2-NEXT: psrlw $1, %xmm5 2571; SSE2-NEXT: pand %xmm7, %xmm5 2572; SSE2-NEXT: pand %xmm7, %xmm2 2573; SSE2-NEXT: paddb %xmm2, %xmm2 2574; SSE2-NEXT: por %xmm5, %xmm2 2575; SSE2-NEXT: movdqa %xmm4, %xmm5 2576; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] 2577; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] 2578; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] 2579; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] 2580; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2581; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2582; SSE2-NEXT: packuswb %xmm5, %xmm4 2583; SSE2-NEXT: movdqa %xmm4, %xmm5 2584; SSE2-NEXT: 
psllw $4, %xmm5 2585; SSE2-NEXT: psrlw $4, %xmm4 2586; SSE2-NEXT: pand %xmm3, %xmm4 2587; SSE2-NEXT: pandn %xmm5, %xmm3 2588; SSE2-NEXT: por %xmm4, %xmm3 2589; SSE2-NEXT: movdqa %xmm3, %xmm4 2590; SSE2-NEXT: psrlw $2, %xmm4 2591; SSE2-NEXT: pand %xmm9, %xmm4 2592; SSE2-NEXT: pand %xmm9, %xmm3 2593; SSE2-NEXT: psllw $2, %xmm3 2594; SSE2-NEXT: por %xmm4, %xmm3 2595; SSE2-NEXT: movdqa %xmm3, %xmm4 2596; SSE2-NEXT: psrlw $1, %xmm4 2597; SSE2-NEXT: pand %xmm7, %xmm4 2598; SSE2-NEXT: pand %xmm7, %xmm3 2599; SSE2-NEXT: paddb %xmm3, %xmm3 2600; SSE2-NEXT: por %xmm4, %xmm3 2601; SSE2-NEXT: retq 2602; 2603; SSSE3-LABEL: test_bitreverse_v16i32: 2604; SSSE3: # %bb.0: 2605; SSSE3-NEXT: movdqa %xmm1, %xmm5 2606; SSSE3-NEXT: movdqa %xmm0, %xmm1 2607; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2608; SSSE3-NEXT: pshufb %xmm8, %xmm1 2609; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2610; SSSE3-NEXT: movdqa %xmm1, %xmm0 2611; SSSE3-NEXT: pand %xmm9, %xmm0 2612; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2613; SSSE3-NEXT: movdqa %xmm7, %xmm6 2614; SSSE3-NEXT: pshufb %xmm0, %xmm6 2615; SSSE3-NEXT: psrlw $4, %xmm1 2616; SSSE3-NEXT: pand %xmm9, %xmm1 2617; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2618; SSSE3-NEXT: movdqa %xmm4, %xmm0 2619; SSSE3-NEXT: pshufb %xmm1, %xmm0 2620; SSSE3-NEXT: por %xmm6, %xmm0 2621; SSSE3-NEXT: pshufb %xmm8, %xmm5 2622; SSSE3-NEXT: movdqa %xmm5, %xmm1 2623; SSSE3-NEXT: pand %xmm9, %xmm1 2624; SSSE3-NEXT: movdqa %xmm7, %xmm6 2625; SSSE3-NEXT: pshufb %xmm1, %xmm6 2626; SSSE3-NEXT: psrlw $4, %xmm5 2627; SSSE3-NEXT: pand %xmm9, %xmm5 2628; SSSE3-NEXT: movdqa %xmm4, %xmm1 2629; SSSE3-NEXT: pshufb %xmm5, %xmm1 2630; SSSE3-NEXT: por %xmm6, %xmm1 2631; SSSE3-NEXT: pshufb %xmm8, %xmm2 2632; SSSE3-NEXT: movdqa %xmm2, %xmm5 2633; SSSE3-NEXT: pand %xmm9, %xmm5 2634; SSSE3-NEXT: movdqa %xmm7, %xmm6 2635; SSSE3-NEXT: 
pshufb %xmm5, %xmm6 2636; SSSE3-NEXT: psrlw $4, %xmm2 2637; SSSE3-NEXT: pand %xmm9, %xmm2 2638; SSSE3-NEXT: movdqa %xmm4, %xmm5 2639; SSSE3-NEXT: pshufb %xmm2, %xmm5 2640; SSSE3-NEXT: por %xmm6, %xmm5 2641; SSSE3-NEXT: pshufb %xmm8, %xmm3 2642; SSSE3-NEXT: movdqa %xmm3, %xmm2 2643; SSSE3-NEXT: pand %xmm9, %xmm2 2644; SSSE3-NEXT: pshufb %xmm2, %xmm7 2645; SSSE3-NEXT: psrlw $4, %xmm3 2646; SSSE3-NEXT: pand %xmm9, %xmm3 2647; SSSE3-NEXT: pshufb %xmm3, %xmm4 2648; SSSE3-NEXT: por %xmm7, %xmm4 2649; SSSE3-NEXT: movdqa %xmm5, %xmm2 2650; SSSE3-NEXT: movdqa %xmm4, %xmm3 2651; SSSE3-NEXT: retq 2652; 2653; AVX1-LABEL: test_bitreverse_v16i32: 2654; AVX1: # %bb.0: 2655; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2656; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2657; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2658; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2659; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2660; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2661; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2662; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2663; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2664; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2665; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2666; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2667; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2668; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2669; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2670; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2671; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2672; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2673; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2674; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2675; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2676; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2677; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2678; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2679; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2680; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2681; 
AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2682; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2683; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2684; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2685; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2686; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2687; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2688; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2689; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2690; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2691; AVX1-NEXT: retq 2692; 2693; AVX2-LABEL: test_bitreverse_v16i32: 2694; AVX2: # %bb.0: 2695; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2696; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2697; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2698; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2699; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2700; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2701; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2702; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2703; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2704; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2705; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2706; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2707; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2708; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2709; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2710; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2711; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2712; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2713; AVX2-NEXT: retq 2714; 2715; AVX512F-LABEL: test_bitreverse_v16i32: 2716; AVX512F: # %bb.0: 2717; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2718; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2719; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2720; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2721; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 2722; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2723; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2724; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2725; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 2726; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2727; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 2728; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2729; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 2730; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2731; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 2732; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2733; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 2734; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 2735; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2736; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 2737; AVX512F-NEXT: retq 2738; 2739; AVX512BW-LABEL: test_bitreverse_v16i32: 2740; AVX512BW: # %bb.0: 2741; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 2742; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2743; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2744; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2745; 
AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2746; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2747; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2748; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2749; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2750; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2751; AVX512BW-NEXT: retq 2752; 2753; XOPAVX1-LABEL: test_bitreverse_v16i32: 2754; XOPAVX1: # %bb.0: 2755; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2756; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2757; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2758; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2759; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2760; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2761; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2762; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2763; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2764; XOPAVX1-NEXT: retq 2765; 2766; XOPAVX2-LABEL: test_bitreverse_v16i32: 2767; XOPAVX2: # %bb.0: 2768; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2769; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2770; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2771; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2772; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2773; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2774; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2775; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2776; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2777; XOPAVX2-NEXT: retq 2778; 2779; GFNISSE-LABEL: test_bitreverse_v16i32: 2780; GFNISSE: # %bb.0: 2781; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2782; GFNISSE-NEXT: pshufb %xmm4, %xmm0 2783; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] 2784; 
GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 2785; GFNISSE-NEXT: pshufb %xmm4, %xmm1 2786; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 2787; GFNISSE-NEXT: pshufb %xmm4, %xmm2 2788; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 2789; GFNISSE-NEXT: pshufb %xmm4, %xmm3 2790; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 2791; GFNISSE-NEXT: retq 2792; 2793; GFNIAVX-LABEL: test_bitreverse_v16i32: 2794; GFNIAVX: # %bb.0: 2795; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm2 2796; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2797; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2798; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 2799; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2800; GFNIAVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2801; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0 2802; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2803; GFNIAVX-NEXT: vextractf128 $1, %ymm1, %xmm2 2804; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2805; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2806; GFNIAVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2807; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1 2808; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2809; GFNIAVX-NEXT: retq 2810; 2811; GFNIAVX2-LABEL: test_bitreverse_v16i32: 2812; GFNIAVX2: # %bb.0: 2813; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2814; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2815; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2816; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2817; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2818; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2819; GFNIAVX2-NEXT: retq 2820; 2821; GFNIAVX512F-LABEL: test_bitreverse_v16i32: 2822; GFNIAVX512F: # %bb.0: 2823; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2824; GFNIAVX512F-NEXT: vmovdqa 
{{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2825; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2826; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2827; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2828; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2829; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2830; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2831; GFNIAVX512F-NEXT: retq 2832; 2833; GFNIAVX512BW-LABEL: test_bitreverse_v16i32: 2834; GFNIAVX512BW: # %bb.0: 2835; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 2836; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 2837; GFNIAVX512BW-NEXT: retq 2838 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) 2839 ret <16 x i32> %b 2840} 2841 2842define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { 2843; SSE2-LABEL: test_bitreverse_v8i64: 2844; SSE2: # %bb.0: 2845; SSE2-NEXT: movdqa %xmm3, %xmm4 2846; SSE2-NEXT: pxor %xmm8, %xmm8 2847; SSE2-NEXT: movdqa %xmm0, %xmm3 2848; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] 2849; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 2850; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 2851; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 2852; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] 2853; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2854; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 2855; 
SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 2856; SSE2-NEXT: packuswb %xmm3, %xmm0 2857; SSE2-NEXT: movdqa %xmm0, %xmm6 2858; SSE2-NEXT: psllw $4, %xmm6 2859; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2860; SSE2-NEXT: movdqa %xmm3, %xmm7 2861; SSE2-NEXT: pandn %xmm6, %xmm7 2862; SSE2-NEXT: psrlw $4, %xmm0 2863; SSE2-NEXT: pand %xmm3, %xmm0 2864; SSE2-NEXT: por %xmm7, %xmm0 2865; SSE2-NEXT: movdqa %xmm0, %xmm7 2866; SSE2-NEXT: psrlw $2, %xmm7 2867; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2868; SSE2-NEXT: pand %xmm9, %xmm7 2869; SSE2-NEXT: pand %xmm9, %xmm0 2870; SSE2-NEXT: psllw $2, %xmm0 2871; SSE2-NEXT: por %xmm7, %xmm0 2872; SSE2-NEXT: movdqa %xmm0, %xmm5 2873; SSE2-NEXT: psrlw $1, %xmm5 2874; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2875; SSE2-NEXT: pand %xmm7, %xmm5 2876; SSE2-NEXT: pand %xmm7, %xmm0 2877; SSE2-NEXT: paddb %xmm0, %xmm0 2878; SSE2-NEXT: por %xmm5, %xmm0 2879; SSE2-NEXT: movdqa %xmm1, %xmm5 2880; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] 2881; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] 2882; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] 2883; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] 2884; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 2885; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2886; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2887; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2888; SSE2-NEXT: packuswb %xmm5, %xmm1 2889; SSE2-NEXT: movdqa %xmm1, %xmm5 2890; SSE2-NEXT: psllw $4, %xmm5 2891; SSE2-NEXT: movdqa %xmm3, %xmm6 2892; SSE2-NEXT: pandn %xmm5, %xmm6 2893; SSE2-NEXT: psrlw $4, %xmm1 2894; 
SSE2-NEXT: pand %xmm3, %xmm1 2895; SSE2-NEXT: por %xmm6, %xmm1 2896; SSE2-NEXT: movdqa %xmm1, %xmm5 2897; SSE2-NEXT: psrlw $2, %xmm5 2898; SSE2-NEXT: pand %xmm9, %xmm5 2899; SSE2-NEXT: pand %xmm9, %xmm1 2900; SSE2-NEXT: psllw $2, %xmm1 2901; SSE2-NEXT: por %xmm5, %xmm1 2902; SSE2-NEXT: movdqa %xmm1, %xmm5 2903; SSE2-NEXT: psrlw $1, %xmm5 2904; SSE2-NEXT: pand %xmm7, %xmm5 2905; SSE2-NEXT: pand %xmm7, %xmm1 2906; SSE2-NEXT: paddb %xmm1, %xmm1 2907; SSE2-NEXT: por %xmm5, %xmm1 2908; SSE2-NEXT: movdqa %xmm2, %xmm5 2909; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] 2910; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] 2911; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] 2912; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] 2913; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] 2914; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 2915; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 2916; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 2917; SSE2-NEXT: packuswb %xmm5, %xmm2 2918; SSE2-NEXT: movdqa %xmm2, %xmm5 2919; SSE2-NEXT: psllw $4, %xmm5 2920; SSE2-NEXT: movdqa %xmm3, %xmm6 2921; SSE2-NEXT: pandn %xmm5, %xmm6 2922; SSE2-NEXT: psrlw $4, %xmm2 2923; SSE2-NEXT: pand %xmm3, %xmm2 2924; SSE2-NEXT: por %xmm6, %xmm2 2925; SSE2-NEXT: movdqa %xmm2, %xmm5 2926; SSE2-NEXT: psrlw $2, %xmm5 2927; SSE2-NEXT: pand %xmm9, %xmm5 2928; SSE2-NEXT: pand %xmm9, %xmm2 2929; SSE2-NEXT: psllw $2, %xmm2 2930; SSE2-NEXT: por %xmm5, %xmm2 2931; SSE2-NEXT: movdqa %xmm2, %xmm5 2932; SSE2-NEXT: psrlw $1, %xmm5 2933; SSE2-NEXT: pand %xmm7, %xmm5 2934; SSE2-NEXT: pand %xmm7, %xmm2 2935; SSE2-NEXT: paddb %xmm2, %xmm2 2936; SSE2-NEXT: por %xmm5, %xmm2 2937; SSE2-NEXT: movdqa %xmm4, %xmm5 2938; SSE2-NEXT: punpckhbw 
{{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] 2939; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] 2940; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] 2941; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] 2942; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] 2943; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 2944; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2945; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2946; SSE2-NEXT: packuswb %xmm5, %xmm4 2947; SSE2-NEXT: movdqa %xmm4, %xmm5 2948; SSE2-NEXT: psllw $4, %xmm5 2949; SSE2-NEXT: psrlw $4, %xmm4 2950; SSE2-NEXT: pand %xmm3, %xmm4 2951; SSE2-NEXT: pandn %xmm5, %xmm3 2952; SSE2-NEXT: por %xmm4, %xmm3 2953; SSE2-NEXT: movdqa %xmm3, %xmm4 2954; SSE2-NEXT: psrlw $2, %xmm4 2955; SSE2-NEXT: pand %xmm9, %xmm4 2956; SSE2-NEXT: pand %xmm9, %xmm3 2957; SSE2-NEXT: psllw $2, %xmm3 2958; SSE2-NEXT: por %xmm4, %xmm3 2959; SSE2-NEXT: movdqa %xmm3, %xmm4 2960; SSE2-NEXT: psrlw $1, %xmm4 2961; SSE2-NEXT: pand %xmm7, %xmm4 2962; SSE2-NEXT: pand %xmm7, %xmm3 2963; SSE2-NEXT: paddb %xmm3, %xmm3 2964; SSE2-NEXT: por %xmm4, %xmm3 2965; SSE2-NEXT: retq 2966; 2967; SSSE3-LABEL: test_bitreverse_v8i64: 2968; SSSE3: # %bb.0: 2969; SSSE3-NEXT: movdqa %xmm1, %xmm5 2970; SSSE3-NEXT: movdqa %xmm0, %xmm1 2971; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2972; SSSE3-NEXT: pshufb %xmm8, %xmm1 2973; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2974; SSSE3-NEXT: movdqa %xmm1, %xmm0 2975; SSSE3-NEXT: pand %xmm9, %xmm0 2976; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2977; SSSE3-NEXT: movdqa %xmm7, %xmm6 2978; SSSE3-NEXT: pshufb %xmm0, %xmm6 2979; SSSE3-NEXT: 
psrlw $4, %xmm1 2980; SSSE3-NEXT: pand %xmm9, %xmm1 2981; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2982; SSSE3-NEXT: movdqa %xmm4, %xmm0 2983; SSSE3-NEXT: pshufb %xmm1, %xmm0 2984; SSSE3-NEXT: por %xmm6, %xmm0 2985; SSSE3-NEXT: pshufb %xmm8, %xmm5 2986; SSSE3-NEXT: movdqa %xmm5, %xmm1 2987; SSSE3-NEXT: pand %xmm9, %xmm1 2988; SSSE3-NEXT: movdqa %xmm7, %xmm6 2989; SSSE3-NEXT: pshufb %xmm1, %xmm6 2990; SSSE3-NEXT: psrlw $4, %xmm5 2991; SSSE3-NEXT: pand %xmm9, %xmm5 2992; SSSE3-NEXT: movdqa %xmm4, %xmm1 2993; SSSE3-NEXT: pshufb %xmm5, %xmm1 2994; SSSE3-NEXT: por %xmm6, %xmm1 2995; SSSE3-NEXT: pshufb %xmm8, %xmm2 2996; SSSE3-NEXT: movdqa %xmm2, %xmm5 2997; SSSE3-NEXT: pand %xmm9, %xmm5 2998; SSSE3-NEXT: movdqa %xmm7, %xmm6 2999; SSSE3-NEXT: pshufb %xmm5, %xmm6 3000; SSSE3-NEXT: psrlw $4, %xmm2 3001; SSSE3-NEXT: pand %xmm9, %xmm2 3002; SSSE3-NEXT: movdqa %xmm4, %xmm5 3003; SSSE3-NEXT: pshufb %xmm2, %xmm5 3004; SSSE3-NEXT: por %xmm6, %xmm5 3005; SSSE3-NEXT: pshufb %xmm8, %xmm3 3006; SSSE3-NEXT: movdqa %xmm3, %xmm2 3007; SSSE3-NEXT: pand %xmm9, %xmm2 3008; SSSE3-NEXT: pshufb %xmm2, %xmm7 3009; SSSE3-NEXT: psrlw $4, %xmm3 3010; SSSE3-NEXT: pand %xmm9, %xmm3 3011; SSSE3-NEXT: pshufb %xmm3, %xmm4 3012; SSSE3-NEXT: por %xmm7, %xmm4 3013; SSSE3-NEXT: movdqa %xmm5, %xmm2 3014; SSSE3-NEXT: movdqa %xmm4, %xmm3 3015; SSSE3-NEXT: retq 3016; 3017; AVX1-LABEL: test_bitreverse_v8i64: 3018; AVX1: # %bb.0: 3019; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3020; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3021; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3022; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3023; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 3024; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3025; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3026; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 3027; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 3028; AVX1-NEXT: 
vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3029; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 3030; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 3031; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 3032; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 3033; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3034; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 3035; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 3036; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 3037; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 3038; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3039; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3040; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3041; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 3042; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3043; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 3044; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 3045; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 3046; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 3047; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3048; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 3049; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 3050; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 3051; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 3052; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 3053; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 3054; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3055; AVX1-NEXT: retq 3056; 3057; AVX2-LABEL: test_bitreverse_v8i64: 3058; AVX2: # %bb.0: 3059; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3060; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3061; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3062; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 3063; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3064; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 3065; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 3066; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 3067; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = 
[0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3068; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 3069; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 3070; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3071; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 3072; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 3073; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 3074; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 3075; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 3076; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 3077; AVX2-NEXT: retq 3078; 3079; AVX512F-LABEL: test_bitreverse_v8i64: 3080; AVX512F: # %bb.0: 3081; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 3082; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3083; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3084; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3085; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 3086; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3087; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 3088; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3089; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 3090; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 3091; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 3092; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 3093; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 3094; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3095; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 3096; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 3097; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 3098; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 3099; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 3100; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 3101; AVX512F-NEXT: retq 3102; 3103; AVX512BW-LABEL: test_bitreverse_v8i64: 3104; AVX512BW: # %bb.0: 3105; 
AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] 3106; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3107; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 3108; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3109; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 3110; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 3111; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 3112; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3113; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 3114; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 3115; AVX512BW-NEXT: retq 3116; 3117; XOPAVX1-LABEL: test_bitreverse_v8i64: 3118; XOPAVX1: # %bb.0: 3119; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3120; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 3121; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3122; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 3123; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3124; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3125; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3126; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 3127; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3128; XOPAVX1-NEXT: retq 3129; 3130; XOPAVX2-LABEL: test_bitreverse_v8i64: 3131; XOPAVX2: # %bb.0: 3132; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 
3133; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 3134; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3135; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 3136; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 3137; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 3138; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3139; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 3140; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 3141; XOPAVX2-NEXT: retq 3142; 3143; GFNISSE-LABEL: test_bitreverse_v8i64: 3144; GFNISSE: # %bb.0: 3145; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3146; GFNISSE-NEXT: pshufb %xmm4, %xmm0 3147; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] 3148; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 3149; GFNISSE-NEXT: pshufb %xmm4, %xmm1 3150; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 3151; GFNISSE-NEXT: pshufb %xmm4, %xmm2 3152; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 3153; GFNISSE-NEXT: pshufb %xmm4, %xmm3 3154; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 3155; GFNISSE-NEXT: retq 3156; 3157; GFNIAVX-LABEL: test_bitreverse_v8i64: 3158; GFNIAVX: # %bb.0: 3159; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm2 3160; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3161; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3162; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 3163; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 3164; GFNIAVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 3165; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0 3166; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3167; GFNIAVX-NEXT: vextractf128 $1, %ymm1, %xmm2 3168; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3169; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 3170; GFNIAVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3171; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1 3172; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, 
%ymm1, %ymm1 3173; GFNIAVX-NEXT: retq 3174; 3175; GFNIAVX2-LABEL: test_bitreverse_v8i64: 3176; GFNIAVX2: # %bb.0: 3177; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3178; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3179; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 3180; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 3181; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3182; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 3183; GFNIAVX2-NEXT: retq 3184; 3185; GFNIAVX512F-LABEL: test_bitreverse_v8i64: 3186; GFNIAVX512F: # %bb.0: 3187; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 3188; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3189; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3190; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 3191; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 3192; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3193; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 3194; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 3195; GFNIAVX512F-NEXT: retq 3196; 3197; GFNIAVX512BW-LABEL: test_bitreverse_v8i64: 3198; GFNIAVX512BW: # %bb.0: 3199; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] 3200; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 3201; GFNIAVX512BW-NEXT: retq 3202 %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) 3203 ret <8 x i64> %b 3204} 3205 3206; 3207; Constant Folding 3208; 3209 3210define i32 @fold_bitreverse_i32() nounwind { 3211; ALL-LABEL: fold_bitreverse_i32: 
; ALL:       # %bb.0:
; ALL-NEXT:    movl $16711935, %eax # imm = 0xFF00FF
; ALL-NEXT:    retq
  ; 4278255360 is 0xFF00FF00; reversing its 32 bits gives 0x00FF00FF (16711935),
  ; which is the immediate the expected code above loads into eax.
  %b = call i32 @llvm.bitreverse.i32(i32 4278255360)
  ret i32 %b
}

; Constant vector input at i8 lane width. Each lane is reversed on its own,
; e.g. 2 -> 64 and -3 (0xFD) -> 191 (0xBF). In every checked configuration the
; expected code is a single constant load into xmm0 followed by the return.
define <16 x i8> @fold_bitreverse_v16i8() nounwind {
; SSE-LABEL: fold_bitreverse_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_bitreverse_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; AVX-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: fold_bitreverse_v16i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: fold_bitreverse_v16i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: fold_bitreverse_v16i8:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: fold_bitreverse_v16i8:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: fold_bitreverse_v16i8:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; GFNIAVX512BW-NEXT:    retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
  ret <16 x i8> %b
}

; Same input pattern at i16 lane width (2 -> 16384, -3 -> 49151). The 256-bit
; result is expected as two xmm constant loads (xmm0, xmm1) in the
; configurations whose checks use only xmm registers, and as one ymm0 constant
; load in the others.
define <16 x i16> @fold_bitreverse_v16i16() nounwind {
; SSE-LABEL: fold_bitreverse_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_bitreverse_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; AVX-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: fold_bitreverse_v16i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
; GFNISSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: fold_bitreverse_v16i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: fold_bitreverse_v16i16:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: fold_bitreverse_v16i16:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: fold_bitreverse_v16i16:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; GFNIAVX512BW-NEXT:    retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
  ret <16 x i16> %b
}

; Same input pattern at i32 lane width (2 -> 1073741824, -3 -> 3221225471).
; The 512-bit result is expected as four xmm, two ymm, or one zmm constant
; load, depending on the widest register the checked configuration uses.
define <16 x i32> @fold_bitreverse_v16i32() nounwind {
; SSE-LABEL: fold_bitreverse_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
; SSE-NEXT:    retq
;
; AVX1-LABEL: fold_bitreverse_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: fold_bitreverse_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX2-NEXT:    retq
;
; AVX512-LABEL: fold_bitreverse_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX512-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; XOP-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: fold_bitreverse_v16i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
; GFNISSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
; GFNISSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
; GFNISSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: fold_bitreverse_v16i32:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; GFNIAVX-NEXT:    retq
;
; GFNIAVX2-LABEL: fold_bitreverse_v16i32:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: fold_bitreverse_v16i32:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: fold_bitreverse_v16i32:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; GFNIAVX512BW-NEXT:    retq
  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
  ret <16 x i32> %b
}

; Declarations of the llvm.bitreverse overloads, grouped as scalar integers,
; then 128-bit, 256-bit and 512-bit vector types.
declare i8 @llvm.bitreverse.i8(i8) readnone
declare i16 @llvm.bitreverse.i16(i16) readnone
declare i32 @llvm.bitreverse.i32(i32) readnone
declare i64 @llvm.bitreverse.i64(i64) readnone

declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone

declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone

declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone