1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNISSE 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX 12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX2 13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512F 14; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512BW 15 16; Make sure we don't crash with avx512bw and xop 17; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw 18 19define i8 @test_bitreverse_i8(i8 %a) nounwind { 20; SSE-LABEL: test_bitreverse_i8: 21; SSE: # %bb.0: 22; SSE-NEXT: rolb $4, %dil 23; SSE-NEXT: movl %edi, %eax 24; SSE-NEXT: andb $51, %al 25; SSE-NEXT: shlb $2, %al 26; SSE-NEXT: shrb $2, %dil 27; SSE-NEXT: andb $51, %dil 28; SSE-NEXT: orb %dil, %al 29; SSE-NEXT: movl %eax, %ecx 30; SSE-NEXT: andb $85, %cl 31; SSE-NEXT: addb %cl, %cl 32; SSE-NEXT: shrb %al 33; SSE-NEXT: andb $85, %al 34; SSE-NEXT: orb %cl, %al 35; SSE-NEXT: retq 36; 37; AVX-LABEL: test_bitreverse_i8: 38; AVX: # %bb.0: 39; AVX-NEXT: rolb $4, %dil 40; AVX-NEXT: movl %edi, %eax 41; AVX-NEXT: andb $51, %al 42; AVX-NEXT: shlb $2, %al 43; AVX-NEXT: shrb $2, %dil 44; AVX-NEXT: andb $51, %dil 45; AVX-NEXT: orb %dil, %al 46; AVX-NEXT: movl %eax, %ecx 47; AVX-NEXT: andb $85, %cl 48; AVX-NEXT: addb %cl, %cl 49; AVX-NEXT: shrb %al 50; AVX-NEXT: andb $85, %al 51; AVX-NEXT: orb %cl, %al 52; AVX-NEXT: retq 53; 54; XOP-LABEL: test_bitreverse_i8: 55; XOP: # %bb.0: 56; XOP-NEXT: vmovd %edi, %xmm0 57; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0 58; XOP-NEXT: vmovd %xmm0, %eax 59; XOP-NEXT: # kill: def $al killed $al killed $eax 60; XOP-NEXT: retq 61; 62; GFNISSE-LABEL: test_bitreverse_i8: 63; GFNISSE: # %bb.0: 64; GFNISSE-NEXT: rolb $4, %dil 65; GFNISSE-NEXT: movl %edi, %eax 66; GFNISSE-NEXT: andb $51, %al 67; GFNISSE-NEXT: shlb $2, %al 68; GFNISSE-NEXT: shrb $2, %dil 69; GFNISSE-NEXT: andb $51, %dil 70; GFNISSE-NEXT: orb %dil, %al 71; GFNISSE-NEXT: movl %eax, %ecx 72; GFNISSE-NEXT: andb 
$85, %cl 73; GFNISSE-NEXT: addb %cl, %cl 74; GFNISSE-NEXT: shrb %al 75; GFNISSE-NEXT: andb $85, %al 76; GFNISSE-NEXT: orb %cl, %al 77; GFNISSE-NEXT: retq 78; 79; GFNIAVX-LABEL: test_bitreverse_i8: 80; GFNIAVX: # %bb.0: 81; GFNIAVX-NEXT: rolb $4, %dil 82; GFNIAVX-NEXT: movl %edi, %eax 83; GFNIAVX-NEXT: andb $51, %al 84; GFNIAVX-NEXT: shlb $2, %al 85; GFNIAVX-NEXT: shrb $2, %dil 86; GFNIAVX-NEXT: andb $51, %dil 87; GFNIAVX-NEXT: orb %dil, %al 88; GFNIAVX-NEXT: movl %eax, %ecx 89; GFNIAVX-NEXT: andb $85, %cl 90; GFNIAVX-NEXT: addb %cl, %cl 91; GFNIAVX-NEXT: shrb %al 92; GFNIAVX-NEXT: andb $85, %al 93; GFNIAVX-NEXT: orb %cl, %al 94; GFNIAVX-NEXT: retq 95; 96; GFNIAVX2-LABEL: test_bitreverse_i8: 97; GFNIAVX2: # %bb.0: 98; GFNIAVX2-NEXT: rolb $4, %dil 99; GFNIAVX2-NEXT: movl %edi, %eax 100; GFNIAVX2-NEXT: andb $51, %al 101; GFNIAVX2-NEXT: shlb $2, %al 102; GFNIAVX2-NEXT: shrb $2, %dil 103; GFNIAVX2-NEXT: andb $51, %dil 104; GFNIAVX2-NEXT: orb %dil, %al 105; GFNIAVX2-NEXT: movl %eax, %ecx 106; GFNIAVX2-NEXT: andb $85, %cl 107; GFNIAVX2-NEXT: addb %cl, %cl 108; GFNIAVX2-NEXT: shrb %al 109; GFNIAVX2-NEXT: andb $85, %al 110; GFNIAVX2-NEXT: orb %cl, %al 111; GFNIAVX2-NEXT: retq 112; 113; GFNIAVX512F-LABEL: test_bitreverse_i8: 114; GFNIAVX512F: # %bb.0: 115; GFNIAVX512F-NEXT: rolb $4, %dil 116; GFNIAVX512F-NEXT: movl %edi, %eax 117; GFNIAVX512F-NEXT: andb $51, %al 118; GFNIAVX512F-NEXT: shlb $2, %al 119; GFNIAVX512F-NEXT: shrb $2, %dil 120; GFNIAVX512F-NEXT: andb $51, %dil 121; GFNIAVX512F-NEXT: orb %dil, %al 122; GFNIAVX512F-NEXT: movl %eax, %ecx 123; GFNIAVX512F-NEXT: andb $85, %cl 124; GFNIAVX512F-NEXT: addb %cl, %cl 125; GFNIAVX512F-NEXT: shrb %al 126; GFNIAVX512F-NEXT: andb $85, %al 127; GFNIAVX512F-NEXT: orb %cl, %al 128; GFNIAVX512F-NEXT: retq 129; 130; GFNIAVX512BW-LABEL: test_bitreverse_i8: 131; GFNIAVX512BW: # %bb.0: 132; GFNIAVX512BW-NEXT: rolb $4, %dil 133; GFNIAVX512BW-NEXT: movl %edi, %eax 134; GFNIAVX512BW-NEXT: andb $51, %al 135; GFNIAVX512BW-NEXT: shlb $2, %al 136; GFNIAVX512BW-NEXT: shrb $2, %dil 137; GFNIAVX512BW-NEXT: andb $51, %dil 138; GFNIAVX512BW-NEXT: orb %dil, %al 139; GFNIAVX512BW-NEXT: movl %eax, %ecx 140; GFNIAVX512BW-NEXT: andb $85, %cl 141; GFNIAVX512BW-NEXT: addb %cl, %cl 142; GFNIAVX512BW-NEXT: shrb %al 143; GFNIAVX512BW-NEXT: andb $85, %al 144; GFNIAVX512BW-NEXT: orb %cl, %al 145; GFNIAVX512BW-NEXT: retq 146 %b = call i8 @llvm.bitreverse.i8(i8 %a) 147 ret i8 %b 148} 149 150define i16 @test_bitreverse_i16(i16 %a) nounwind { 151; SSE-LABEL: test_bitreverse_i16: 152; SSE: # %bb.0: 153; SSE-NEXT: # kill: def $edi killed $edi def $rdi 154; SSE-NEXT: rolw $8, %di 155; SSE-NEXT: movl %edi, %eax 156; SSE-NEXT: andl $3855, %eax # imm = 0xF0F 157; SSE-NEXT: shll $4, %eax 158; SSE-NEXT: shrl $4, %edi 159; SSE-NEXT: andl $3855, %edi # imm = 0xF0F 160; SSE-NEXT: orl %eax, %edi 161; SSE-NEXT: movl %edi, %eax 162; SSE-NEXT: andl $13107, %eax # imm = 0x3333 163; SSE-NEXT: shrl $2, %edi 164; SSE-NEXT: andl $13107, %edi # imm = 0x3333 165; SSE-NEXT: leal (%rdi,%rax,4), %eax 166; SSE-NEXT: movl %eax, %ecx 167; SSE-NEXT: andl $21845, %ecx # imm = 0x5555 168; SSE-NEXT: shrl %eax 169; SSE-NEXT: andl $21845, %eax # imm = 0x5555 170; SSE-NEXT: leal (%rax,%rcx,2), %eax 171; SSE-NEXT: # kill: def $ax killed $ax killed $eax 172; SSE-NEXT: retq 173; 174; AVX-LABEL: test_bitreverse_i16: 175; AVX: # %bb.0: 176; AVX-NEXT: # kill: def $edi killed $edi def $rdi 177; AVX-NEXT: rolw $8, %di 178; AVX-NEXT: movl %edi, %eax 179; AVX-NEXT: andl $3855, %eax # imm = 0xF0F 180; AVX-NEXT: shll $4, %eax 181; 
AVX-NEXT: shrl $4, %edi 182; AVX-NEXT: andl $3855, %edi # imm = 0xF0F 183; AVX-NEXT: orl %eax, %edi 184; AVX-NEXT: movl %edi, %eax 185; AVX-NEXT: andl $13107, %eax # imm = 0x3333 186; AVX-NEXT: shrl $2, %edi 187; AVX-NEXT: andl $13107, %edi # imm = 0x3333 188; AVX-NEXT: leal (%rdi,%rax,4), %eax 189; AVX-NEXT: movl %eax, %ecx 190; AVX-NEXT: andl $21845, %ecx # imm = 0x5555 191; AVX-NEXT: shrl %eax 192; AVX-NEXT: andl $21845, %eax # imm = 0x5555 193; AVX-NEXT: leal (%rax,%rcx,2), %eax 194; AVX-NEXT: # kill: def $ax killed $ax killed $eax 195; AVX-NEXT: retq 196; 197; XOP-LABEL: test_bitreverse_i16: 198; XOP: # %bb.0: 199; XOP-NEXT: vmovd %edi, %xmm0 200; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0 201; XOP-NEXT: vmovd %xmm0, %eax 202; XOP-NEXT: # kill: def $ax killed $ax killed $eax 203; XOP-NEXT: retq 204; 205; GFNISSE-LABEL: test_bitreverse_i16: 206; GFNISSE: # %bb.0: 207; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi 208; GFNISSE-NEXT: rolw $8, %di 209; GFNISSE-NEXT: movl %edi, %eax 210; GFNISSE-NEXT: andl $3855, %eax # imm = 0xF0F 211; GFNISSE-NEXT: shll $4, %eax 212; GFNISSE-NEXT: shrl $4, %edi 213; GFNISSE-NEXT: andl $3855, %edi # imm = 0xF0F 214; GFNISSE-NEXT: orl %eax, %edi 215; GFNISSE-NEXT: movl %edi, %eax 216; GFNISSE-NEXT: andl $13107, %eax # imm = 0x3333 217; GFNISSE-NEXT: shrl $2, %edi 218; GFNISSE-NEXT: andl $13107, %edi # imm = 0x3333 219; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax 220; GFNISSE-NEXT: movl %eax, %ecx 221; GFNISSE-NEXT: andl $21845, %ecx # imm = 0x5555 222; GFNISSE-NEXT: shrl %eax 223; GFNISSE-NEXT: andl $21845, %eax # imm = 0x5555 224; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax 225; GFNISSE-NEXT: # kill: def $ax killed $ax killed $eax 226; GFNISSE-NEXT: retq 227; 228; GFNIAVX-LABEL: test_bitreverse_i16: 229; GFNIAVX: # %bb.0: 230; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi 231; GFNIAVX-NEXT: rolw $8, %di 232; GFNIAVX-NEXT: movl %edi, %eax 233; GFNIAVX-NEXT: andl $3855, %eax # imm = 0xF0F 234; GFNIAVX-NEXT: shll $4, %eax 235; GFNIAVX-NEXT: shrl $4, %edi 236; GFNIAVX-NEXT: andl $3855, %edi # imm = 0xF0F 237; GFNIAVX-NEXT: orl %eax, %edi 238; GFNIAVX-NEXT: movl %edi, %eax 239; GFNIAVX-NEXT: andl $13107, %eax # imm = 0x3333 240; GFNIAVX-NEXT: shrl $2, %edi 241; GFNIAVX-NEXT: andl $13107, %edi # imm = 0x3333 242; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax 243; GFNIAVX-NEXT: movl %eax, %ecx 244; GFNIAVX-NEXT: andl $21845, %ecx # imm = 0x5555 245; GFNIAVX-NEXT: shrl %eax 246; GFNIAVX-NEXT: andl $21845, %eax # imm = 0x5555 247; GFNIAVX-NEXT: leal (%rax,%rcx,2), %eax 248; GFNIAVX-NEXT: # kill: def $ax killed $ax killed $eax 249; GFNIAVX-NEXT: retq 250; 251; GFNIAVX2-LABEL: test_bitreverse_i16: 252; GFNIAVX2: # %bb.0: 253; GFNIAVX2-NEXT: # kill: def $edi killed $edi def $rdi 254; GFNIAVX2-NEXT: rolw $8, %di 255; GFNIAVX2-NEXT: movl %edi, %eax 256; GFNIAVX2-NEXT: andl $3855, %eax # imm = 0xF0F 257; GFNIAVX2-NEXT: shll $4, %eax 258; GFNIAVX2-NEXT: shrl $4, %edi 259; GFNIAVX2-NEXT: andl $3855, %edi # imm = 0xF0F 260; GFNIAVX2-NEXT: orl %eax, %edi 261; GFNIAVX2-NEXT: movl %edi, %eax 262; GFNIAVX2-NEXT: andl $13107, %eax # imm = 0x3333 263; GFNIAVX2-NEXT: shrl $2, %edi 264; GFNIAVX2-NEXT: andl $13107, %edi # imm = 0x3333 265; GFNIAVX2-NEXT: leal (%rdi,%rax,4), %eax 266; GFNIAVX2-NEXT: movl %eax, %ecx 267; GFNIAVX2-NEXT: andl $21845, %ecx # imm = 0x5555 268; GFNIAVX2-NEXT: shrl %eax 269; GFNIAVX2-NEXT: andl $21845, %eax # imm = 0x5555 270; GFNIAVX2-NEXT: leal (%rax,%rcx,2), %eax 271; GFNIAVX2-NEXT: # kill: def $ax killed $ax killed $eax 272; 
GFNIAVX2-NEXT: retq 273; 274; GFNIAVX512F-LABEL: test_bitreverse_i16: 275; GFNIAVX512F: # %bb.0: 276; GFNIAVX512F-NEXT: # kill: def $edi killed $edi def $rdi 277; GFNIAVX512F-NEXT: rolw $8, %di 278; GFNIAVX512F-NEXT: movl %edi, %eax 279; GFNIAVX512F-NEXT: andl $3855, %eax # imm = 0xF0F 280; GFNIAVX512F-NEXT: shll $4, %eax 281; GFNIAVX512F-NEXT: shrl $4, %edi 282; GFNIAVX512F-NEXT: andl $3855, %edi # imm = 0xF0F 283; GFNIAVX512F-NEXT: orl %eax, %edi 284; GFNIAVX512F-NEXT: movl %edi, %eax 285; GFNIAVX512F-NEXT: andl $13107, %eax # imm = 0x3333 286; GFNIAVX512F-NEXT: shrl $2, %edi 287; GFNIAVX512F-NEXT: andl $13107, %edi # imm = 0x3333 288; GFNIAVX512F-NEXT: leal (%rdi,%rax,4), %eax 289; GFNIAVX512F-NEXT: movl %eax, %ecx 290; GFNIAVX512F-NEXT: andl $21845, %ecx # imm = 0x5555 291; GFNIAVX512F-NEXT: shrl %eax 292; GFNIAVX512F-NEXT: andl $21845, %eax # imm = 0x5555 293; GFNIAVX512F-NEXT: leal (%rax,%rcx,2), %eax 294; GFNIAVX512F-NEXT: # kill: def $ax killed $ax killed $eax 295; GFNIAVX512F-NEXT: retq 296; 297; GFNIAVX512BW-LABEL: test_bitreverse_i16: 298; GFNIAVX512BW: # %bb.0: 299; GFNIAVX512BW-NEXT: # kill: def $edi killed $edi def $rdi 300; GFNIAVX512BW-NEXT: rolw $8, %di 301; GFNIAVX512BW-NEXT: movl %edi, %eax 302; GFNIAVX512BW-NEXT: andl $3855, %eax # imm = 0xF0F 303; GFNIAVX512BW-NEXT: shll $4, %eax 304; GFNIAVX512BW-NEXT: shrl $4, %edi 305; GFNIAVX512BW-NEXT: andl $3855, %edi # imm = 0xF0F 306; GFNIAVX512BW-NEXT: orl %eax, %edi 307; GFNIAVX512BW-NEXT: movl %edi, %eax 308; GFNIAVX512BW-NEXT: andl $13107, %eax # imm = 0x3333 309; GFNIAVX512BW-NEXT: shrl $2, %edi 310; GFNIAVX512BW-NEXT: andl $13107, %edi # imm = 0x3333 311; GFNIAVX512BW-NEXT: leal (%rdi,%rax,4), %eax 312; GFNIAVX512BW-NEXT: movl %eax, %ecx 313; GFNIAVX512BW-NEXT: andl $21845, %ecx # imm = 0x5555 314; GFNIAVX512BW-NEXT: shrl %eax 315; GFNIAVX512BW-NEXT: andl $21845, %eax # imm = 0x5555 316; GFNIAVX512BW-NEXT: leal (%rax,%rcx,2), %eax 317; GFNIAVX512BW-NEXT: # kill: def $ax killed $ax killed $eax 318; GFNIAVX512BW-NEXT: retq 319 %b = call i16 @llvm.bitreverse.i16(i16 %a) 320 ret i16 %b 321} 322 323define i32 @test_bitreverse_i32(i32 %a) nounwind { 324; SSE-LABEL: test_bitreverse_i32: 325; SSE: # %bb.0: 326; SSE-NEXT: # kill: def $edi killed $edi def $rdi 327; SSE-NEXT: bswapl %edi 328; SSE-NEXT: movl %edi, %eax 329; SSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F 330; SSE-NEXT: shll $4, %eax 331; SSE-NEXT: shrl $4, %edi 332; SSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F 333; SSE-NEXT: orl %eax, %edi 334; SSE-NEXT: movl %edi, %eax 335; SSE-NEXT: andl $858993459, %eax # imm = 0x33333333 336; SSE-NEXT: shrl $2, %edi 337; SSE-NEXT: andl $858993459, %edi # imm = 0x33333333 338; SSE-NEXT: leal (%rdi,%rax,4), %eax 339; SSE-NEXT: movl %eax, %ecx 340; SSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555 341; SSE-NEXT: shrl %eax 342; SSE-NEXT: andl $1431655765, %eax # imm = 0x55555555 343; SSE-NEXT: leal (%rax,%rcx,2), %eax 344; SSE-NEXT: retq 345; 346; AVX-LABEL: test_bitreverse_i32: 347; AVX: # %bb.0: 348; AVX-NEXT: # kill: def $edi killed $edi def $rdi 349; AVX-NEXT: bswapl %edi 350; AVX-NEXT: movl %edi, %eax 351; AVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F 352; AVX-NEXT: shll $4, %eax 353; AVX-NEXT: shrl $4, %edi 354; AVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F 355; AVX-NEXT: orl %eax, %edi 356; AVX-NEXT: movl %edi, %eax 357; AVX-NEXT: andl $858993459, %eax # imm = 0x33333333 358; AVX-NEXT: shrl $2, %edi 359; AVX-NEXT: andl $858993459, %edi # imm = 0x33333333 360; AVX-NEXT: leal (%rdi,%rax,4), %eax 361; AVX-NEXT: 
movl %eax, %ecx 362; AVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555 363; AVX-NEXT: shrl %eax 364; AVX-NEXT: andl $1431655765, %eax # imm = 0x55555555 365; AVX-NEXT: leal (%rax,%rcx,2), %eax 366; AVX-NEXT: retq 367; 368; XOP-LABEL: test_bitreverse_i32: 369; XOP: # %bb.0: 370; XOP-NEXT: vmovd %edi, %xmm0 371; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0 372; XOP-NEXT: vmovd %xmm0, %eax 373; XOP-NEXT: retq 374; 375; GFNISSE-LABEL: test_bitreverse_i32: 376; GFNISSE: # %bb.0: 377; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi 378; GFNISSE-NEXT: bswapl %edi 379; GFNISSE-NEXT: movl %edi, %eax 380; GFNISSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F 381; GFNISSE-NEXT: shll $4, %eax 382; GFNISSE-NEXT: shrl $4, %edi 383; GFNISSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F 384; GFNISSE-NEXT: orl %eax, %edi 385; GFNISSE-NEXT: movl %edi, %eax 386; GFNISSE-NEXT: andl $858993459, %eax # imm = 0x33333333 387; GFNISSE-NEXT: shrl $2, %edi 388; GFNISSE-NEXT: andl $858993459, %edi # imm = 0x33333333 389; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax 390; GFNISSE-NEXT: movl %eax, %ecx 391; GFNISSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555 392; GFNISSE-NEXT: shrl %eax 393; GFNISSE-NEXT: andl $1431655765, %eax # imm = 0x55555555 394; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax 395; GFNISSE-NEXT: retq 396; 397; GFNIAVX-LABEL: test_bitreverse_i32: 398; GFNIAVX: # %bb.0: 399; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi 400; GFNIAVX-NEXT: bswapl %edi 401; GFNIAVX-NEXT: movl %edi, %eax 402; GFNIAVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F 403; GFNIAVX-NEXT: shll $4, %eax 404; GFNIAVX-NEXT: shrl $4, %edi 405; GFNIAVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F 406; GFNIAVX-NEXT: orl %eax, %edi 407; GFNIAVX-NEXT: movl %edi, %eax 408; GFNIAVX-NEXT: andl $858993459, %eax # imm = 0x33333333 409; GFNIAVX-NEXT: shrl $2, %edi 410; GFNIAVX-NEXT: andl $858993459, %edi # imm = 0x33333333 411; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax 412; GFNIAVX-NEXT: movl %eax, %ecx 413; GFNIAVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555 414; GFNIAVX-NEXT: shrl %eax 415; GFNIAVX-NEXT: andl $1431655765, %eax # imm = 0x55555555 416; GFNIAVX-NEXT: leal (%rax,%rcx,2), %eax 417; GFNIAVX-NEXT: retq 418; 419; GFNIAVX2-LABEL: test_bitreverse_i32: 420; GFNIAVX2: # %bb.0: 421; GFNIAVX2-NEXT: # kill: def $edi killed $edi def $rdi 422; GFNIAVX2-NEXT: bswapl %edi 423; GFNIAVX2-NEXT: movl %edi, %eax 424; GFNIAVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F 425; GFNIAVX2-NEXT: shll $4, %eax 426; GFNIAVX2-NEXT: shrl $4, %edi 427; GFNIAVX2-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F 428; GFNIAVX2-NEXT: orl %eax, %edi 429; GFNIAVX2-NEXT: movl %edi, %eax 430; GFNIAVX2-NEXT: andl $858993459, %eax # imm = 0x33333333 431; GFNIAVX2-NEXT: shrl $2, %edi 432; GFNIAVX2-NEXT: andl $858993459, %edi # imm = 0x33333333 433; GFNIAVX2-NEXT: leal (%rdi,%rax,4), %eax 434; GFNIAVX2-NEXT: movl %eax, %ecx 435; GFNIAVX2-NEXT: andl $1431655765, %ecx # imm = 0x55555555 436; GFNIAVX2-NEXT: shrl %eax 437; GFNIAVX2-NEXT: andl $1431655765, %eax # imm = 0x55555555 438; GFNIAVX2-NEXT: leal (%rax,%rcx,2), %eax 439; GFNIAVX2-NEXT: retq 440; 441; GFNIAVX512F-LABEL: test_bitreverse_i32: 442; GFNIAVX512F: # %bb.0: 443; GFNIAVX512F-NEXT: # kill: def $edi killed $edi def $rdi 444; GFNIAVX512F-NEXT: bswapl %edi 445; GFNIAVX512F-NEXT: movl %edi, %eax 446; GFNIAVX512F-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F 447; GFNIAVX512F-NEXT: shll $4, %eax 448; GFNIAVX512F-NEXT: shrl $4, %edi 449; GFNIAVX512F-NEXT: andl $252645135, %edi # imm = 
0xF0F0F0F 450; GFNIAVX512F-NEXT: orl %eax, %edi 451; GFNIAVX512F-NEXT: movl %edi, %eax 452; GFNIAVX512F-NEXT: andl $858993459, %eax # imm = 0x33333333 453; GFNIAVX512F-NEXT: shrl $2, %edi 454; GFNIAVX512F-NEXT: andl $858993459, %edi # imm = 0x33333333 455; GFNIAVX512F-NEXT: leal (%rdi,%rax,4), %eax 456; GFNIAVX512F-NEXT: movl %eax, %ecx 457; GFNIAVX512F-NEXT: andl $1431655765, %ecx # imm = 0x55555555 458; GFNIAVX512F-NEXT: shrl %eax 459; GFNIAVX512F-NEXT: andl $1431655765, %eax # imm = 0x55555555 460; GFNIAVX512F-NEXT: leal (%rax,%rcx,2), %eax 461; GFNIAVX512F-NEXT: retq 462; 463; GFNIAVX512BW-LABEL: test_bitreverse_i32: 464; GFNIAVX512BW: # %bb.0: 465; GFNIAVX512BW-NEXT: # kill: def $edi killed $edi def $rdi 466; GFNIAVX512BW-NEXT: bswapl %edi 467; GFNIAVX512BW-NEXT: movl %edi, %eax 468; GFNIAVX512BW-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F 469; GFNIAVX512BW-NEXT: shll $4, %eax 470; GFNIAVX512BW-NEXT: shrl $4, %edi 471; GFNIAVX512BW-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F 472; GFNIAVX512BW-NEXT: orl %eax, %edi 473; GFNIAVX512BW-NEXT: movl %edi, %eax 474; GFNIAVX512BW-NEXT: andl $858993459, %eax # imm = 0x33333333 475; GFNIAVX512BW-NEXT: shrl $2, %edi 476; GFNIAVX512BW-NEXT: andl $858993459, %edi # imm = 0x33333333 477; GFNIAVX512BW-NEXT: leal (%rdi,%rax,4), %eax 478; GFNIAVX512BW-NEXT: movl %eax, %ecx 479; GFNIAVX512BW-NEXT: andl $1431655765, %ecx # imm = 0x55555555 480; GFNIAVX512BW-NEXT: shrl %eax 481; GFNIAVX512BW-NEXT: andl $1431655765, %eax # imm = 0x55555555 482; GFNIAVX512BW-NEXT: leal (%rax,%rcx,2), %eax 483; GFNIAVX512BW-NEXT: retq 484 %b = call i32 @llvm.bitreverse.i32(i32 %a) 485 ret i32 %b 486} 487 488define i64 @test_bitreverse_i64(i64 %a) nounwind { 489; SSE-LABEL: test_bitreverse_i64: 490; SSE: # %bb.0: 491; SSE-NEXT: bswapq %rdi 492; SSE-NEXT: movq %rdi, %rax 493; SSE-NEXT: shrq $4, %rax 494; SSE-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F 495; SSE-NEXT: andq %rcx, %rax 496; SSE-NEXT: andq %rcx, %rdi 497; SSE-NEXT: shlq $4, %rdi 498; SSE-NEXT: orq %rax, %rdi 499; SSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 500; SSE-NEXT: movq %rdi, %rcx 501; SSE-NEXT: andq %rax, %rcx 502; SSE-NEXT: shrq $2, %rdi 503; SSE-NEXT: andq %rax, %rdi 504; SSE-NEXT: leaq (%rdi,%rcx,4), %rax 505; SSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 506; SSE-NEXT: movq %rax, %rdx 507; SSE-NEXT: andq %rcx, %rdx 508; SSE-NEXT: shrq %rax 509; SSE-NEXT: andq %rcx, %rax 510; SSE-NEXT: leaq (%rax,%rdx,2), %rax 511; SSE-NEXT: retq 512; 513; AVX-LABEL: test_bitreverse_i64: 514; AVX: # %bb.0: 515; AVX-NEXT: bswapq %rdi 516; AVX-NEXT: movq %rdi, %rax 517; AVX-NEXT: shrq $4, %rax 518; AVX-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F 519; AVX-NEXT: andq %rcx, %rax 520; AVX-NEXT: andq %rcx, %rdi 521; AVX-NEXT: shlq $4, %rdi 522; AVX-NEXT: orq %rax, %rdi 523; AVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 524; AVX-NEXT: movq %rdi, %rcx 525; AVX-NEXT: andq %rax, %rcx 526; AVX-NEXT: shrq $2, %rdi 527; AVX-NEXT: andq %rax, %rdi 528; AVX-NEXT: leaq (%rdi,%rcx,4), %rax 529; AVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 530; AVX-NEXT: movq %rax, %rdx 531; AVX-NEXT: andq %rcx, %rdx 532; AVX-NEXT: shrq %rax 533; AVX-NEXT: andq %rcx, %rax 534; AVX-NEXT: leaq (%rax,%rdx,2), %rax 535; AVX-NEXT: retq 536; 537; XOP-LABEL: test_bitreverse_i64: 538; XOP: # %bb.0: 539; XOP-NEXT: vmovq %rdi, %xmm0 540; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0 541; 
XOP-NEXT: vmovq %xmm0, %rax 542; XOP-NEXT: retq 543; 544; GFNISSE-LABEL: test_bitreverse_i64: 545; GFNISSE: # %bb.0: 546; GFNISSE-NEXT: bswapq %rdi 547; GFNISSE-NEXT: movq %rdi, %rax 548; GFNISSE-NEXT: shrq $4, %rax 549; GFNISSE-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F 550; GFNISSE-NEXT: andq %rcx, %rax 551; GFNISSE-NEXT: andq %rcx, %rdi 552; GFNISSE-NEXT: shlq $4, %rdi 553; GFNISSE-NEXT: orq %rax, %rdi 554; GFNISSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 555; GFNISSE-NEXT: movq %rdi, %rcx 556; GFNISSE-NEXT: andq %rax, %rcx 557; GFNISSE-NEXT: shrq $2, %rdi 558; GFNISSE-NEXT: andq %rax, %rdi 559; GFNISSE-NEXT: leaq (%rdi,%rcx,4), %rax 560; GFNISSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 561; GFNISSE-NEXT: movq %rax, %rdx 562; GFNISSE-NEXT: andq %rcx, %rdx 563; GFNISSE-NEXT: shrq %rax 564; GFNISSE-NEXT: andq %rcx, %rax 565; GFNISSE-NEXT: leaq (%rax,%rdx,2), %rax 566; GFNISSE-NEXT: retq 567; 568; GFNIAVX-LABEL: test_bitreverse_i64: 569; GFNIAVX: # %bb.0: 570; GFNIAVX-NEXT: bswapq %rdi 571; GFNIAVX-NEXT: movq %rdi, %rax 572; GFNIAVX-NEXT: shrq $4, %rax 573; GFNIAVX-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F 574; GFNIAVX-NEXT: andq %rcx, %rax 575; GFNIAVX-NEXT: andq %rcx, %rdi 576; GFNIAVX-NEXT: shlq $4, %rdi 577; GFNIAVX-NEXT: orq %rax, %rdi 578; GFNIAVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 579; GFNIAVX-NEXT: movq %rdi, %rcx 580; GFNIAVX-NEXT: andq %rax, %rcx 581; GFNIAVX-NEXT: shrq $2, %rdi 582; GFNIAVX-NEXT: andq %rax, %rdi 583; GFNIAVX-NEXT: leaq (%rdi,%rcx,4), %rax 584; GFNIAVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 585; GFNIAVX-NEXT: movq %rax, %rdx 586; GFNIAVX-NEXT: andq %rcx, %rdx 587; GFNIAVX-NEXT: shrq %rax 588; GFNIAVX-NEXT: andq %rcx, %rax 589; GFNIAVX-NEXT: leaq (%rax,%rdx,2), %rax 590; GFNIAVX-NEXT: retq 591; 592; GFNIAVX2-LABEL: test_bitreverse_i64: 593; GFNIAVX2: # %bb.0: 594; GFNIAVX2-NEXT: bswapq %rdi 595; GFNIAVX2-NEXT: movq %rdi, %rax 596; GFNIAVX2-NEXT: shrq $4, %rax 597; GFNIAVX2-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F 598; GFNIAVX2-NEXT: andq %rcx, %rax 599; GFNIAVX2-NEXT: andq %rcx, %rdi 600; GFNIAVX2-NEXT: shlq $4, %rdi 601; GFNIAVX2-NEXT: orq %rax, %rdi 602; GFNIAVX2-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 603; GFNIAVX2-NEXT: movq %rdi, %rcx 604; GFNIAVX2-NEXT: andq %rax, %rcx 605; GFNIAVX2-NEXT: shrq $2, %rdi 606; GFNIAVX2-NEXT: andq %rax, %rdi 607; GFNIAVX2-NEXT: leaq (%rdi,%rcx,4), %rax 608; GFNIAVX2-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 609; GFNIAVX2-NEXT: movq %rax, %rdx 610; GFNIAVX2-NEXT: andq %rcx, %rdx 611; GFNIAVX2-NEXT: shrq %rax 612; GFNIAVX2-NEXT: andq %rcx, %rax 613; GFNIAVX2-NEXT: leaq (%rax,%rdx,2), %rax 614; GFNIAVX2-NEXT: retq 615; 616; GFNIAVX512F-LABEL: test_bitreverse_i64: 617; GFNIAVX512F: # %bb.0: 618; GFNIAVX512F-NEXT: bswapq %rdi 619; GFNIAVX512F-NEXT: movq %rdi, %rax 620; GFNIAVX512F-NEXT: shrq $4, %rax 621; GFNIAVX512F-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F 622; GFNIAVX512F-NEXT: andq %rcx, %rax 623; GFNIAVX512F-NEXT: andq %rcx, %rdi 624; GFNIAVX512F-NEXT: shlq $4, %rdi 625; GFNIAVX512F-NEXT: orq %rax, %rdi 626; GFNIAVX512F-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 627; GFNIAVX512F-NEXT: movq %rdi, %rcx 628; GFNIAVX512F-NEXT: andq %rax, %rcx 629; GFNIAVX512F-NEXT: shrq $2, %rdi 630; GFNIAVX512F-NEXT: andq %rax, %rdi 631; GFNIAVX512F-NEXT: 
leaq (%rdi,%rcx,4), %rax 632; GFNIAVX512F-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 633; GFNIAVX512F-NEXT: movq %rax, %rdx 634; GFNIAVX512F-NEXT: andq %rcx, %rdx 635; GFNIAVX512F-NEXT: shrq %rax 636; GFNIAVX512F-NEXT: andq %rcx, %rax 637; GFNIAVX512F-NEXT: leaq (%rax,%rdx,2), %rax 638; GFNIAVX512F-NEXT: retq 639; 640; GFNIAVX512BW-LABEL: test_bitreverse_i64: 641; GFNIAVX512BW: # %bb.0: 642; GFNIAVX512BW-NEXT: bswapq %rdi 643; GFNIAVX512BW-NEXT: movq %rdi, %rax 644; GFNIAVX512BW-NEXT: shrq $4, %rax 645; GFNIAVX512BW-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F 646; GFNIAVX512BW-NEXT: andq %rcx, %rax 647; GFNIAVX512BW-NEXT: andq %rcx, %rdi 648; GFNIAVX512BW-NEXT: shlq $4, %rdi 649; GFNIAVX512BW-NEXT: orq %rax, %rdi 650; GFNIAVX512BW-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 651; GFNIAVX512BW-NEXT: movq %rdi, %rcx 652; GFNIAVX512BW-NEXT: andq %rax, %rcx 653; GFNIAVX512BW-NEXT: shrq $2, %rdi 654; GFNIAVX512BW-NEXT: andq %rax, %rdi 655; GFNIAVX512BW-NEXT: leaq (%rdi,%rcx,4), %rax 656; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 657; GFNIAVX512BW-NEXT: movq %rax, %rdx 658; GFNIAVX512BW-NEXT: andq %rcx, %rdx 659; GFNIAVX512BW-NEXT: shrq %rax 660; GFNIAVX512BW-NEXT: andq %rcx, %rax 661; GFNIAVX512BW-NEXT: leaq (%rax,%rdx,2), %rax 662; GFNIAVX512BW-NEXT: retq 663 %b = call i64 @llvm.bitreverse.i64(i64 %a) 664 ret i64 %b 665} 666 667define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind { 668; SSE2-LABEL: test_bitreverse_v16i8: 669; SSE2: # %bb.0: 670; SSE2-NEXT: movdqa %xmm0, %xmm1 671; SSE2-NEXT: psrlw $4, %xmm1 672; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 673; SSE2-NEXT: pand %xmm2, %xmm1 674; SSE2-NEXT: pand %xmm2, %xmm0 675; SSE2-NEXT: psllw $4, %xmm0 676; SSE2-NEXT: por %xmm1, %xmm0 677; SSE2-NEXT: movdqa %xmm0, %xmm1 678; SSE2-NEXT: psrlw $2, %xmm1 679; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 680; SSE2-NEXT: pand %xmm2, %xmm1 681; SSE2-NEXT: pand %xmm2, %xmm0 682; SSE2-NEXT: psllw $2, %xmm0 683; SSE2-NEXT: por %xmm1, %xmm0 684; SSE2-NEXT: movdqa %xmm0, %xmm1 685; SSE2-NEXT: psrlw $1, %xmm1 686; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 687; SSE2-NEXT: pand %xmm2, %xmm1 688; SSE2-NEXT: pand %xmm2, %xmm0 689; SSE2-NEXT: paddb %xmm0, %xmm0 690; SSE2-NEXT: por %xmm1, %xmm0 691; SSE2-NEXT: retq 692; 693; SSSE3-LABEL: test_bitreverse_v16i8: 694; SSSE3: # %bb.0: 695; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 696; SSSE3-NEXT: movdqa %xmm0, %xmm2 697; SSSE3-NEXT: pand %xmm1, %xmm2 698; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 699; SSSE3-NEXT: pshufb %xmm2, %xmm3 700; SSSE3-NEXT: psrlw $4, %xmm0 701; SSSE3-NEXT: pand %xmm1, %xmm0 702; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 703; SSSE3-NEXT: pshufb %xmm0, %xmm1 704; SSSE3-NEXT: por %xmm3, %xmm1 705; SSSE3-NEXT: movdqa %xmm1, %xmm0 706; SSSE3-NEXT: retq 707; 708; AVX-LABEL: test_bitreverse_v16i8: 709; AVX: # %bb.0: 710; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 711; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 712; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 713; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 714; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 715; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 716; AVX-NEXT: vmovdqa 
{{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 717; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 718; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 719; AVX-NEXT: retq 720; 721; XOP-LABEL: test_bitreverse_v16i8: 722; XOP: # %bb.0: 723; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0 724; XOP-NEXT: retq 725; 726; GFNISSE-LABEL: test_bitreverse_v16i8: 727; GFNISSE: # %bb.0: 728; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 729; GFNISSE-NEXT: retq 730; 731; GFNIAVX-LABEL: test_bitreverse_v16i8: 732; GFNIAVX: # %bb.0: 733; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 734; GFNIAVX-NEXT: retq 735; 736; GFNIAVX2-LABEL: test_bitreverse_v16i8: 737; GFNIAVX2: # %bb.0: 738; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 739; GFNIAVX2-NEXT: retq 740; 741; GFNIAVX512F-LABEL: test_bitreverse_v16i8: 742; GFNIAVX512F: # %bb.0: 743; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 744; GFNIAVX512F-NEXT: retq 745; 746; GFNIAVX512BW-LABEL: test_bitreverse_v16i8: 747; GFNIAVX512BW: # %bb.0: 748; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 749; GFNIAVX512BW-NEXT: retq 750 %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) 751 ret <16 x i8> %b 752} 753 754define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind { 755; SSE2-LABEL: test_bitreverse_v8i16: 756; SSE2: # %bb.0: 757; SSE2-NEXT: movdqa %xmm0, %xmm1 758; SSE2-NEXT: psrlw $8, %xmm1 759; SSE2-NEXT: psllw $8, %xmm0 760; SSE2-NEXT: por %xmm1, %xmm0 761; SSE2-NEXT: movdqa %xmm0, %xmm1 762; SSE2-NEXT: psrlw $4, %xmm1 763; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 764; SSE2-NEXT: pand %xmm2, %xmm1 765; SSE2-NEXT: pand %xmm2, %xmm0 766; SSE2-NEXT: psllw $4, %xmm0 767; SSE2-NEXT: por %xmm1, %xmm0 768; SSE2-NEXT: movdqa %xmm0, %xmm1 769; SSE2-NEXT: psrlw $2, %xmm1 770; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 771; SSE2-NEXT: pand %xmm2, %xmm1 772; SSE2-NEXT: pand %xmm2, %xmm0 773; SSE2-NEXT: psllw $2, %xmm0 774; SSE2-NEXT: por %xmm1, %xmm0 775; SSE2-NEXT: movdqa %xmm0, %xmm1 776; SSE2-NEXT: psrlw $1, %xmm1 777; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 778; SSE2-NEXT: pand %xmm2, %xmm1 779; SSE2-NEXT: pand %xmm2, %xmm0 780; SSE2-NEXT: paddb %xmm0, %xmm0 781; SSE2-NEXT: por %xmm1, %xmm0 782; SSE2-NEXT: retq 783; 784; SSSE3-LABEL: test_bitreverse_v8i16: 785; SSSE3: # %bb.0: 786; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 787; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 788; SSSE3-NEXT: movdqa %xmm0, %xmm2 789; SSSE3-NEXT: pand %xmm1, %xmm2 790; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 791; SSSE3-NEXT: pshufb %xmm2, %xmm3 792; SSSE3-NEXT: psrlw $4, %xmm0 793; SSSE3-NEXT: pand %xmm1, %xmm0 794; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 795; SSSE3-NEXT: pshufb %xmm0, %xmm1 796; SSSE3-NEXT: por %xmm3, %xmm1 797; SSSE3-NEXT: movdqa %xmm1, %xmm0 798; SSSE3-NEXT: retq 799; 800; AVX-LABEL: test_bitreverse_v8i16: 801; AVX: # %bb.0: 802; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 803; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 804; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 805; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 806; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 807; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 808; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 809; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 810; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 811; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 812; AVX-NEXT: retq 813; 814; XOP-LABEL: test_bitreverse_v8i16: 815; XOP: # %bb.0: 816; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0 817; XOP-NEXT: retq 818; 819; GFNISSE-LABEL: test_bitreverse_v8i16: 820; GFNISSE: # %bb.0: 821; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 822; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 823; GFNISSE-NEXT: retq 824; 825; GFNIAVX-LABEL: test_bitreverse_v8i16: 826; GFNIAVX: # %bb.0: 827; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 828; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 829; GFNIAVX-NEXT: retq 830; 831; GFNIAVX2-LABEL: test_bitreverse_v8i16: 832; GFNIAVX2: # %bb.0: 833; GFNIAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 834; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 835; GFNIAVX2-NEXT: retq 836; 837; GFNIAVX512F-LABEL: test_bitreverse_v8i16: 838; GFNIAVX512F: # %bb.0: 839; GFNIAVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 840; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 841; GFNIAVX512F-NEXT: retq 842; 843; GFNIAVX512BW-LABEL: test_bitreverse_v8i16: 844; GFNIAVX512BW: # %bb.0: 845; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 846; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 847; GFNIAVX512BW-NEXT: retq 848 %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) 849 ret <8 x i16> %b 850} 851 852define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind { 853; SSE2-LABEL: test_bitreverse_v4i32: 854; SSE2: # %bb.0: 855; SSE2-NEXT: pxor %xmm1, %xmm1 856; SSE2-NEXT: movdqa %xmm0, %xmm2 857; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 858; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 859; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 860; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 861; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 862; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 863; SSE2-NEXT: packuswb %xmm2, %xmm0 864; SSE2-NEXT: movdqa %xmm0, %xmm1 865; SSE2-NEXT: psrlw $4, %xmm1 866; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 867; SSE2-NEXT: pand %xmm2, %xmm1 868; SSE2-NEXT: pand %xmm2, %xmm0 869; SSE2-NEXT: psllw $4, %xmm0 870; SSE2-NEXT: por %xmm1, %xmm0 871; SSE2-NEXT: movdqa %xmm0, %xmm1 872; SSE2-NEXT: psrlw $2, %xmm1 873; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 874; SSE2-NEXT: pand %xmm2, %xmm1 875; SSE2-NEXT: pand %xmm2, %xmm0 876; SSE2-NEXT: psllw $2, %xmm0 877; SSE2-NEXT: por %xmm1, %xmm0 878; SSE2-NEXT: movdqa %xmm0, %xmm1 879; SSE2-NEXT: psrlw $1, %xmm1 880; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 881; 
SSE2-NEXT: pand %xmm2, %xmm1 882; SSE2-NEXT: pand %xmm2, %xmm0 883; SSE2-NEXT: paddb %xmm0, %xmm0 884; SSE2-NEXT: por %xmm1, %xmm0 885; SSE2-NEXT: retq 886; 887; SSSE3-LABEL: test_bitreverse_v4i32: 888; SSSE3: # %bb.0: 889; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 890; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 891; SSSE3-NEXT: movdqa %xmm0, %xmm2 892; SSSE3-NEXT: pand %xmm1, %xmm2 893; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 894; SSSE3-NEXT: pshufb %xmm2, %xmm3 895; SSSE3-NEXT: psrlw $4, %xmm0 896; SSSE3-NEXT: pand %xmm1, %xmm0 897; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 898; SSSE3-NEXT: pshufb %xmm0, %xmm1 899; SSSE3-NEXT: por %xmm3, %xmm1 900; SSSE3-NEXT: movdqa %xmm1, %xmm0 901; SSSE3-NEXT: retq 902; 903; AVX-LABEL: test_bitreverse_v4i32: 904; AVX: # %bb.0: 905; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 906; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 907; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 908; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 909; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 910; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 911; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 912; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 913; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 914; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 915; AVX-NEXT: retq 916; 917; XOP-LABEL: test_bitreverse_v4i32: 918; XOP: # %bb.0: 919; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0 920; XOP-NEXT: retq 921; 922; GFNISSE-LABEL: test_bitreverse_v4i32: 923; GFNISSE: # %bb.0: 924; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 925; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 926; GFNISSE-NEXT: retq 927; 928; GFNIAVX-LABEL: test_bitreverse_v4i32: 929; GFNIAVX: # %bb.0: 930; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 931; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 932; GFNIAVX-NEXT: retq 933; 934; GFNIAVX2-LABEL: test_bitreverse_v4i32: 935; GFNIAVX2: # %bb.0: 936; GFNIAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 937; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 938; GFNIAVX2-NEXT: retq 939; 940; GFNIAVX512F-LABEL: test_bitreverse_v4i32: 941; GFNIAVX512F: # %bb.0: 942; GFNIAVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 943; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 944; GFNIAVX512F-NEXT: retq 945; 946; GFNIAVX512BW-LABEL: test_bitreverse_v4i32: 947; GFNIAVX512BW: # %bb.0: 948; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 949; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 950; GFNIAVX512BW-NEXT: retq 951 %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) 952 ret <4 x i32> %b 953} 954 955define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { 956; SSE2-LABEL: test_bitreverse_v2i64: 957; SSE2: # %bb.0: 958; SSE2-NEXT: pxor %xmm1, %xmm1 959; SSE2-NEXT: movdqa %xmm0, %xmm2 960; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 961; 
SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 962; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 963; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 964; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 965; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 966; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 967; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 968; SSE2-NEXT: packuswb %xmm2, %xmm0 969; SSE2-NEXT: movdqa %xmm0, %xmm1 970; SSE2-NEXT: psrlw $4, %xmm1 971; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 972; SSE2-NEXT: pand %xmm2, %xmm1 973; SSE2-NEXT: pand %xmm2, %xmm0 974; SSE2-NEXT: psllw $4, %xmm0 975; SSE2-NEXT: por %xmm1, %xmm0 976; SSE2-NEXT: movdqa %xmm0, %xmm1 977; SSE2-NEXT: psrlw $2, %xmm1 978; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 979; SSE2-NEXT: pand %xmm2, %xmm1 980; SSE2-NEXT: pand %xmm2, %xmm0 981; SSE2-NEXT: psllw $2, %xmm0 982; SSE2-NEXT: por %xmm1, %xmm0 983; SSE2-NEXT: movdqa %xmm0, %xmm1 984; SSE2-NEXT: psrlw $1, %xmm1 985; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 986; SSE2-NEXT: pand %xmm2, %xmm1 987; SSE2-NEXT: pand %xmm2, %xmm0 988; SSE2-NEXT: paddb %xmm0, %xmm0 989; SSE2-NEXT: por %xmm1, %xmm0 990; SSE2-NEXT: retq 991; 992; SSSE3-LABEL: test_bitreverse_v2i64: 993; SSSE3: # %bb.0: 994; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 995; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 996; SSSE3-NEXT: movdqa %xmm0, %xmm2 997; SSSE3-NEXT: pand %xmm1, %xmm2 998; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 999; SSSE3-NEXT: pshufb %xmm2, %xmm3 1000; SSSE3-NEXT: psrlw $4, %xmm0 1001; SSSE3-NEXT: pand %xmm1, %xmm0 1002; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1003; SSSE3-NEXT: pshufb %xmm0, %xmm1 1004; SSSE3-NEXT: por %xmm3, %xmm1 1005; SSSE3-NEXT: movdqa %xmm1, %xmm0 1006; SSSE3-NEXT: retq 1007; 1008; AVX-LABEL: test_bitreverse_v2i64: 1009; AVX: # %bb.0: 1010; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1011; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1012; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 1013; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1014; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1015; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1016; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 1017; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1018; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 1019; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 1020; AVX-NEXT: retq 1021; 1022; XOP-LABEL: test_bitreverse_v2i64: 1023; XOP: # %bb.0: 1024; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0 1025; XOP-NEXT: retq 1026; 1027; GFNISSE-LABEL: test_bitreverse_v2i64: 1028; GFNISSE: # %bb.0: 1029; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1030; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1031; GFNISSE-NEXT: retq 1032; 1033; GFNIAVX-LABEL: test_bitreverse_v2i64: 1034; GFNIAVX: # %bb.0: 1035; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1036; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1037; 
GFNIAVX-NEXT: retq 1038; 1039; GFNIAVX2-LABEL: test_bitreverse_v2i64: 1040; GFNIAVX2: # %bb.0: 1041; GFNIAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1042; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1043; GFNIAVX2-NEXT: retq 1044; 1045; GFNIAVX512F-LABEL: test_bitreverse_v2i64: 1046; GFNIAVX512F: # %bb.0: 1047; GFNIAVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1048; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1049; GFNIAVX512F-NEXT: retq 1050; 1051; GFNIAVX512BW-LABEL: test_bitreverse_v2i64: 1052; GFNIAVX512BW: # %bb.0: 1053; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1054; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1055; GFNIAVX512BW-NEXT: retq 1056 %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) 1057 ret <2 x i64> %b 1058} 1059 1060define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind { 1061; SSE2-LABEL: test_bitreverse_v32i8: 1062; SSE2: # %bb.0: 1063; SSE2-NEXT: movdqa %xmm0, %xmm3 1064; SSE2-NEXT: psrlw $4, %xmm3 1065; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1066; SSE2-NEXT: pand %xmm2, %xmm3 1067; SSE2-NEXT: pand %xmm2, %xmm0 1068; SSE2-NEXT: psllw $4, %xmm0 1069; SSE2-NEXT: por %xmm3, %xmm0 1070; SSE2-NEXT: movdqa %xmm0, %xmm4 1071; SSE2-NEXT: psrlw $2, %xmm4 1072; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1073; SSE2-NEXT: pand %xmm3, %xmm4 1074; SSE2-NEXT: pand %xmm3, %xmm0 1075; SSE2-NEXT: psllw $2, %xmm0 1076; SSE2-NEXT: por %xmm4, %xmm0 1077; SSE2-NEXT: movdqa %xmm0, %xmm5 1078; SSE2-NEXT: psrlw $1, %xmm5 1079; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1080; SSE2-NEXT: pand %xmm4, %xmm5 1081; SSE2-NEXT: pand %xmm4, %xmm0 1082; SSE2-NEXT: paddb %xmm0, %xmm0 1083; SSE2-NEXT: por %xmm5, %xmm0 1084; SSE2-NEXT: movdqa %xmm1, %xmm5 1085; SSE2-NEXT: psrlw $4, %xmm5 1086; SSE2-NEXT: pand %xmm2, %xmm5 1087; SSE2-NEXT: pand %xmm2, %xmm1 1088; SSE2-NEXT: psllw $4, %xmm1 1089; SSE2-NEXT: por %xmm5, %xmm1 1090; SSE2-NEXT: movdqa %xmm1, %xmm2 1091; SSE2-NEXT: psrlw $2, %xmm2 1092; SSE2-NEXT: pand %xmm3, %xmm2 1093; SSE2-NEXT: pand %xmm3, %xmm1 1094; SSE2-NEXT: psllw $2, %xmm1 1095; SSE2-NEXT: por %xmm2, %xmm1 1096; SSE2-NEXT: movdqa %xmm1, %xmm2 1097; SSE2-NEXT: psrlw $1, %xmm2 1098; SSE2-NEXT: pand %xmm4, %xmm2 1099; SSE2-NEXT: pand %xmm4, %xmm1 1100; SSE2-NEXT: paddb %xmm1, %xmm1 1101; SSE2-NEXT: por %xmm2, %xmm1 1102; SSE2-NEXT: retq 1103; 1104; SSSE3-LABEL: test_bitreverse_v32i8: 1105; SSSE3: # %bb.0: 1106; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1107; SSSE3-NEXT: movdqa %xmm0, %xmm2 1108; SSSE3-NEXT: pand %xmm4, %xmm2 1109; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1110; SSSE3-NEXT: movdqa %xmm5, %xmm6 1111; SSSE3-NEXT: pshufb %xmm2, %xmm6 1112; SSSE3-NEXT: psrlw $4, %xmm0 1113; SSSE3-NEXT: pand %xmm4, %xmm0 1114; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1115; SSSE3-NEXT: movdqa %xmm2, %xmm3 1116; SSSE3-NEXT: pshufb %xmm0, %xmm3 1117; SSSE3-NEXT: por %xmm6, %xmm3 1118; SSSE3-NEXT: movdqa %xmm1, %xmm0 1119; SSSE3-NEXT: pand %xmm4, %xmm0 1120; SSSE3-NEXT: pshufb %xmm0, %xmm5 1121; SSSE3-NEXT: psrlw $4, %xmm1 1122; SSSE3-NEXT: pand %xmm4, %xmm1 1123; SSSE3-NEXT: pshufb %xmm1, %xmm2 1124; 
SSSE3-NEXT: por %xmm5, %xmm2 1125; SSSE3-NEXT: movdqa %xmm3, %xmm0 1126; SSSE3-NEXT: movdqa %xmm2, %xmm1 1127; SSSE3-NEXT: retq 1128; 1129; AVX1-LABEL: test_bitreverse_v32i8: 1130; AVX1: # %bb.0: 1131; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1132; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1133; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 1134; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1135; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 1136; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1137; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1138; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1139; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 1140; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 1141; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 1142; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 1143; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1144; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1145; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 1146; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 1147; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1148; AVX1-NEXT: retq 1149; 1150; AVX2-LABEL: test_bitreverse_v32i8: 1151; AVX2: # %bb.0: 1152; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1153; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1154; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1155; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1156; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1157; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1158; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1159; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1160; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1161; AVX2-NEXT: retq 1162; 1163; AVX512-LABEL: test_bitreverse_v32i8: 1164; AVX512: # %bb.0: 1165; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1166; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1167; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1168; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1169; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1170; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1171; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1172; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1173; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1174; AVX512-NEXT: retq 1175; 1176; XOPAVX1-LABEL: test_bitreverse_v32i8: 1177; XOPAVX1: # %bb.0: 1178; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1179; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1180; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1181; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1182; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1183; XOPAVX1-NEXT: retq 1184; 1185; XOPAVX2-LABEL: test_bitreverse_v32i8: 1186; XOPAVX2: # %bb.0: 1187; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1188; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1189; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1190; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1191; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1192; XOPAVX2-NEXT: retq 1193; 1194; GFNISSE-LABEL: test_bitreverse_v32i8: 1195; GFNISSE: # %bb.0: 1196; 
GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] 1197; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 1198; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 1199; GFNISSE-NEXT: retq 1200; 1201; GFNIAVX-LABEL: test_bitreverse_v32i8: 1202; GFNIAVX: # %bb.0: 1203; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1204; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] 1205; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm2, %xmm1, %xmm1 1206; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm2, %xmm0, %xmm0 1207; GFNIAVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1208; GFNIAVX-NEXT: retq 1209; 1210; GFNIAVX2-LABEL: test_bitreverse_v32i8: 1211; GFNIAVX2: # %bb.0: 1212; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1213; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1214; GFNIAVX2-NEXT: retq 1215; 1216; GFNIAVX512F-LABEL: test_bitreverse_v32i8: 1217; GFNIAVX512F: # %bb.0: 1218; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1219; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1220; GFNIAVX512F-NEXT: retq 1221; 1222; GFNIAVX512BW-LABEL: test_bitreverse_v32i8: 1223; GFNIAVX512BW: # %bb.0: 1224; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1225; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1226; GFNIAVX512BW-NEXT: retq 1227 %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) 1228 ret <32 x i8> %b 1229} 1230 1231define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { 1232; SSE2-LABEL: test_bitreverse_v16i16: 1233; SSE2: # %bb.0: 1234; SSE2-NEXT: movdqa %xmm0, %xmm2 1235; SSE2-NEXT: psrlw $8, %xmm2 1236; SSE2-NEXT: psllw $8, %xmm0 1237; SSE2-NEXT: por %xmm2, %xmm0 1238; SSE2-NEXT: movdqa %xmm0, %xmm3 1239; SSE2-NEXT: psrlw $4, %xmm3 1240; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1241; SSE2-NEXT: pand %xmm2, %xmm3 1242; SSE2-NEXT: pand %xmm2, %xmm0 1243; SSE2-NEXT: psllw $4, %xmm0 1244; SSE2-NEXT: por %xmm3, %xmm0 1245; SSE2-NEXT: movdqa %xmm0, %xmm4 1246; SSE2-NEXT: psrlw $2, %xmm4 1247; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1248; SSE2-NEXT: pand %xmm3, %xmm4 1249; SSE2-NEXT: pand %xmm3, %xmm0 1250; SSE2-NEXT: psllw $2, %xmm0 1251; SSE2-NEXT: por %xmm4, %xmm0 1252; SSE2-NEXT: movdqa %xmm0, %xmm5 1253; SSE2-NEXT: psrlw $1, %xmm5 1254; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1255; SSE2-NEXT: pand %xmm4, %xmm5 1256; SSE2-NEXT: pand %xmm4, %xmm0 1257; SSE2-NEXT: paddb %xmm0, %xmm0 1258; SSE2-NEXT: por %xmm5, %xmm0 1259; SSE2-NEXT: movdqa %xmm1, %xmm5 1260; SSE2-NEXT: psrlw $8, %xmm5 1261; SSE2-NEXT: psllw $8, %xmm1 1262; SSE2-NEXT: por %xmm5, %xmm1 1263; SSE2-NEXT: movdqa %xmm1, %xmm5 1264; SSE2-NEXT: psrlw $4, %xmm5 1265; SSE2-NEXT: pand %xmm2, %xmm5 1266; SSE2-NEXT: pand %xmm2, %xmm1 1267; SSE2-NEXT: psllw $4, %xmm1 1268; SSE2-NEXT: por %xmm5, %xmm1 1269; SSE2-NEXT: movdqa %xmm1, %xmm2 1270; SSE2-NEXT: psrlw $2, %xmm2 1271; SSE2-NEXT: pand %xmm3, %xmm2 1272; SSE2-NEXT: pand %xmm3, %xmm1 1273; SSE2-NEXT: psllw $2, %xmm1 1274; SSE2-NEXT: por %xmm2, %xmm1 1275; SSE2-NEXT: movdqa %xmm1, %xmm2 1276; SSE2-NEXT: psrlw $1, %xmm2 1277; SSE2-NEXT: pand %xmm4, %xmm2 1278; SSE2-NEXT: pand %xmm4, %xmm1 1279; SSE2-NEXT: paddb %xmm1, %xmm1 1280; SSE2-NEXT: por %xmm2, %xmm1 
1281; SSE2-NEXT: retq 1282; 1283; SSSE3-LABEL: test_bitreverse_v16i16: 1284; SSSE3: # %bb.0: 1285; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1286; SSSE3-NEXT: pshufb %xmm4, %xmm0 1287; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1288; SSSE3-NEXT: movdqa %xmm0, %xmm2 1289; SSSE3-NEXT: pand %xmm5, %xmm2 1290; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1291; SSSE3-NEXT: movdqa %xmm6, %xmm7 1292; SSSE3-NEXT: pshufb %xmm2, %xmm7 1293; SSSE3-NEXT: psrlw $4, %xmm0 1294; SSSE3-NEXT: pand %xmm5, %xmm0 1295; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1296; SSSE3-NEXT: movdqa %xmm2, %xmm3 1297; SSSE3-NEXT: pshufb %xmm0, %xmm3 1298; SSSE3-NEXT: por %xmm7, %xmm3 1299; SSSE3-NEXT: pshufb %xmm4, %xmm1 1300; SSSE3-NEXT: movdqa %xmm1, %xmm0 1301; SSSE3-NEXT: pand %xmm5, %xmm0 1302; SSSE3-NEXT: pshufb %xmm0, %xmm6 1303; SSSE3-NEXT: psrlw $4, %xmm1 1304; SSSE3-NEXT: pand %xmm5, %xmm1 1305; SSSE3-NEXT: pshufb %xmm1, %xmm2 1306; SSSE3-NEXT: por %xmm6, %xmm2 1307; SSSE3-NEXT: movdqa %xmm3, %xmm0 1308; SSSE3-NEXT: movdqa %xmm2, %xmm1 1309; SSSE3-NEXT: retq 1310; 1311; AVX1-LABEL: test_bitreverse_v16i16: 1312; AVX1: # %bb.0: 1313; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1314; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1315; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1316; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1317; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1318; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1319; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1320; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1321; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1322; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1323; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1324; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1325; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1326; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1327; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1328; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1329; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1330; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1331; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1332; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1333; AVX1-NEXT: retq 1334; 1335; AVX2-LABEL: test_bitreverse_v16i16: 1336; AVX2: # %bb.0: 1337; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1338; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1339; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1340; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1341; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1342; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1343; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1344; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1345; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1346; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1347; AVX2-NEXT: retq 1348; 1349; AVX512-LABEL: test_bitreverse_v16i16: 1350; AVX512: # %bb.0: 1351; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1352; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1353; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1354; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1355; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1356; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1357; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1358; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1359; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1360; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1361; AVX512-NEXT: retq 1362; 1363; XOPAVX1-LABEL: test_bitreverse_v16i16: 1364; XOPAVX1: # %bb.0: 1365; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1366; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1367; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1368; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1369; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1370; XOPAVX1-NEXT: retq 1371; 1372; XOPAVX2-LABEL: test_bitreverse_v16i16: 1373; XOPAVX2: # %bb.0: 1374; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1375; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1376; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1377; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1378; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1379; XOPAVX2-NEXT: retq 1380; 1381; GFNISSE-LABEL: test_bitreverse_v16i16: 1382; GFNISSE: # %bb.0: 1383; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1384; GFNISSE-NEXT: pshufb %xmm2, %xmm0 1385; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1386; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 1387; GFNISSE-NEXT: pshufb %xmm2, %xmm1 1388; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 1389; GFNISSE-NEXT: retq 1390; 1391; GFNIAVX-LABEL: test_bitreverse_v16i16: 1392; GFNIAVX: # %bb.0: 1393; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1394; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1395; GFNIAVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1396; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1397; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1 1398; GFNIAVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1399; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0 1400; GFNIAVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1401; GFNIAVX-NEXT: retq 1402; 1403; GFNIAVX2-LABEL: test_bitreverse_v16i16: 1404; GFNIAVX2: # %bb.0: 1405; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1406; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1407; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1408; GFNIAVX2-NEXT: retq 1409; 1410; GFNIAVX512F-LABEL: test_bitreverse_v16i16: 1411; GFNIAVX512F: # %bb.0: 1412; GFNIAVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1413; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1414; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1415; GFNIAVX512F-NEXT: retq 1416; 1417; GFNIAVX512BW-LABEL: test_bitreverse_v16i16: 1418; GFNIAVX512BW: # %bb.0: 1419; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1420; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1421; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1422; GFNIAVX512BW-NEXT: retq 1423 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) 1424 ret <16 x i16> %b 1425} 1426 1427define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { 1428; SSE2-LABEL: test_bitreverse_v8i32: 1429; SSE2: # %bb.0: 1430; SSE2-NEXT: pxor %xmm2, %xmm2 1431; SSE2-NEXT: movdqa %xmm0, %xmm3 1432; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 1433; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 1434; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 1435; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1436; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1437; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1438; SSE2-NEXT: packuswb %xmm3, %xmm0 1439; SSE2-NEXT: movdqa %xmm0, %xmm4 1440; SSE2-NEXT: psrlw $4, %xmm4 1441; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1442; SSE2-NEXT: pand %xmm3, %xmm4 1443; SSE2-NEXT: pand %xmm3, %xmm0 1444; SSE2-NEXT: psllw $4, %xmm0 1445; SSE2-NEXT: por %xmm4, %xmm0 1446; SSE2-NEXT: movdqa %xmm0, %xmm5 1447; SSE2-NEXT: psrlw $2, %xmm5 1448; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1449; SSE2-NEXT: pand %xmm4, %xmm5 1450; SSE2-NEXT: pand %xmm4, %xmm0 1451; SSE2-NEXT: psllw $2, %xmm0 1452; SSE2-NEXT: por %xmm5, %xmm0 1453; SSE2-NEXT: movdqa %xmm0, %xmm6 1454; SSE2-NEXT: psrlw $1, %xmm6 1455; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1456; SSE2-NEXT: pand %xmm5, %xmm6 1457; SSE2-NEXT: pand %xmm5, %xmm0 1458; SSE2-NEXT: paddb %xmm0, %xmm0 1459; SSE2-NEXT: por %xmm6, %xmm0 1460; SSE2-NEXT: movdqa %xmm1, %xmm6 1461; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] 1462; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1463; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1464; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1465; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1466; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1467; SSE2-NEXT: packuswb %xmm6, %xmm1 1468; SSE2-NEXT: movdqa %xmm1, %xmm2 1469; SSE2-NEXT: psrlw $4, %xmm2 1470; SSE2-NEXT: pand %xmm3, %xmm2 1471; SSE2-NEXT: pand %xmm3, %xmm1 1472; SSE2-NEXT: psllw $4, %xmm1 1473; SSE2-NEXT: por %xmm2, %xmm1 1474; SSE2-NEXT: movdqa %xmm1, %xmm2 1475; SSE2-NEXT: psrlw $2, %xmm2 1476; SSE2-NEXT: pand %xmm4, %xmm2 1477; SSE2-NEXT: pand %xmm4, %xmm1 1478; SSE2-NEXT: psllw $2, %xmm1 1479; SSE2-NEXT: por %xmm2, %xmm1 1480; SSE2-NEXT: movdqa %xmm1, %xmm2 1481; SSE2-NEXT: psrlw $1, %xmm2 1482; SSE2-NEXT: pand %xmm5, %xmm2 1483; SSE2-NEXT: pand %xmm5, %xmm1 1484; SSE2-NEXT: paddb %xmm1, %xmm1 1485; SSE2-NEXT: por %xmm2, %xmm1 1486; SSE2-NEXT: retq 1487; 1488; SSSE3-LABEL: test_bitreverse_v8i32: 1489; SSSE3: # 
%bb.0: 1490; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1491; SSSE3-NEXT: pshufb %xmm4, %xmm0 1492; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1493; SSSE3-NEXT: movdqa %xmm0, %xmm2 1494; SSSE3-NEXT: pand %xmm5, %xmm2 1495; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1496; SSSE3-NEXT: movdqa %xmm6, %xmm7 1497; SSSE3-NEXT: pshufb %xmm2, %xmm7 1498; SSSE3-NEXT: psrlw $4, %xmm0 1499; SSSE3-NEXT: pand %xmm5, %xmm0 1500; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1501; SSSE3-NEXT: movdqa %xmm2, %xmm3 1502; SSSE3-NEXT: pshufb %xmm0, %xmm3 1503; SSSE3-NEXT: por %xmm7, %xmm3 1504; SSSE3-NEXT: pshufb %xmm4, %xmm1 1505; SSSE3-NEXT: movdqa %xmm1, %xmm0 1506; SSSE3-NEXT: pand %xmm5, %xmm0 1507; SSSE3-NEXT: pshufb %xmm0, %xmm6 1508; SSSE3-NEXT: psrlw $4, %xmm1 1509; SSSE3-NEXT: pand %xmm5, %xmm1 1510; SSSE3-NEXT: pshufb %xmm1, %xmm2 1511; SSSE3-NEXT: por %xmm6, %xmm2 1512; SSSE3-NEXT: movdqa %xmm3, %xmm0 1513; SSSE3-NEXT: movdqa %xmm2, %xmm1 1514; SSSE3-NEXT: retq 1515; 1516; AVX1-LABEL: test_bitreverse_v8i32: 1517; AVX1: # %bb.0: 1518; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1519; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1520; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1521; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1522; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1523; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1524; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1525; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1526; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1527; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1528; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1529; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1530; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1531; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1532; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1533; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1534; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1535; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1536; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1537; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1538; AVX1-NEXT: retq 1539; 1540; AVX2-LABEL: test_bitreverse_v8i32: 1541; AVX2: # %bb.0: 1542; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1543; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1544; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1545; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1546; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1547; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1548; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1549; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1550; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1551; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1552; AVX2-NEXT: retq 1553; 1554; AVX512-LABEL: test_bitreverse_v8i32: 1555; AVX512: # %bb.0: 1556; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1557; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1558; AVX512-NEXT: vpand %ymm1, %ymm0, 
%ymm2 1559; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1560; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1561; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1562; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1563; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1564; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1565; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1566; AVX512-NEXT: retq 1567; 1568; XOPAVX1-LABEL: test_bitreverse_v8i32: 1569; XOPAVX1: # %bb.0: 1570; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1571; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1572; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1573; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1574; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1575; XOPAVX1-NEXT: retq 1576; 1577; XOPAVX2-LABEL: test_bitreverse_v8i32: 1578; XOPAVX2: # %bb.0: 1579; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1580; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1581; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1582; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1583; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1584; XOPAVX2-NEXT: retq 1585; 1586; GFNISSE-LABEL: test_bitreverse_v8i32: 1587; GFNISSE: # %bb.0: 1588; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1589; GFNISSE-NEXT: pshufb %xmm2, %xmm0 1590; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1591; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 1592; GFNISSE-NEXT: pshufb %xmm2, %xmm1 1593; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 1594; GFNISSE-NEXT: retq 1595; 1596; GFNIAVX-LABEL: test_bitreverse_v8i32: 1597; GFNIAVX: # %bb.0: 1598; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1599; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1600; GFNIAVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1601; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1602; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1 1603; GFNIAVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1604; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0 1605; GFNIAVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1606; GFNIAVX-NEXT: retq 1607; 1608; GFNIAVX2-LABEL: test_bitreverse_v8i32: 1609; GFNIAVX2: # %bb.0: 1610; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1611; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1612; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1613; GFNIAVX2-NEXT: retq 1614; 1615; GFNIAVX512F-LABEL: test_bitreverse_v8i32: 1616; GFNIAVX512F: # %bb.0: 1617; GFNIAVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1618; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1619; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1620; GFNIAVX512F-NEXT: retq 1621; 1622; GFNIAVX512BW-LABEL: test_bitreverse_v8i32: 1623; GFNIAVX512BW: # %bb.0: 1624; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1625; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = 
[9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1626; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1627; GFNIAVX512BW-NEXT: retq 1628 %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) 1629 ret <8 x i32> %b 1630} 1631 1632define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { 1633; SSE2-LABEL: test_bitreverse_v4i64: 1634; SSE2: # %bb.0: 1635; SSE2-NEXT: pxor %xmm2, %xmm2 1636; SSE2-NEXT: movdqa %xmm0, %xmm3 1637; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 1638; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 1639; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 1640; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 1641; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1642; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1643; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1644; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1645; SSE2-NEXT: packuswb %xmm3, %xmm0 1646; SSE2-NEXT: movdqa %xmm0, %xmm4 1647; SSE2-NEXT: psrlw $4, %xmm4 1648; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1649; SSE2-NEXT: pand %xmm3, %xmm4 1650; SSE2-NEXT: pand %xmm3, %xmm0 1651; SSE2-NEXT: psllw $4, %xmm0 1652; SSE2-NEXT: por %xmm4, %xmm0 1653; SSE2-NEXT: movdqa %xmm0, %xmm5 1654; SSE2-NEXT: psrlw $2, %xmm5 1655; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1656; SSE2-NEXT: pand %xmm4, %xmm5 1657; SSE2-NEXT: pand %xmm4, %xmm0 1658; SSE2-NEXT: psllw $2, %xmm0 1659; SSE2-NEXT: por %xmm5, %xmm0 1660; SSE2-NEXT: movdqa %xmm0, %xmm6 1661; SSE2-NEXT: psrlw $1, %xmm6 1662; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1663; SSE2-NEXT: pand %xmm5, %xmm6 1664; SSE2-NEXT: pand %xmm5, %xmm0 1665; SSE2-NEXT: paddb %xmm0, %xmm0 1666; SSE2-NEXT: por %xmm6, %xmm0 1667; SSE2-NEXT: movdqa %xmm1, %xmm6 1668; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] 1669; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] 1670; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1671; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1672; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1673; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1674; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1675; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1676; SSE2-NEXT: packuswb %xmm6, %xmm1 1677; SSE2-NEXT: movdqa %xmm1, %xmm2 1678; SSE2-NEXT: psrlw $4, %xmm2 1679; SSE2-NEXT: pand %xmm3, %xmm2 1680; SSE2-NEXT: pand %xmm3, %xmm1 1681; SSE2-NEXT: psllw $4, %xmm1 1682; SSE2-NEXT: por %xmm2, %xmm1 1683; SSE2-NEXT: movdqa %xmm1, %xmm2 1684; SSE2-NEXT: psrlw $2, %xmm2 1685; SSE2-NEXT: pand %xmm4, %xmm2 1686; SSE2-NEXT: pand %xmm4, %xmm1 1687; SSE2-NEXT: psllw $2, %xmm1 1688; SSE2-NEXT: por %xmm2, %xmm1 1689; SSE2-NEXT: movdqa %xmm1, %xmm2 1690; SSE2-NEXT: psrlw $1, %xmm2 1691; SSE2-NEXT: pand %xmm5, %xmm2 1692; SSE2-NEXT: pand %xmm5, %xmm1 1693; SSE2-NEXT: paddb %xmm1, %xmm1 1694; SSE2-NEXT: por %xmm2, %xmm1 1695; SSE2-NEXT: 
retq 1696; 1697; SSSE3-LABEL: test_bitreverse_v4i64: 1698; SSSE3: # %bb.0: 1699; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1700; SSSE3-NEXT: pshufb %xmm4, %xmm0 1701; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1702; SSSE3-NEXT: movdqa %xmm0, %xmm2 1703; SSSE3-NEXT: pand %xmm5, %xmm2 1704; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1705; SSSE3-NEXT: movdqa %xmm6, %xmm7 1706; SSSE3-NEXT: pshufb %xmm2, %xmm7 1707; SSSE3-NEXT: psrlw $4, %xmm0 1708; SSSE3-NEXT: pand %xmm5, %xmm0 1709; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1710; SSSE3-NEXT: movdqa %xmm2, %xmm3 1711; SSSE3-NEXT: pshufb %xmm0, %xmm3 1712; SSSE3-NEXT: por %xmm7, %xmm3 1713; SSSE3-NEXT: pshufb %xmm4, %xmm1 1714; SSSE3-NEXT: movdqa %xmm1, %xmm0 1715; SSSE3-NEXT: pand %xmm5, %xmm0 1716; SSSE3-NEXT: pshufb %xmm0, %xmm6 1717; SSSE3-NEXT: psrlw $4, %xmm1 1718; SSSE3-NEXT: pand %xmm5, %xmm1 1719; SSSE3-NEXT: pshufb %xmm1, %xmm2 1720; SSSE3-NEXT: por %xmm6, %xmm2 1721; SSSE3-NEXT: movdqa %xmm3, %xmm0 1722; SSSE3-NEXT: movdqa %xmm2, %xmm1 1723; SSSE3-NEXT: retq 1724; 1725; AVX1-LABEL: test_bitreverse_v4i64: 1726; AVX1: # %bb.0: 1727; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1728; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1729; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1730; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1731; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1732; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1733; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1734; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1735; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1736; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1737; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1738; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1739; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1740; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1741; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1742; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1743; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1744; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1745; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1746; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1747; AVX1-NEXT: retq 1748; 1749; AVX2-LABEL: test_bitreverse_v4i64: 1750; AVX2: # %bb.0: 1751; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1752; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1753; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1754; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1755; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1756; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1757; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1758; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1759; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1760; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1761; AVX2-NEXT: retq 1762; 1763; AVX512-LABEL: test_bitreverse_v4i64: 1764; AVX512: # %bb.0: 1765; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1766; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1767; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1768; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1769; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1770; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1771; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1772; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1773; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1774; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1775; AVX512-NEXT: retq 1776; 1777; XOPAVX1-LABEL: test_bitreverse_v4i64: 1778; XOPAVX1: # %bb.0: 1779; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1780; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1781; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1782; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1783; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1784; XOPAVX1-NEXT: retq 1785; 1786; XOPAVX2-LABEL: test_bitreverse_v4i64: 1787; XOPAVX2: # %bb.0: 1788; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1789; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1790; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1791; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1792; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1793; XOPAVX2-NEXT: retq 1794; 1795; GFNISSE-LABEL: test_bitreverse_v4i64: 1796; GFNISSE: # %bb.0: 1797; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1798; GFNISSE-NEXT: pshufb %xmm2, %xmm0 1799; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1800; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 1801; GFNISSE-NEXT: pshufb %xmm2, %xmm1 1802; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 1803; GFNISSE-NEXT: retq 1804; 1805; GFNIAVX-LABEL: test_bitreverse_v4i64: 1806; GFNIAVX: # %bb.0: 1807; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1808; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1809; GFNIAVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1810; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1811; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1 1812; GFNIAVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1813; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0 1814; GFNIAVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1815; GFNIAVX-NEXT: retq 1816; 1817; GFNIAVX2-LABEL: test_bitreverse_v4i64: 1818; GFNIAVX2: # %bb.0: 1819; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1820; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1821; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1822; GFNIAVX2-NEXT: retq 1823; 1824; GFNIAVX512F-LABEL: test_bitreverse_v4i64: 1825; GFNIAVX512F: # %bb.0: 1826; GFNIAVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1827; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1828; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1829; GFNIAVX512F-NEXT: retq 1830; 1831; GFNIAVX512BW-LABEL: test_bitreverse_v4i64: 1832; GFNIAVX512BW: # %bb.0: 1833; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1834; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1835; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1836; GFNIAVX512BW-NEXT: retq 1837 %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) 1838 ret <4 x i64> %b 1839} 1840 1841define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { 1842; SSE2-LABEL: test_bitreverse_v64i8: 1843; SSE2: # %bb.0: 1844; SSE2-NEXT: movdqa %xmm0, %xmm5 1845; SSE2-NEXT: psrlw $4, %xmm5 1846; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1847; SSE2-NEXT: pand %xmm4, %xmm5 1848; SSE2-NEXT: pand %xmm4, %xmm0 1849; SSE2-NEXT: psllw $4, %xmm0 1850; SSE2-NEXT: por %xmm5, %xmm0 1851; SSE2-NEXT: movdqa %xmm0, %xmm6 1852; SSE2-NEXT: psrlw $2, %xmm6 1853; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1854; SSE2-NEXT: pand %xmm5, %xmm6 1855; SSE2-NEXT: pand %xmm5, %xmm0 1856; SSE2-NEXT: psllw $2, %xmm0 1857; SSE2-NEXT: por %xmm6, %xmm0 1858; SSE2-NEXT: movdqa %xmm0, %xmm7 1859; SSE2-NEXT: psrlw $1, %xmm7 1860; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1861; SSE2-NEXT: pand %xmm6, %xmm7 1862; SSE2-NEXT: pand %xmm6, %xmm0 1863; SSE2-NEXT: paddb %xmm0, %xmm0 1864; SSE2-NEXT: por %xmm7, %xmm0 1865; SSE2-NEXT: movdqa %xmm1, %xmm7 1866; SSE2-NEXT: psrlw $4, %xmm7 1867; SSE2-NEXT: pand %xmm4, %xmm7 1868; SSE2-NEXT: pand %xmm4, %xmm1 1869; SSE2-NEXT: psllw $4, %xmm1 1870; SSE2-NEXT: por %xmm7, %xmm1 1871; SSE2-NEXT: movdqa %xmm1, %xmm7 1872; SSE2-NEXT: psrlw $2, %xmm7 1873; SSE2-NEXT: pand %xmm5, %xmm7 1874; SSE2-NEXT: pand %xmm5, %xmm1 1875; SSE2-NEXT: psllw $2, %xmm1 1876; SSE2-NEXT: por %xmm7, %xmm1 1877; SSE2-NEXT: movdqa %xmm1, %xmm7 1878; SSE2-NEXT: psrlw $1, %xmm7 1879; SSE2-NEXT: pand %xmm6, %xmm7 1880; SSE2-NEXT: pand %xmm6, %xmm1 1881; SSE2-NEXT: paddb %xmm1, %xmm1 1882; SSE2-NEXT: por %xmm7, %xmm1 1883; SSE2-NEXT: movdqa %xmm2, %xmm7 1884; SSE2-NEXT: psrlw $4, %xmm7 1885; SSE2-NEXT: pand %xmm4, %xmm7 1886; SSE2-NEXT: pand %xmm4, %xmm2 1887; SSE2-NEXT: psllw $4, %xmm2 1888; SSE2-NEXT: por %xmm7, %xmm2 1889; SSE2-NEXT: movdqa %xmm2, %xmm7 1890; SSE2-NEXT: psrlw $2, %xmm7 1891; SSE2-NEXT: pand %xmm5, %xmm7 1892; SSE2-NEXT: pand %xmm5, %xmm2 1893; SSE2-NEXT: psllw $2, %xmm2 1894; SSE2-NEXT: por %xmm7, %xmm2 1895; SSE2-NEXT: movdqa %xmm2, %xmm7 1896; SSE2-NEXT: psrlw $1, %xmm7 1897; SSE2-NEXT: pand %xmm6, %xmm7 1898; SSE2-NEXT: pand %xmm6, %xmm2 1899; SSE2-NEXT: paddb %xmm2, %xmm2 1900; SSE2-NEXT: por %xmm7, %xmm2 1901; SSE2-NEXT: movdqa %xmm3, %xmm7 1902; SSE2-NEXT: psrlw $4, %xmm7 1903; SSE2-NEXT: pand %xmm4, %xmm7 1904; SSE2-NEXT: pand %xmm4, %xmm3 1905; SSE2-NEXT: psllw $4, %xmm3 1906; SSE2-NEXT: por %xmm7, %xmm3 1907; SSE2-NEXT: movdqa %xmm3, %xmm4 1908; SSE2-NEXT: psrlw $2, %xmm4 1909; SSE2-NEXT: pand %xmm5, %xmm4 1910; SSE2-NEXT: pand %xmm5, %xmm3 1911; SSE2-NEXT: psllw $2, %xmm3 1912; SSE2-NEXT: por %xmm4, %xmm3 1913; SSE2-NEXT: movdqa %xmm3, %xmm4 1914; SSE2-NEXT: psrlw $1, %xmm4 1915; SSE2-NEXT: pand %xmm6, %xmm4 1916; SSE2-NEXT: pand %xmm6, %xmm3 1917; SSE2-NEXT: paddb %xmm3, %xmm3 1918; SSE2-NEXT: por %xmm4, %xmm3 1919; SSE2-NEXT: retq 1920; 1921; SSSE3-LABEL: test_bitreverse_v64i8: 1922; SSSE3: # %bb.0: 1923; SSSE3-NEXT: movdqa %xmm0, %xmm5 1924; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1925; SSSE3-NEXT: 
pand %xmm8, %xmm0 1926; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1927; SSSE3-NEXT: movdqa %xmm9, %xmm6 1928; SSSE3-NEXT: pshufb %xmm0, %xmm6 1929; SSSE3-NEXT: psrlw $4, %xmm5 1930; SSSE3-NEXT: pand %xmm8, %xmm5 1931; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1932; SSSE3-NEXT: movdqa %xmm4, %xmm0 1933; SSSE3-NEXT: pshufb %xmm5, %xmm0 1934; SSSE3-NEXT: por %xmm6, %xmm0 1935; SSSE3-NEXT: movdqa %xmm1, %xmm5 1936; SSSE3-NEXT: pand %xmm8, %xmm5 1937; SSSE3-NEXT: movdqa %xmm9, %xmm6 1938; SSSE3-NEXT: pshufb %xmm5, %xmm6 1939; SSSE3-NEXT: psrlw $4, %xmm1 1940; SSSE3-NEXT: pand %xmm8, %xmm1 1941; SSSE3-NEXT: movdqa %xmm4, %xmm5 1942; SSSE3-NEXT: pshufb %xmm1, %xmm5 1943; SSSE3-NEXT: por %xmm6, %xmm5 1944; SSSE3-NEXT: movdqa %xmm2, %xmm1 1945; SSSE3-NEXT: pand %xmm8, %xmm1 1946; SSSE3-NEXT: movdqa %xmm9, %xmm7 1947; SSSE3-NEXT: pshufb %xmm1, %xmm7 1948; SSSE3-NEXT: psrlw $4, %xmm2 1949; SSSE3-NEXT: pand %xmm8, %xmm2 1950; SSSE3-NEXT: movdqa %xmm4, %xmm6 1951; SSSE3-NEXT: pshufb %xmm2, %xmm6 1952; SSSE3-NEXT: por %xmm7, %xmm6 1953; SSSE3-NEXT: movdqa %xmm3, %xmm1 1954; SSSE3-NEXT: pand %xmm8, %xmm1 1955; SSSE3-NEXT: pshufb %xmm1, %xmm9 1956; SSSE3-NEXT: psrlw $4, %xmm3 1957; SSSE3-NEXT: pand %xmm8, %xmm3 1958; SSSE3-NEXT: pshufb %xmm3, %xmm4 1959; SSSE3-NEXT: por %xmm9, %xmm4 1960; SSSE3-NEXT: movdqa %xmm5, %xmm1 1961; SSSE3-NEXT: movdqa %xmm6, %xmm2 1962; SSSE3-NEXT: movdqa %xmm4, %xmm3 1963; SSSE3-NEXT: retq 1964; 1965; AVX1-LABEL: test_bitreverse_v64i8: 1966; AVX1: # %bb.0: 1967; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1968; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1969; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 1970; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1971; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1972; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1973; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1974; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1975; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1976; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1977; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm4 1978; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1979; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1980; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1981; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1982; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 1983; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1984; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1985; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 1986; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1987; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1988; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1989; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1990; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1991; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1992; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1993; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1994; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1995; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1996; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1997; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1998; AVX1-NEXT: retq 1999; 2000; AVX2-LABEL: test_bitreverse_v64i8: 2001; AVX2: # %bb.0: 2002; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2003; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 2004; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2005; AVX2-NEXT: vpshufb %ymm3, %ymm4, 
%ymm3 2006; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2007; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2008; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2009; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 2010; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 2011; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 2012; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2013; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2014; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 2015; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1 2016; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 2017; AVX2-NEXT: retq 2018; 2019; AVX512F-LABEL: test_bitreverse_v64i8: 2020; AVX512F: # %bb.0: 2021; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2022; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2023; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 2024; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2025; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2026; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 2027; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm4 2028; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 2029; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2030; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 2031; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2032; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 2033; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2034; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 2035; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 2036; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2037; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0 2038; AVX512F-NEXT: retq 2039; 2040; AVX512BW-LABEL: test_bitreverse_v64i8: 2041; AVX512BW: # %bb.0: 2042; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2043; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2044; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2045; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2046; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2047; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2048; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2049; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2050; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2051; AVX512BW-NEXT: retq 2052; 2053; XOPAVX1-LABEL: test_bitreverse_v64i8: 2054; XOPAVX1: # %bb.0: 2055; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2056; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 2057; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2058; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2059; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2060; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2061; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2062; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2063; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2064; XOPAVX1-NEXT: retq 2065; 2066; XOPAVX2-LABEL: test_bitreverse_v64i8: 2067; 
XOPAVX2: # %bb.0: 2068; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2069; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 2070; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2071; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2072; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2073; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2074; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2075; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2076; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2077; XOPAVX2-NEXT: retq 2078; 2079; GFNISSE-LABEL: test_bitreverse_v64i8: 2080; GFNISSE: # %bb.0: 2081; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 2082; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 2083; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 2084; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 2085; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 2086; GFNISSE-NEXT: retq 2087; 2088; GFNIAVX-LABEL: test_bitreverse_v64i8: 2089; GFNIAVX: # %bb.0: 2090; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm2 2091; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 2092; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2 2093; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0 2094; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2095; GFNIAVX-NEXT: vextractf128 $1, %ymm1, %xmm2 2096; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2 2097; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1 2098; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2099; GFNIAVX-NEXT: retq 2100; 2101; GFNIAVX2-LABEL: test_bitreverse_v64i8: 2102; GFNIAVX2: # %bb.0: 2103; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2104; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 2105; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 2106; GFNIAVX2-NEXT: retq 2107; 2108; GFNIAVX512F-LABEL: test_bitreverse_v64i8: 2109; GFNIAVX512F: # %bb.0: 2110; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2111; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2112; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 2113; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 2114; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2115; GFNIAVX512F-NEXT: retq 2116; 2117; GFNIAVX512BW-LABEL: test_bitreverse_v64i8: 2118; GFNIAVX512BW: # %bb.0: 2119; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 2120; GFNIAVX512BW-NEXT: retq 2121 %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) 2122 ret <64 x i8> %b 2123} 2124 2125define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { 2126; SSE2-LABEL: test_bitreverse_v32i16: 2127; SSE2: # %bb.0: 2128; SSE2-NEXT: movdqa %xmm0, %xmm4 2129; SSE2-NEXT: psrlw $8, %xmm4 2130; SSE2-NEXT: psllw $8, %xmm0 2131; SSE2-NEXT: por %xmm4, %xmm0 2132; SSE2-NEXT: movdqa %xmm0, %xmm5 2133; SSE2-NEXT: psrlw $4, %xmm5 2134; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2135; SSE2-NEXT: pand %xmm4, %xmm5 2136; SSE2-NEXT: pand %xmm4, %xmm0 2137; SSE2-NEXT: psllw $4, %xmm0 2138; SSE2-NEXT: por %xmm5, %xmm0 2139; SSE2-NEXT: movdqa %xmm0, %xmm6 2140; SSE2-NEXT: psrlw $2, %xmm6 2141; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2142; SSE2-NEXT: pand %xmm5, %xmm6 2143; SSE2-NEXT: pand %xmm5, %xmm0 2144; SSE2-NEXT: psllw 
$2, %xmm0 2145; SSE2-NEXT: por %xmm6, %xmm0 2146; SSE2-NEXT: movdqa %xmm0, %xmm7 2147; SSE2-NEXT: psrlw $1, %xmm7 2148; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2149; SSE2-NEXT: pand %xmm6, %xmm7 2150; SSE2-NEXT: pand %xmm6, %xmm0 2151; SSE2-NEXT: paddb %xmm0, %xmm0 2152; SSE2-NEXT: por %xmm7, %xmm0 2153; SSE2-NEXT: movdqa %xmm1, %xmm7 2154; SSE2-NEXT: psrlw $8, %xmm7 2155; SSE2-NEXT: psllw $8, %xmm1 2156; SSE2-NEXT: por %xmm7, %xmm1 2157; SSE2-NEXT: movdqa %xmm1, %xmm7 2158; SSE2-NEXT: psrlw $4, %xmm7 2159; SSE2-NEXT: pand %xmm4, %xmm7 2160; SSE2-NEXT: pand %xmm4, %xmm1 2161; SSE2-NEXT: psllw $4, %xmm1 2162; SSE2-NEXT: por %xmm7, %xmm1 2163; SSE2-NEXT: movdqa %xmm1, %xmm7 2164; SSE2-NEXT: psrlw $2, %xmm7 2165; SSE2-NEXT: pand %xmm5, %xmm7 2166; SSE2-NEXT: pand %xmm5, %xmm1 2167; SSE2-NEXT: psllw $2, %xmm1 2168; SSE2-NEXT: por %xmm7, %xmm1 2169; SSE2-NEXT: movdqa %xmm1, %xmm7 2170; SSE2-NEXT: psrlw $1, %xmm7 2171; SSE2-NEXT: pand %xmm6, %xmm7 2172; SSE2-NEXT: pand %xmm6, %xmm1 2173; SSE2-NEXT: paddb %xmm1, %xmm1 2174; SSE2-NEXT: por %xmm7, %xmm1 2175; SSE2-NEXT: movdqa %xmm2, %xmm7 2176; SSE2-NEXT: psrlw $8, %xmm7 2177; SSE2-NEXT: psllw $8, %xmm2 2178; SSE2-NEXT: por %xmm7, %xmm2 2179; SSE2-NEXT: movdqa %xmm2, %xmm7 2180; SSE2-NEXT: psrlw $4, %xmm7 2181; SSE2-NEXT: pand %xmm4, %xmm7 2182; SSE2-NEXT: pand %xmm4, %xmm2 2183; SSE2-NEXT: psllw $4, %xmm2 2184; SSE2-NEXT: por %xmm7, %xmm2 2185; SSE2-NEXT: movdqa %xmm2, %xmm7 2186; SSE2-NEXT: psrlw $2, %xmm7 2187; SSE2-NEXT: pand %xmm5, %xmm7 2188; SSE2-NEXT: pand %xmm5, %xmm2 2189; SSE2-NEXT: psllw $2, %xmm2 2190; SSE2-NEXT: por %xmm7, %xmm2 2191; SSE2-NEXT: movdqa %xmm2, %xmm7 2192; SSE2-NEXT: psrlw $1, %xmm7 2193; SSE2-NEXT: pand %xmm6, %xmm7 2194; SSE2-NEXT: pand %xmm6, %xmm2 2195; SSE2-NEXT: paddb %xmm2, %xmm2 2196; SSE2-NEXT: por %xmm7, %xmm2 2197; SSE2-NEXT: movdqa %xmm3, %xmm7 2198; SSE2-NEXT: psrlw $8, %xmm7 2199; SSE2-NEXT: psllw $8, %xmm3 2200; SSE2-NEXT: por %xmm7, %xmm3 2201; SSE2-NEXT: movdqa %xmm3, %xmm7 2202; SSE2-NEXT: psrlw $4, %xmm7 2203; SSE2-NEXT: pand %xmm4, %xmm7 2204; SSE2-NEXT: pand %xmm4, %xmm3 2205; SSE2-NEXT: psllw $4, %xmm3 2206; SSE2-NEXT: por %xmm7, %xmm3 2207; SSE2-NEXT: movdqa %xmm3, %xmm4 2208; SSE2-NEXT: psrlw $2, %xmm4 2209; SSE2-NEXT: pand %xmm5, %xmm4 2210; SSE2-NEXT: pand %xmm5, %xmm3 2211; SSE2-NEXT: psllw $2, %xmm3 2212; SSE2-NEXT: por %xmm4, %xmm3 2213; SSE2-NEXT: movdqa %xmm3, %xmm4 2214; SSE2-NEXT: psrlw $1, %xmm4 2215; SSE2-NEXT: pand %xmm6, %xmm4 2216; SSE2-NEXT: pand %xmm6, %xmm3 2217; SSE2-NEXT: paddb %xmm3, %xmm3 2218; SSE2-NEXT: por %xmm4, %xmm3 2219; SSE2-NEXT: retq 2220; 2221; SSSE3-LABEL: test_bitreverse_v32i16: 2222; SSSE3: # %bb.0: 2223; SSSE3-NEXT: movdqa %xmm1, %xmm5 2224; SSSE3-NEXT: movdqa %xmm0, %xmm1 2225; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2226; SSSE3-NEXT: pshufb %xmm8, %xmm1 2227; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2228; SSSE3-NEXT: movdqa %xmm1, %xmm0 2229; SSSE3-NEXT: pand %xmm9, %xmm0 2230; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2231; SSSE3-NEXT: movdqa %xmm7, %xmm6 2232; SSSE3-NEXT: pshufb %xmm0, %xmm6 2233; SSSE3-NEXT: psrlw $4, %xmm1 2234; SSSE3-NEXT: pand %xmm9, %xmm1 2235; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2236; SSSE3-NEXT: movdqa %xmm4, %xmm0 2237; SSSE3-NEXT: pshufb %xmm1, %xmm0 2238; SSSE3-NEXT: por %xmm6, %xmm0 2239; SSSE3-NEXT: pshufb %xmm8, %xmm5 
2240; SSSE3-NEXT: movdqa %xmm5, %xmm1 2241; SSSE3-NEXT: pand %xmm9, %xmm1 2242; SSSE3-NEXT: movdqa %xmm7, %xmm6 2243; SSSE3-NEXT: pshufb %xmm1, %xmm6 2244; SSSE3-NEXT: psrlw $4, %xmm5 2245; SSSE3-NEXT: pand %xmm9, %xmm5 2246; SSSE3-NEXT: movdqa %xmm4, %xmm1 2247; SSSE3-NEXT: pshufb %xmm5, %xmm1 2248; SSSE3-NEXT: por %xmm6, %xmm1 2249; SSSE3-NEXT: pshufb %xmm8, %xmm2 2250; SSSE3-NEXT: movdqa %xmm2, %xmm5 2251; SSSE3-NEXT: pand %xmm9, %xmm5 2252; SSSE3-NEXT: movdqa %xmm7, %xmm6 2253; SSSE3-NEXT: pshufb %xmm5, %xmm6 2254; SSSE3-NEXT: psrlw $4, %xmm2 2255; SSSE3-NEXT: pand %xmm9, %xmm2 2256; SSSE3-NEXT: movdqa %xmm4, %xmm5 2257; SSSE3-NEXT: pshufb %xmm2, %xmm5 2258; SSSE3-NEXT: por %xmm6, %xmm5 2259; SSSE3-NEXT: pshufb %xmm8, %xmm3 2260; SSSE3-NEXT: movdqa %xmm3, %xmm2 2261; SSSE3-NEXT: pand %xmm9, %xmm2 2262; SSSE3-NEXT: pshufb %xmm2, %xmm7 2263; SSSE3-NEXT: psrlw $4, %xmm3 2264; SSSE3-NEXT: pand %xmm9, %xmm3 2265; SSSE3-NEXT: pshufb %xmm3, %xmm4 2266; SSSE3-NEXT: por %xmm7, %xmm4 2267; SSSE3-NEXT: movdqa %xmm5, %xmm2 2268; SSSE3-NEXT: movdqa %xmm4, %xmm3 2269; SSSE3-NEXT: retq 2270; 2271; AVX1-LABEL: test_bitreverse_v32i16: 2272; AVX1: # %bb.0: 2273; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2274; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2275; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2276; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2277; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2278; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2279; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2280; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2281; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2282; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2283; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2284; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2285; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2286; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2287; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2288; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2289; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2290; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2291; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2292; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2293; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2294; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2295; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2296; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2297; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2298; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2299; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2300; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2301; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2302; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2303; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2304; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2305; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2306; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2307; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2308; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2309; AVX1-NEXT: retq 2310; 2311; AVX2-LABEL: test_bitreverse_v32i16: 2312; AVX2: # %bb.0: 2313; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2314; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2315; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2316; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2317; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2318; AVX2-NEXT: vpshufb %ymm4, 
%ymm5, %ymm4 2319; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2320; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2321; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2322; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2323; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2324; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2325; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2326; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2327; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2328; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2329; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2330; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2331; AVX2-NEXT: retq 2332; 2333; AVX512F-LABEL: test_bitreverse_v32i16: 2334; AVX512F: # %bb.0: 2335; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2336; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2337; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2338; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2339; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 2340; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2341; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2342; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2343; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 2344; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2345; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 2346; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2347; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 2348; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2349; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 2350; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2351; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 2352; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 2353; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2354; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 2355; AVX512F-NEXT: retq 2356; 2357; AVX512BW-LABEL: test_bitreverse_v32i16: 2358; AVX512BW: # %bb.0: 2359; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 2360; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2361; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2362; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2363; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2364; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2365; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2366; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2367; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2368; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2369; AVX512BW-NEXT: retq 2370; 2371; XOPAVX1-LABEL: test_bitreverse_v32i16: 2372; XOPAVX1: # %bb.0: 2373; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2374; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 2375; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2376; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2377; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2378; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2379; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2380; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2381; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2382; XOPAVX1-NEXT: retq 2383; 2384; XOPAVX2-LABEL: test_bitreverse_v32i16: 2385; XOPAVX2: # %bb.0: 2386; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2387; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 2388; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2389; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2390; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2391; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2392; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2393; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2394; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2395; XOPAVX2-NEXT: retq 2396; 2397; GFNISSE-LABEL: test_bitreverse_v32i16: 2398; GFNISSE: # %bb.0: 2399; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2400; GFNISSE-NEXT: pshufb %xmm4, %xmm0 2401; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] 2402; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 2403; GFNISSE-NEXT: pshufb %xmm4, %xmm1 2404; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 2405; GFNISSE-NEXT: pshufb %xmm4, %xmm2 2406; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 2407; GFNISSE-NEXT: pshufb %xmm4, %xmm3 2408; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 2409; GFNISSE-NEXT: retq 2410; 2411; GFNIAVX-LABEL: test_bitreverse_v32i16: 2412; GFNIAVX: # %bb.0: 2413; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm2 2414; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2415; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2416; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 2417; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2418; GFNIAVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2419; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0 2420; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2421; GFNIAVX-NEXT: vextractf128 $1, %ymm1, %xmm2 2422; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2423; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2424; GFNIAVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2425; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1 2426; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2427; GFNIAVX-NEXT: retq 2428; 2429; GFNIAVX2-LABEL: test_bitreverse_v32i16: 2430; GFNIAVX2: # %bb.0: 2431; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2432; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2433; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2434; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2435; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2436; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2437; GFNIAVX2-NEXT: retq 2438; 2439; GFNIAVX512F-LABEL: test_bitreverse_v32i16: 2440; GFNIAVX512F: # %bb.0: 2441; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2442; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2443; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2444; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} 
ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2445; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2446; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2447; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2448; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2449; GFNIAVX512F-NEXT: retq 2450; 2451; GFNIAVX512BW-LABEL: test_bitreverse_v32i16: 2452; GFNIAVX512BW: # %bb.0: 2453; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 2454; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 2455; GFNIAVX512BW-NEXT: retq 2456 %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) 2457 ret <32 x i16> %b 2458} 2459 2460define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { 2461; SSE2-LABEL: test_bitreverse_v16i32: 2462; SSE2: # %bb.0: 2463; SSE2-NEXT: pxor %xmm8, %xmm8 2464; SSE2-NEXT: movdqa %xmm0, %xmm5 2465; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] 2466; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] 2467; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] 2468; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] 2469; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 2470; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 2471; SSE2-NEXT: packuswb %xmm5, %xmm0 2472; SSE2-NEXT: movdqa %xmm0, %xmm6 2473; SSE2-NEXT: psrlw $4, %xmm6 2474; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2475; SSE2-NEXT: pand %xmm5, %xmm6 2476; SSE2-NEXT: pand %xmm5, %xmm0 2477; SSE2-NEXT: psllw $4, %xmm0 2478; SSE2-NEXT: por %xmm6, %xmm0 2479; SSE2-NEXT: movdqa %xmm0, %xmm7 2480; SSE2-NEXT: psrlw $2, %xmm7 2481; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2482; SSE2-NEXT: pand %xmm6, %xmm7 2483; SSE2-NEXT: pand %xmm6, %xmm0 2484; SSE2-NEXT: psllw $2, %xmm0 2485; SSE2-NEXT: por %xmm7, %xmm0 2486; SSE2-NEXT: movdqa %xmm0, %xmm4 2487; SSE2-NEXT: psrlw $1, %xmm4 2488; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2489; SSE2-NEXT: pand %xmm7, %xmm4 2490; SSE2-NEXT: pand %xmm7, %xmm0 2491; SSE2-NEXT: paddb %xmm0, %xmm0 2492; SSE2-NEXT: por %xmm4, %xmm0 2493; SSE2-NEXT: movdqa %xmm1, %xmm4 2494; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] 2495; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2496; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2497; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 2498; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2499; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2500; SSE2-NEXT: packuswb %xmm4, %xmm1 2501; SSE2-NEXT: movdqa %xmm1, %xmm4 2502; SSE2-NEXT: psrlw $4, %xmm4 2503; SSE2-NEXT: pand %xmm5, %xmm4 2504; SSE2-NEXT: pand %xmm5, %xmm1 2505; SSE2-NEXT: psllw $4, %xmm1 2506; SSE2-NEXT: por %xmm4, %xmm1 2507; SSE2-NEXT: 
movdqa %xmm1, %xmm4 2508; SSE2-NEXT: psrlw $2, %xmm4 2509; SSE2-NEXT: pand %xmm6, %xmm4 2510; SSE2-NEXT: pand %xmm6, %xmm1 2511; SSE2-NEXT: psllw $2, %xmm1 2512; SSE2-NEXT: por %xmm4, %xmm1 2513; SSE2-NEXT: movdqa %xmm1, %xmm4 2514; SSE2-NEXT: psrlw $1, %xmm4 2515; SSE2-NEXT: pand %xmm7, %xmm4 2516; SSE2-NEXT: pand %xmm7, %xmm1 2517; SSE2-NEXT: paddb %xmm1, %xmm1 2518; SSE2-NEXT: por %xmm4, %xmm1 2519; SSE2-NEXT: movdqa %xmm2, %xmm4 2520; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] 2521; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2522; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2523; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] 2524; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 2525; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 2526; SSE2-NEXT: packuswb %xmm4, %xmm2 2527; SSE2-NEXT: movdqa %xmm2, %xmm4 2528; SSE2-NEXT: psrlw $4, %xmm4 2529; SSE2-NEXT: pand %xmm5, %xmm4 2530; SSE2-NEXT: pand %xmm5, %xmm2 2531; SSE2-NEXT: psllw $4, %xmm2 2532; SSE2-NEXT: por %xmm4, %xmm2 2533; SSE2-NEXT: movdqa %xmm2, %xmm4 2534; SSE2-NEXT: psrlw $2, %xmm4 2535; SSE2-NEXT: pand %xmm6, %xmm4 2536; SSE2-NEXT: pand %xmm6, %xmm2 2537; SSE2-NEXT: psllw $2, %xmm2 2538; SSE2-NEXT: por %xmm4, %xmm2 2539; SSE2-NEXT: movdqa %xmm2, %xmm4 2540; SSE2-NEXT: psrlw $1, %xmm4 2541; SSE2-NEXT: pand %xmm7, %xmm4 2542; SSE2-NEXT: pand %xmm7, %xmm2 2543; SSE2-NEXT: paddb %xmm2, %xmm2 2544; SSE2-NEXT: por %xmm4, %xmm2 2545; SSE2-NEXT: movdqa %xmm3, %xmm4 2546; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] 2547; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2548; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2549; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] 2550; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 2551; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 2552; SSE2-NEXT: packuswb %xmm4, %xmm3 2553; SSE2-NEXT: movdqa %xmm3, %xmm4 2554; SSE2-NEXT: psrlw $4, %xmm4 2555; SSE2-NEXT: pand %xmm5, %xmm4 2556; SSE2-NEXT: pand %xmm5, %xmm3 2557; SSE2-NEXT: psllw $4, %xmm3 2558; SSE2-NEXT: por %xmm4, %xmm3 2559; SSE2-NEXT: movdqa %xmm3, %xmm4 2560; SSE2-NEXT: psrlw $2, %xmm4 2561; SSE2-NEXT: pand %xmm6, %xmm4 2562; SSE2-NEXT: pand %xmm6, %xmm3 2563; SSE2-NEXT: psllw $2, %xmm3 2564; SSE2-NEXT: por %xmm4, %xmm3 2565; SSE2-NEXT: movdqa %xmm3, %xmm4 2566; SSE2-NEXT: psrlw $1, %xmm4 2567; SSE2-NEXT: pand %xmm7, %xmm4 2568; SSE2-NEXT: pand %xmm7, %xmm3 2569; SSE2-NEXT: paddb %xmm3, %xmm3 2570; SSE2-NEXT: por %xmm4, %xmm3 2571; SSE2-NEXT: retq 2572; 2573; SSSE3-LABEL: test_bitreverse_v16i32: 2574; SSSE3: # %bb.0: 2575; SSSE3-NEXT: movdqa %xmm1, %xmm5 2576; SSSE3-NEXT: movdqa %xmm0, %xmm1 2577; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2578; SSSE3-NEXT: pshufb %xmm8, %xmm1 2579; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2580; SSSE3-NEXT: movdqa %xmm1, %xmm0 2581; SSSE3-NEXT: pand %xmm9, %xmm0 2582; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2583; SSSE3-NEXT: movdqa %xmm7, %xmm6 2584; SSSE3-NEXT: pshufb %xmm0, %xmm6 2585; SSSE3-NEXT: psrlw $4, %xmm1 2586; SSSE3-NEXT: pand %xmm9, %xmm1 2587; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2588; SSSE3-NEXT: movdqa %xmm4, %xmm0 2589; SSSE3-NEXT: pshufb %xmm1, %xmm0 2590; SSSE3-NEXT: por %xmm6, %xmm0 2591; SSSE3-NEXT: pshufb %xmm8, %xmm5 2592; SSSE3-NEXT: movdqa %xmm5, %xmm1 2593; SSSE3-NEXT: pand %xmm9, %xmm1 2594; SSSE3-NEXT: movdqa %xmm7, %xmm6 2595; SSSE3-NEXT: pshufb %xmm1, %xmm6 2596; SSSE3-NEXT: psrlw $4, %xmm5 2597; SSSE3-NEXT: pand %xmm9, %xmm5 2598; SSSE3-NEXT: movdqa %xmm4, %xmm1 2599; SSSE3-NEXT: pshufb %xmm5, %xmm1 2600; SSSE3-NEXT: por %xmm6, %xmm1 2601; SSSE3-NEXT: pshufb %xmm8, %xmm2 2602; SSSE3-NEXT: movdqa %xmm2, %xmm5 2603; SSSE3-NEXT: pand %xmm9, %xmm5 2604; SSSE3-NEXT: movdqa %xmm7, %xmm6 2605; SSSE3-NEXT: pshufb %xmm5, %xmm6 2606; SSSE3-NEXT: psrlw $4, %xmm2 2607; SSSE3-NEXT: pand %xmm9, %xmm2 2608; SSSE3-NEXT: movdqa %xmm4, %xmm5 2609; SSSE3-NEXT: pshufb %xmm2, %xmm5 2610; SSSE3-NEXT: por %xmm6, %xmm5 2611; SSSE3-NEXT: pshufb %xmm8, %xmm3 2612; SSSE3-NEXT: movdqa %xmm3, %xmm2 2613; SSSE3-NEXT: pand %xmm9, %xmm2 2614; SSSE3-NEXT: pshufb %xmm2, %xmm7 2615; SSSE3-NEXT: psrlw $4, %xmm3 2616; SSSE3-NEXT: pand %xmm9, %xmm3 2617; SSSE3-NEXT: pshufb %xmm3, %xmm4 2618; SSSE3-NEXT: por %xmm7, %xmm4 2619; SSSE3-NEXT: movdqa %xmm5, %xmm2 2620; SSSE3-NEXT: movdqa %xmm4, %xmm3 2621; SSSE3-NEXT: retq 2622; 2623; AVX1-LABEL: test_bitreverse_v16i32: 2624; AVX1: # %bb.0: 2625; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2626; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2627; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2628; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2629; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2630; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2631; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2632; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2633; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2634; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2635; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2636; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2637; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2638; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2639; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2640; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2641; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2642; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2643; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2644; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2645; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2646; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2647; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2648; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2649; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2650; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2651; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2652; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2653; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2654; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2655; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2656; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2657; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2658; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2659; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2660; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2661; AVX1-NEXT: retq 2662; 2663; AVX2-LABEL: test_bitreverse_v16i32: 2664; AVX2: # %bb.0: 2665; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2666; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2667; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2668; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2669; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2670; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2671; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2672; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2673; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2674; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2675; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2676; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2677; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2678; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2679; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2680; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2681; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2682; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2683; AVX2-NEXT: retq 2684; 2685; AVX512F-LABEL: test_bitreverse_v16i32: 2686; AVX512F: # %bb.0: 2687; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2688; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2689; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2690; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2691; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 2692; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2693; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2694; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2695; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 2696; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2697; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 2698; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2699; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 2700; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2701; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 2702; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2703; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 2704; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 2705; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2706; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 2707; AVX512F-NEXT: retq 2708; 2709; AVX512BW-LABEL: test_bitreverse_v16i32: 2710; AVX512BW: # %bb.0: 2711; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 2712; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2713; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2714; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2715; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2716; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2717; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 
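; The SSSE3/AVX/AVX512 lowerings in this function all use the same two-LUT nibble
; scheme: bytes masked with 0x0F index the table (0,128,64,192,...) that holds each
; nibble bit-reversed into the high half, while the nibbles shifted down by 4 index
; the table loaded just below (0,8,4,12,...) that holds each nibble bit-reversed into
; the low half; OR-ing the two lookups gives the fully bit-reversed byte. For an
; arbitrary illustrative byte 0x2C: low nibble 0xC -> 0x30, high nibble 0x2 -> 0x04,
; OR = 0x34 = bitreverse(0x2C).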
2718; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2719; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2720; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2721; AVX512BW-NEXT: retq 2722; 2723; XOPAVX1-LABEL: test_bitreverse_v16i32: 2724; XOPAVX1: # %bb.0: 2725; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2726; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2727; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2728; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2729; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2730; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2731; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2732; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2733; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2734; XOPAVX1-NEXT: retq 2735; 2736; XOPAVX2-LABEL: test_bitreverse_v16i32: 2737; XOPAVX2: # %bb.0: 2738; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2739; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2740; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2741; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2742; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2743; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2744; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2745; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2746; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2747; XOPAVX2-NEXT: retq 2748; 2749; GFNISSE-LABEL: test_bitreverse_v16i32: 2750; GFNISSE: # %bb.0: 2751; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2752; GFNISSE-NEXT: pshufb %xmm4, %xmm0 2753; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] 2754; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 2755; GFNISSE-NEXT: pshufb %xmm4, %xmm1 2756; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 2757; GFNISSE-NEXT: pshufb %xmm4, %xmm2 2758; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 2759; GFNISSE-NEXT: pshufb %xmm4, %xmm3 2760; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 2761; GFNISSE-NEXT: retq 2762; 2763; GFNIAVX-LABEL: test_bitreverse_v16i32: 2764; GFNIAVX: # %bb.0: 2765; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm2 2766; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2767; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2768; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 2769; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2770; GFNIAVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2771; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0 2772; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2773; GFNIAVX-NEXT: vextractf128 $1, %ymm1, %xmm2 2774; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2775; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2776; GFNIAVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2777; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1 2778; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2779; GFNIAVX-NEXT: retq 2780; 2781; GFNIAVX2-LABEL: test_bitreverse_v16i32: 2782; GFNIAVX2: # %bb.0: 2783; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2784; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2785; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2786; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 
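; The GFNI lowerings rely on the broadcast quadword 9241421688590303745, which is
; 0x8040201008040201: an 8x8 GF(2) matrix whose bytes are the single-bit values
; 0x80,0x40,...,0x01. GF2P8AFFINEQB with this matrix and immediate 0 maps every byte
; to its bit-reversed value in one instruction, so the surrounding vpshufb only has
; to reverse the byte order within each element.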
2787; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2788; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2789; GFNIAVX2-NEXT: retq 2790; 2791; GFNIAVX512F-LABEL: test_bitreverse_v16i32: 2792; GFNIAVX512F: # %bb.0: 2793; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2794; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2795; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2796; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2797; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2798; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2799; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2800; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2801; GFNIAVX512F-NEXT: retq 2802; 2803; GFNIAVX512BW-LABEL: test_bitreverse_v16i32: 2804; GFNIAVX512BW: # %bb.0: 2805; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 2806; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 2807; GFNIAVX512BW-NEXT: retq 2808 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) 2809 ret <16 x i32> %b 2810} 2811 2812define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { 2813; SSE2-LABEL: test_bitreverse_v8i64: 2814; SSE2: # %bb.0: 2815; SSE2-NEXT: pxor %xmm8, %xmm8 2816; SSE2-NEXT: movdqa %xmm0, %xmm5 2817; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] 2818; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] 2819; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] 2820; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] 2821; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] 2822; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2823; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 2824; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 2825; SSE2-NEXT: packuswb %xmm5, %xmm0 2826; SSE2-NEXT: movdqa %xmm0, %xmm6 2827; SSE2-NEXT: psrlw $4, %xmm6 2828; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2829; SSE2-NEXT: pand %xmm5, %xmm6 2830; SSE2-NEXT: pand %xmm5, %xmm0 2831; SSE2-NEXT: psllw $4, %xmm0 2832; SSE2-NEXT: por %xmm6, %xmm0 2833; SSE2-NEXT: movdqa %xmm0, %xmm7 2834; SSE2-NEXT: psrlw $2, %xmm7 2835; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2836; SSE2-NEXT: pand %xmm6, %xmm7 2837; SSE2-NEXT: pand %xmm6, %xmm0 2838; SSE2-NEXT: psllw $2, %xmm0 2839; SSE2-NEXT: por %xmm7, %xmm0 2840; SSE2-NEXT: movdqa %xmm0, %xmm4 2841; SSE2-NEXT: psrlw $1, %xmm4 2842; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2843; SSE2-NEXT: pand %xmm7, %xmm4 2844; SSE2-NEXT: pand %xmm7, %xmm0 2845; SSE2-NEXT: paddb %xmm0, %xmm0 2846; SSE2-NEXT: por %xmm4, %xmm0 2847; SSE2-NEXT: movdqa %xmm1, %xmm4 2848; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] 2849; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 2850; SSE2-NEXT: pshuflw 
{{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2851; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2852; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 2853; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2854; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2855; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2856; SSE2-NEXT: packuswb %xmm4, %xmm1 2857; SSE2-NEXT: movdqa %xmm1, %xmm4 2858; SSE2-NEXT: psrlw $4, %xmm4 2859; SSE2-NEXT: pand %xmm5, %xmm4 2860; SSE2-NEXT: pand %xmm5, %xmm1 2861; SSE2-NEXT: psllw $4, %xmm1 2862; SSE2-NEXT: por %xmm4, %xmm1 2863; SSE2-NEXT: movdqa %xmm1, %xmm4 2864; SSE2-NEXT: psrlw $2, %xmm4 2865; SSE2-NEXT: pand %xmm6, %xmm4 2866; SSE2-NEXT: pand %xmm6, %xmm1 2867; SSE2-NEXT: psllw $2, %xmm1 2868; SSE2-NEXT: por %xmm4, %xmm1 2869; SSE2-NEXT: movdqa %xmm1, %xmm4 2870; SSE2-NEXT: psrlw $1, %xmm4 2871; SSE2-NEXT: pand %xmm7, %xmm4 2872; SSE2-NEXT: pand %xmm7, %xmm1 2873; SSE2-NEXT: paddb %xmm1, %xmm1 2874; SSE2-NEXT: por %xmm4, %xmm1 2875; SSE2-NEXT: movdqa %xmm2, %xmm4 2876; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] 2877; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 2878; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2879; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2880; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] 2881; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 2882; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 2883; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 2884; SSE2-NEXT: packuswb %xmm4, %xmm2 2885; SSE2-NEXT: movdqa %xmm2, %xmm4 2886; SSE2-NEXT: psrlw $4, %xmm4 2887; SSE2-NEXT: pand %xmm5, %xmm4 2888; SSE2-NEXT: pand %xmm5, %xmm2 2889; SSE2-NEXT: psllw $4, %xmm2 2890; SSE2-NEXT: por %xmm4, %xmm2 2891; SSE2-NEXT: movdqa %xmm2, %xmm4 2892; SSE2-NEXT: psrlw $2, %xmm4 2893; SSE2-NEXT: pand %xmm6, %xmm4 2894; SSE2-NEXT: pand %xmm6, %xmm2 2895; SSE2-NEXT: psllw $2, %xmm2 2896; SSE2-NEXT: por %xmm4, %xmm2 2897; SSE2-NEXT: movdqa %xmm2, %xmm4 2898; SSE2-NEXT: psrlw $1, %xmm4 2899; SSE2-NEXT: pand %xmm7, %xmm4 2900; SSE2-NEXT: pand %xmm7, %xmm2 2901; SSE2-NEXT: paddb %xmm2, %xmm2 2902; SSE2-NEXT: por %xmm4, %xmm2 2903; SSE2-NEXT: movdqa %xmm3, %xmm4 2904; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] 2905; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 2906; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2907; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2908; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] 2909; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 2910; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 2911; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 2912; SSE2-NEXT: packuswb %xmm4, %xmm3 2913; SSE2-NEXT: movdqa %xmm3, %xmm4 2914; SSE2-NEXT: psrlw $4, %xmm4 2915; SSE2-NEXT: pand %xmm5, %xmm4 2916; SSE2-NEXT: pand %xmm5, %xmm3 2917; SSE2-NEXT: psllw $4, %xmm3 2918; SSE2-NEXT: por %xmm4, %xmm3 2919; SSE2-NEXT: movdqa 
%xmm3, %xmm4 2920; SSE2-NEXT: psrlw $2, %xmm4 2921; SSE2-NEXT: pand %xmm6, %xmm4 2922; SSE2-NEXT: pand %xmm6, %xmm3 2923; SSE2-NEXT: psllw $2, %xmm3 2924; SSE2-NEXT: por %xmm4, %xmm3 2925; SSE2-NEXT: movdqa %xmm3, %xmm4 2926; SSE2-NEXT: psrlw $1, %xmm4 2927; SSE2-NEXT: pand %xmm7, %xmm4 2928; SSE2-NEXT: pand %xmm7, %xmm3 2929; SSE2-NEXT: paddb %xmm3, %xmm3 2930; SSE2-NEXT: por %xmm4, %xmm3 2931; SSE2-NEXT: retq 2932; 2933; SSSE3-LABEL: test_bitreverse_v8i64: 2934; SSSE3: # %bb.0: 2935; SSSE3-NEXT: movdqa %xmm1, %xmm5 2936; SSSE3-NEXT: movdqa %xmm0, %xmm1 2937; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2938; SSSE3-NEXT: pshufb %xmm8, %xmm1 2939; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2940; SSSE3-NEXT: movdqa %xmm1, %xmm0 2941; SSSE3-NEXT: pand %xmm9, %xmm0 2942; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2943; SSSE3-NEXT: movdqa %xmm7, %xmm6 2944; SSSE3-NEXT: pshufb %xmm0, %xmm6 2945; SSSE3-NEXT: psrlw $4, %xmm1 2946; SSSE3-NEXT: pand %xmm9, %xmm1 2947; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2948; SSSE3-NEXT: movdqa %xmm4, %xmm0 2949; SSSE3-NEXT: pshufb %xmm1, %xmm0 2950; SSSE3-NEXT: por %xmm6, %xmm0 2951; SSSE3-NEXT: pshufb %xmm8, %xmm5 2952; SSSE3-NEXT: movdqa %xmm5, %xmm1 2953; SSSE3-NEXT: pand %xmm9, %xmm1 2954; SSSE3-NEXT: movdqa %xmm7, %xmm6 2955; SSSE3-NEXT: pshufb %xmm1, %xmm6 2956; SSSE3-NEXT: psrlw $4, %xmm5 2957; SSSE3-NEXT: pand %xmm9, %xmm5 2958; SSSE3-NEXT: movdqa %xmm4, %xmm1 2959; SSSE3-NEXT: pshufb %xmm5, %xmm1 2960; SSSE3-NEXT: por %xmm6, %xmm1 2961; SSSE3-NEXT: pshufb %xmm8, %xmm2 2962; SSSE3-NEXT: movdqa %xmm2, %xmm5 2963; SSSE3-NEXT: pand %xmm9, %xmm5 2964; SSSE3-NEXT: movdqa %xmm7, %xmm6 2965; SSSE3-NEXT: pshufb %xmm5, %xmm6 2966; SSSE3-NEXT: psrlw $4, %xmm2 2967; SSSE3-NEXT: pand %xmm9, %xmm2 2968; SSSE3-NEXT: movdqa %xmm4, %xmm5 2969; SSSE3-NEXT: pshufb %xmm2, %xmm5 2970; SSSE3-NEXT: por %xmm6, %xmm5 2971; SSSE3-NEXT: pshufb %xmm8, %xmm3 2972; SSSE3-NEXT: movdqa %xmm3, %xmm2 2973; SSSE3-NEXT: pand %xmm9, %xmm2 2974; SSSE3-NEXT: pshufb %xmm2, %xmm7 2975; SSSE3-NEXT: psrlw $4, %xmm3 2976; SSSE3-NEXT: pand %xmm9, %xmm3 2977; SSSE3-NEXT: pshufb %xmm3, %xmm4 2978; SSSE3-NEXT: por %xmm7, %xmm4 2979; SSSE3-NEXT: movdqa %xmm5, %xmm2 2980; SSSE3-NEXT: movdqa %xmm4, %xmm3 2981; SSSE3-NEXT: retq 2982; 2983; AVX1-LABEL: test_bitreverse_v8i64: 2984; AVX1: # %bb.0: 2985; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2986; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2987; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2988; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2989; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2990; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2991; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2992; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2993; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2994; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2995; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2996; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2997; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2998; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2999; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3000; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 3001; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 3002; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 3003; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 3004; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3005; 
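; AVX1 has no 256-bit integer shuffles, so each ymm input is handled as two 128-bit
; halves: vextractf128 peels off the high half, both halves run the same
; byte-reversing vpshufb plus nibble-LUT sequence, and vinsertf128 stitches the
; results back together. The block below repeats the pattern for the second ymm
; register.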
AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3006; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3007; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 3008; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3009; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 3010; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 3011; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 3012; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 3013; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3014; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 3015; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 3016; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 3017; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 3018; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 3019; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 3020; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3021; AVX1-NEXT: retq 3022; 3023; AVX2-LABEL: test_bitreverse_v8i64: 3024; AVX2: # %bb.0: 3025; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3026; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3027; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3028; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 3029; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3030; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 3031; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 3032; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 3033; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3034; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 3035; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 3036; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3037; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 3038; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 3039; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 3040; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 3041; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 3042; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 3043; AVX2-NEXT: retq 3044; 3045; AVX512F-LABEL: test_bitreverse_v8i64: 3046; AVX512F: # %bb.0: 3047; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 3048; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3049; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3050; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3051; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 3052; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3053; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 3054; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3055; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 3056; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 3057; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 3058; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 3059; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 3060; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3061; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 3062; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 3063; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 3064; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 3065; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 3066; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 3067; AVX512F-NEXT: retq 3068; 3069; AVX512BW-LABEL: test_bitreverse_v8i64: 3070; AVX512BW: # %bb.0: 3071; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] 3072; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3073; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 3074; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3075; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 3076; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 3077; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 3078; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3079; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 3080; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 3081; AVX512BW-NEXT: retq 3082; 3083; XOPAVX1-LABEL: test_bitreverse_v8i64: 3084; XOPAVX1: # %bb.0: 3085; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3086; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 3087; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3088; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 3089; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3090; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3091; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3092; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 3093; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3094; XOPAVX1-NEXT: retq 3095; 3096; XOPAVX2-LABEL: test_bitreverse_v8i64: 3097; XOPAVX2: # %bb.0: 3098; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 3099; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 3100; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3101; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 3102; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 3103; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 3104; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3105; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 3106; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 3107; XOPAVX2-NEXT: retq 3108; 3109; GFNISSE-LABEL: test_bitreverse_v8i64: 3110; GFNISSE: # %bb.0: 3111; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3112; GFNISSE-NEXT: pshufb %xmm4, %xmm0 3113; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] 3114; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 3115; GFNISSE-NEXT: pshufb %xmm4, %xmm1 3116; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 3117; GFNISSE-NEXT: pshufb %xmm4, %xmm2 3118; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 3119; GFNISSE-NEXT: pshufb %xmm4, %xmm3 3120; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 3121; GFNISSE-NEXT: retq 3122; 3123; GFNIAVX-LABEL: test_bitreverse_v8i64: 3124; GFNIAVX: # %bb.0: 3125; GFNIAVX-NEXT: vextractf128 $1, %ymm0, %xmm2 3126; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3127; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3128; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 3129; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 3130; GFNIAVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 3131; 
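; For <8 x i64> the swizzle constant is [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]:
; vpshufb reverses the eight bytes inside each 64-bit lane, and the vgf2p8affineqb
; that follows reverses the bits inside each byte, which together reverse all 64
; bits of every element.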
GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0 3132; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3133; GFNIAVX-NEXT: vextractf128 $1, %ymm1, %xmm2 3134; GFNIAVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3135; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 3136; GFNIAVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3137; GFNIAVX-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1 3138; GFNIAVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3139; GFNIAVX-NEXT: retq 3140; 3141; GFNIAVX2-LABEL: test_bitreverse_v8i64: 3142; GFNIAVX2: # %bb.0: 3143; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3144; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3145; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 3146; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 3147; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3148; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 3149; GFNIAVX2-NEXT: retq 3150; 3151; GFNIAVX512F-LABEL: test_bitreverse_v8i64: 3152; GFNIAVX512F: # %bb.0: 3153; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 3154; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3155; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3156; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 3157; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 3158; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3159; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 3160; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 3161; GFNIAVX512F-NEXT: retq 3162; 3163; GFNIAVX512BW-LABEL: test_bitreverse_v8i64: 3164; GFNIAVX512BW: # %bb.0: 3165; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] 3166; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 3167; GFNIAVX512BW-NEXT: retq 3168 %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) 3169 ret <8 x i64> %b 3170} 3171 3172; 3173; Constant Folding 3174; 3175 3176define i32 @fold_bitreverse_i32() nounwind { 3177; ALL-LABEL: fold_bitreverse_i32: 3178; ALL: # %bb.0: 3179; ALL-NEXT: movl $16711935, %eax # imm = 0xFF00FF 3180; ALL-NEXT: retq 3181 %b = call i32 @llvm.bitreverse.i32(i32 4278255360) 3182 ret i32 %b 3183} 3184 3185define <16 x i8> @fold_bitreverse_v16i8() nounwind { 3186; SSE-LABEL: fold_bitreverse_v16i8: 3187; SSE: # %bb.0: 3188; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3189; SSE-NEXT: retq 3190; 3191; AVX-LABEL: fold_bitreverse_v16i8: 3192; AVX: # %bb.0: 3193; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3194; AVX-NEXT: retq 3195; 3196; XOP-LABEL: fold_bitreverse_v16i8: 3197; XOP: # %bb.0: 3198; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3199; XOP-NEXT: retq 3200; 3201; GFNISSE-LABEL: fold_bitreverse_v16i8: 3202; GFNISSE: # %bb.0: 3203; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3204; GFNISSE-NEXT: retq 3205; 3206; GFNIAVX-LABEL: fold_bitreverse_v16i8: 3207; GFNIAVX: # %bb.0: 3208; GFNIAVX-NEXT: vmovaps {{.*#+}} xmm0 = 
[0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3209; GFNIAVX-NEXT: retq 3210; 3211; GFNIAVX2-LABEL: fold_bitreverse_v16i8: 3212; GFNIAVX2: # %bb.0: 3213; GFNIAVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3214; GFNIAVX2-NEXT: retq 3215; 3216; GFNIAVX512F-LABEL: fold_bitreverse_v16i8: 3217; GFNIAVX512F: # %bb.0: 3218; GFNIAVX512F-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3219; GFNIAVX512F-NEXT: retq 3220; 3221; GFNIAVX512BW-LABEL: fold_bitreverse_v16i8: 3222; GFNIAVX512BW: # %bb.0: 3223; GFNIAVX512BW-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 3224; GFNIAVX512BW-NEXT: retq 3225 %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>) 3226 ret <16 x i8> %b 3227} 3228 3229define <16 x i16> @fold_bitreverse_v16i16() nounwind { 3230; SSE-LABEL: fold_bitreverse_v16i16: 3231; SSE: # %bb.0: 3232; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959] 3233; SSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863] 3234; SSE-NEXT: retq 3235; 3236; AVX-LABEL: fold_bitreverse_v16i16: 3237; AVX: # %bb.0: 3238; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 3239; AVX-NEXT: retq 3240; 3241; XOP-LABEL: fold_bitreverse_v16i16: 3242; XOP: # %bb.0: 3243; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 3244; XOP-NEXT: retq 3245; 3246; GFNISSE-LABEL: fold_bitreverse_v16i16: 3247; GFNISSE: # %bb.0: 3248; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959] 3249; GFNISSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863] 3250; GFNISSE-NEXT: retq 3251; 3252; GFNIAVX-LABEL: fold_bitreverse_v16i16: 3253; GFNIAVX: # %bb.0: 3254; GFNIAVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 3255; GFNIAVX-NEXT: retq 3256; 3257; GFNIAVX2-LABEL: fold_bitreverse_v16i16: 3258; GFNIAVX2: # %bb.0: 3259; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 3260; GFNIAVX2-NEXT: retq 3261; 3262; GFNIAVX512F-LABEL: fold_bitreverse_v16i16: 3263; GFNIAVX512F: # %bb.0: 3264; GFNIAVX512F-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 3265; GFNIAVX512F-NEXT: retq 3266; 3267; GFNIAVX512BW-LABEL: fold_bitreverse_v16i16: 3268; GFNIAVX512BW: # %bb.0: 3269; GFNIAVX512BW-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 3270; GFNIAVX512BW-NEXT: retq 3271 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>) 3272 ret <16 x i16> %b 3273} 3274 3275define <16 x i32> @fold_bitreverse_v16i32() nounwind { 3276; SSE-LABEL: fold_bitreverse_v16i32: 3277; SSE: # %bb.0: 3278; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471] 3279; SSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559] 3280; SSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015] 3281; SSE-NEXT: 
movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103] 3282; SSE-NEXT: retq 3283; 3284; AVX1-LABEL: fold_bitreverse_v16i32: 3285; AVX1: # %bb.0: 3286; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 3287; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3288; AVX1-NEXT: retq 3289; 3290; AVX2-LABEL: fold_bitreverse_v16i32: 3291; AVX2: # %bb.0: 3292; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 3293; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3294; AVX2-NEXT: retq 3295; 3296; AVX512-LABEL: fold_bitreverse_v16i32: 3297; AVX512: # %bb.0: 3298; AVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3299; AVX512-NEXT: retq 3300; 3301; XOP-LABEL: fold_bitreverse_v16i32: 3302; XOP: # %bb.0: 3303; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 3304; XOP-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3305; XOP-NEXT: retq 3306; 3307; GFNISSE-LABEL: fold_bitreverse_v16i32: 3308; GFNISSE: # %bb.0: 3309; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471] 3310; GFNISSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559] 3311; GFNISSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015] 3312; GFNISSE-NEXT: movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103] 3313; GFNISSE-NEXT: retq 3314; 3315; GFNIAVX-LABEL: fold_bitreverse_v16i32: 3316; GFNIAVX: # %bb.0: 3317; GFNIAVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 3318; GFNIAVX-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3319; GFNIAVX-NEXT: retq 3320; 3321; GFNIAVX2-LABEL: fold_bitreverse_v16i32: 3322; GFNIAVX2: # %bb.0: 3323; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 3324; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3325; GFNIAVX2-NEXT: retq 3326; 3327; GFNIAVX512F-LABEL: fold_bitreverse_v16i32: 3328; GFNIAVX512F: # %bb.0: 3329; GFNIAVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3330; GFNIAVX512F-NEXT: retq 3331; 3332; GFNIAVX512BW-LABEL: fold_bitreverse_v16i32: 3333; GFNIAVX512BW: # %bb.0: 3334; GFNIAVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 3335; GFNIAVX512BW-NEXT: retq 3336 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>) 3337 ret <16 x i32> %b 3338} 3339 3340declare i8 @llvm.bitreverse.i8(i8) readnone 3341declare i16 @llvm.bitreverse.i16(i16) readnone 
declare i32 @llvm.bitreverse.i32(i32) readnone
declare i64 @llvm.bitreverse.i64(i64) readnone

declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone

declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone

declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone
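; The constant-folding tests above never reach the vector lowerings: the calls are
; evaluated at compile time and only the folded constants are materialized. As an
; illustrative check of the scalar case, bitreverse(i32 4278255360) reverses
;   11111111 00000000 11111111 00000000  (0xFF00FF00)
; into
;   00000000 11111111 00000000 11111111  (0x00FF00FF = 16711935),
; which is exactly the immediate loaded in fold_bitreverse_i32.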