; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX512,GFNIAVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX512,GFNIAVX512BW

; Make sure we don't crash with avx512bw and xop
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw

define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE: # %bb.0:
; SSE-NEXT: rolb $4, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andb $51, %al
; SSE-NEXT: shlb $2, %al
; SSE-NEXT: shrb $2, %dil
; SSE-NEXT: andb $51, %dil
; SSE-NEXT: orb %dil, %al
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andb $85, %cl
; SSE-NEXT: addb %cl, %cl
; SSE-NEXT: shrb %al
; SSE-NEXT: andb $85, %al
; SSE-NEXT: orb %cl, %al
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX: # %bb.0:
; AVX-NEXT: rolb $4, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andb $51, %al
; AVX-NEXT: shlb $2, %al
; AVX-NEXT: shrb $2, %dil
; AVX-NEXT: andb $51, %dil
; AVX-NEXT: orb %dil, %al
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andb $85, %cl
; AVX-NEXT: addb %cl, %cl
; AVX-NEXT: shrb %al
; AVX-NEXT: andb $85, %al
; AVX-NEXT: orb %cl, %al
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: # kill: def $al killed $al killed $eax
; XOP-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_i8:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: rolb $4, %dil
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andb $51, %al
; GFNISSE-NEXT: shlb $2, %al
; GFNISSE-NEXT: shrb $2, %dil
; GFNISSE-NEXT: andb $51, %dil
; GFNISSE-NEXT: orb %dil, %al
; GFNISSE-NEXT: movl %eax, %ecx
; GFNISSE-NEXT: andb $85, %cl
; GFNISSE-NEXT: addb %cl, %cl
; GFNISSE-NEXT: shrb %al
; GFNISSE-NEXT: andb $85, %al
; GFNISSE-NEXT: orb %cl, %al
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_i8:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: rolb $4, %dil
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andb $51, %al
; GFNIAVX-NEXT: shlb $2, %al
; GFNIAVX-NEXT: shrb $2, %dil
; GFNIAVX-NEXT: andb $51, %dil
; GFNIAVX-NEXT: orb %dil, %al
; GFNIAVX-NEXT: movl %eax, %ecx
; GFNIAVX-NEXT: andb $85, %cl
; GFNIAVX-NEXT: addb %cl, %cl
; GFNIAVX-NEXT: shrb %al
; GFNIAVX-NEXT: andb $85, %al
; GFNIAVX-NEXT: orb %cl, %al
; GFNIAVX-NEXT: retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}

define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: rolw $8, %di
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $3855, %eax # imm = 0xF0F
; SSE-NEXT: shll $4, %eax
; SSE-NEXT: shrl $4, %edi
; SSE-NEXT: andl $3855, %edi # imm = 0xF0F
; SSE-NEXT: orl %eax, %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $13107, %eax # imm = 0x3333
; SSE-NEXT: shrl $2, %edi
; SSE-NEXT: andl $13107, %edi # imm = 0x3333
; SSE-NEXT: leal (%rdi,%rax,4), %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $21845, %ecx # imm = 0x5555
; SSE-NEXT: shrl %eax
; SSE-NEXT: andl $21845, %eax # imm = 0x5555
; SSE-NEXT: leal (%rax,%rcx,2), %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $edi killed $edi def $rdi
; AVX-NEXT: rolw $8, %di
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $3855, %eax # imm = 0xF0F
; AVX-NEXT: shll $4, %eax
; AVX-NEXT: shrl $4, %edi
; AVX-NEXT: andl $3855, %edi # imm = 0xF0F
; AVX-NEXT: orl %eax, %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $13107, %eax # imm = 0x3333
; AVX-NEXT: shrl $2, %edi
; AVX-NEXT: andl $13107, %edi # imm = 0x3333
; AVX-NEXT: leal (%rdi,%rax,4), %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $21845, %ecx # imm = 0x5555
; AVX-NEXT: shrl %eax
; AVX-NEXT: andl $21845, %eax # imm = 0x5555
; AVX-NEXT: leal (%rax,%rcx,2), %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: # kill: def $ax killed $ax killed $eax
; XOP-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_i16:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi
; GFNISSE-NEXT: rolw $8, %di
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andl $3855, %eax # imm = 0xF0F
; GFNISSE-NEXT: shll $4, %eax
; GFNISSE-NEXT: shrl $4, %edi
; GFNISSE-NEXT: andl $3855, %edi # imm = 0xF0F
; GFNISSE-NEXT: orl %eax, %edi
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andl $13107, %eax # imm = 0x3333
; GFNISSE-NEXT: shrl $2, %edi
; GFNISSE-NEXT: andl $13107, %edi # imm = 0x3333
; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax
; GFNISSE-NEXT: movl %eax, %ecx
; GFNISSE-NEXT: andl $21845, %ecx # imm = 0x5555
; GFNISSE-NEXT: shrl %eax
; GFNISSE-NEXT: andl $21845, %eax # imm = 0x5555
; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax
; GFNISSE-NEXT: # kill: def $ax killed $ax killed $eax
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_i16:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX-NEXT: rolw $8, %di
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andl $3855, %eax # imm = 0xF0F
; GFNIAVX-NEXT: shll $4, %eax
; GFNIAVX-NEXT: shrl $4, %edi
; GFNIAVX-NEXT: andl $3855, %edi # imm = 0xF0F
; GFNIAVX-NEXT: orl %eax, %edi
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andl $13107, %eax # imm = 0x3333
; GFNIAVX-NEXT: shrl $2, %edi
; GFNIAVX-NEXT: andl $13107, %edi # imm = 0x3333
; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX-NEXT: movl %eax, %ecx
; GFNIAVX-NEXT: andl $21845, %ecx # imm = 0x5555
; GFNIAVX-NEXT: shrl %eax
; GFNIAVX-NEXT: andl $21845, %eax # imm = 0x5555
; GFNIAVX-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX-NEXT: # kill: def $ax killed $ax killed $eax
; GFNIAVX-NEXT: retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}

define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: bswapl %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; SSE-NEXT: shll $4, %eax
; SSE-NEXT: shrl $4, %edi
; SSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; SSE-NEXT: orl %eax, %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $858993459, %eax # imm = 0x33333333
; SSE-NEXT: shrl $2, %edi
; SSE-NEXT: andl $858993459, %edi # imm = 0x33333333
; SSE-NEXT: leal (%rdi,%rax,4), %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; SSE-NEXT: shrl %eax
; SSE-NEXT: andl $1431655765, %eax # imm = 0x55555555
; SSE-NEXT: leal (%rax,%rcx,2), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $edi killed $edi def $rdi
; AVX-NEXT: bswapl %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; AVX-NEXT: shll $4, %eax
; AVX-NEXT: shrl $4, %edi
; AVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; AVX-NEXT: orl %eax, %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $858993459, %eax # imm = 0x33333333
; AVX-NEXT: shrl $2, %edi
; AVX-NEXT: andl $858993459, %edi # imm = 0x33333333
; AVX-NEXT: leal (%rdi,%rax,4), %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; AVX-NEXT: shrl %eax
; AVX-NEXT: andl $1431655765, %eax # imm = 0x55555555
; AVX-NEXT: leal (%rax,%rcx,2), %eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i32:
; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_i32:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi
; GFNISSE-NEXT: bswapl %edi
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; GFNISSE-NEXT: shll $4, %eax
; GFNISSE-NEXT: shrl $4, %edi
; GFNISSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; GFNISSE-NEXT: orl %eax, %edi
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andl $858993459, %eax # imm = 0x33333333
; GFNISSE-NEXT: shrl $2, %edi
; GFNISSE-NEXT: andl $858993459, %edi # imm = 0x33333333
; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax
; GFNISSE-NEXT: movl %eax, %ecx
; GFNISSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; GFNISSE-NEXT: shrl %eax
; GFNISSE-NEXT: andl $1431655765, %eax # imm = 0x55555555
; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_i32:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX-NEXT: bswapl %edi
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; GFNIAVX-NEXT: shll $4, %eax
; GFNIAVX-NEXT: shrl $4, %edi
; GFNIAVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; GFNIAVX-NEXT: orl %eax, %edi
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andl $858993459, %eax # imm = 0x33333333
; GFNIAVX-NEXT: shrl $2, %edi
; GFNIAVX-NEXT: andl $858993459, %edi # imm = 0x33333333
; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax
; GFNIAVX-NEXT: movl %eax, %ecx
; GFNIAVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; GFNIAVX-NEXT: shrl %eax
; GFNIAVX-NEXT: andl $1431655765, %eax # imm = 0x55555555
; GFNIAVX-NEXT: leal (%rax,%rcx,2), %eax
; GFNIAVX-NEXT: retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}

define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE: # %bb.0:
; SSE-NEXT: bswapq %rdi
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: shrq $4, %rax
; SSE-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; SSE-NEXT: andq %rcx, %rax
; SSE-NEXT: andq %rcx, %rdi
; SSE-NEXT: shlq $4, %rdi
; SSE-NEXT: orq %rax, %rdi
; SSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: andq %rax, %rcx
; SSE-NEXT: shrq $2, %rdi
; SSE-NEXT: andq %rax, %rdi
; SSE-NEXT: leaq (%rdi,%rcx,4), %rax
; SSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE-NEXT: movq %rax, %rdx
; SSE-NEXT: andq %rcx, %rdx
; SSE-NEXT: shrq %rax
; SSE-NEXT: andq %rcx, %rax
; SSE-NEXT: leaq (%rax,%rdx,2), %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX: # %bb.0:
; AVX-NEXT: bswapq %rdi
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: shrq $4, %rax
; AVX-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; AVX-NEXT: andq %rcx, %rax
; AVX-NEXT: andq %rcx, %rdi
; AVX-NEXT: shlq $4, %rdi
; AVX-NEXT: orq %rax, %rdi
; AVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: andq %rax, %rcx
; AVX-NEXT: shrq $2, %rdi
; AVX-NEXT: andq %rax, %rdi
; AVX-NEXT: leaq (%rdi,%rcx,4), %rax
; AVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; AVX-NEXT: movq %rax, %rdx
; AVX-NEXT: andq %rcx, %rdx
; AVX-NEXT: shrq %rax
; AVX-NEXT: andq %rcx, %rax
; AVX-NEXT: leaq (%rax,%rdx,2), %rax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i64:
; XOP: # %bb.0:
; XOP-NEXT: vmovq %rdi, %xmm0
; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovq %xmm0, %rax
; XOP-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_i64:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: bswapq %rdi
; GFNISSE-NEXT: movq %rdi, %rax
; GFNISSE-NEXT: shrq $4, %rax
; GFNISSE-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; GFNISSE-NEXT: andq %rcx, %rax
; GFNISSE-NEXT: andq %rcx, %rdi
; GFNISSE-NEXT: shlq $4, %rdi
; GFNISSE-NEXT: orq %rax, %rdi
; GFNISSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNISSE-NEXT: movq %rdi, %rcx
; GFNISSE-NEXT: andq %rax, %rcx
; GFNISSE-NEXT: shrq $2, %rdi
; GFNISSE-NEXT: andq %rax, %rdi
; GFNISSE-NEXT: leaq (%rdi,%rcx,4), %rax
; GFNISSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNISSE-NEXT: movq %rax, %rdx
; GFNISSE-NEXT: andq %rcx, %rdx
; GFNISSE-NEXT: shrq %rax
; GFNISSE-NEXT: andq %rcx, %rax
; GFNISSE-NEXT: leaq (%rax,%rdx,2), %rax
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_i64:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: bswapq %rdi
; GFNIAVX-NEXT: movq %rdi, %rax
; GFNIAVX-NEXT: shrq $4, %rax
; GFNIAVX-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; GFNIAVX-NEXT: andq %rcx, %rax
; GFNIAVX-NEXT: andq %rcx, %rdi
; GFNIAVX-NEXT: shlq $4, %rdi
; GFNIAVX-NEXT: orq %rax, %rdi
; GFNIAVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; GFNIAVX-NEXT: movq %rdi, %rcx
; GFNIAVX-NEXT: andq %rax, %rcx
; GFNIAVX-NEXT: shrq $2, %rdi
; GFNIAVX-NEXT: andq %rax, %rdi
; GFNIAVX-NEXT: leaq (%rdi,%rcx,4), %rax
; GFNIAVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; GFNIAVX-NEXT: movq %rax, %rdx
; GFNIAVX-NEXT: andq %rcx, %rdx
; GFNIAVX-NEXT: shrq %rax
; GFNIAVX-NEXT: andq %rcx, %rax
; GFNIAVX-NEXT: leaq (%rax,%rdx,2), %rax
; GFNIAVX-NEXT: retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}

define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_v16i8:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_v16i8:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT: retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
  ret <16 x i8> %b
}

define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_v8i16:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_v8i16:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT: retq
  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  ret <8 x i16> %b
}

define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_v4i32:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_v4i32:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT: retq
  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
  ret <4 x i32> %b
}

define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_v2i64:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_v2i64:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT: retq
  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
  ret <2 x i64> %b
}

define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrlw $4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrlw $2, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrlw $1, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlw $4, %xmm5
; SSE2-NEXT: pand %xmm2, %xmm5
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: psllw $4, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $1, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v32i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pshufb %xmm2, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm6, %xmm3
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm5
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm5, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_v32i8:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0
; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: test_bitreverse_v32i8:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm2, %xmm1, %xmm1
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm2, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; GFNIAVX1-NEXT: retq
;
; GFNIAVX2-LABEL: test_bitreverse_v32i8:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512-LABEL: test_bitreverse_v32i8:
; GFNIAVX512: # %bb.0:
; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX512-NEXT: retq
  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
  ret <32 x i8> %b
}

define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrlw $4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrlw $2, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrlw $1, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlw $8, %xmm5
; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlw $4, %xmm5
; SSE2-NEXT: pand %xmm2, %xmm5
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: psllw $4, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $1, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_v16i16:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNISSE-NEXT: pshufb %xmm2, %xmm0
; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
; GFNISSE-NEXT: pshufb %xmm2, %xmm1
; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: test_bitreverse_v16i16:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; GFNIAVX1-NEXT: retq
;
; GFNIAVX2-LABEL: test_bitreverse_v16i16:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512-LABEL: test_bitreverse_v16i16:
; GFNIAVX512: # %bb.0:
; GFNIAVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX512-NEXT: retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
  ret <16 x i16> %b
}

define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrlw $4, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrlw $2, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: psrlw $1, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: psllw $4, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $1, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_v8i32:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNISSE-NEXT: pshufb %xmm2, %xmm0
; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
; GFNISSE-NEXT: pshufb %xmm2, %xmm1
; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: test_bitreverse_v8i32:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; GFNIAVX1-NEXT: retq
;
; GFNIAVX2-LABEL: test_bitreverse_v8i32:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512-LABEL: test_bitreverse_v8i32:
; GFNIAVX512: # %bb.0:
; GFNIAVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX512-NEXT: retq
  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
  ret <8 x i32> %b
}

define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrlw $4, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrlw $2, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: psrlw $1, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: psllw $4, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $1, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v4i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1382; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1383; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1384; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1385; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1386; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1387; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1388; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1389; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1390; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1391; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1392; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1393; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1394; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1395; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1396; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1397; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1398; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1399; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1400; AVX1-NEXT: retq 1401; 1402; AVX2-LABEL: test_bitreverse_v4i64: 1403; AVX2: # %bb.0: 1404; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1405; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1406; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1407; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1408; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1409; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1410; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1411; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1412; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1413; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1414; AVX2-NEXT: retq 1415; 1416; AVX512-LABEL: test_bitreverse_v4i64: 1417; AVX512: # %bb.0: 1418; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1419; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1420; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1421; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1422; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1423; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1424; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1425; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1426; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1427; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1428; AVX512-NEXT: retq 1429; 1430; XOPAVX1-LABEL: test_bitreverse_v4i64: 1431; XOPAVX1: # %bb.0: 1432; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1433; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1434; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1435; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1436; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1437; XOPAVX1-NEXT: retq 1438; 1439; XOPAVX2-LABEL: test_bitreverse_v4i64: 1440; XOPAVX2: # %bb.0: 1441; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1442; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1443; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1444; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 
1445; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1446; XOPAVX2-NEXT: retq 1447; 1448; GFNISSE-LABEL: test_bitreverse_v4i64: 1449; GFNISSE: # %bb.0: 1450; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1451; GFNISSE-NEXT: pshufb %xmm2, %xmm0 1452; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1453; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 1454; GFNISSE-NEXT: pshufb %xmm2, %xmm1 1455; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 1456; GFNISSE-NEXT: retq 1457; 1458; GFNIAVX1-LABEL: test_bitreverse_v4i64: 1459; GFNIAVX1: # %bb.0: 1460; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1461; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1462; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1463; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1464; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1 1465; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1466; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0 1467; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1468; GFNIAVX1-NEXT: retq 1469; 1470; GFNIAVX2-LABEL: test_bitreverse_v4i64: 1471; GFNIAVX2: # %bb.0: 1472; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1473; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1474; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1475; GFNIAVX2-NEXT: retq 1476; 1477; GFNIAVX512-LABEL: test_bitreverse_v4i64: 1478; GFNIAVX512: # %bb.0: 1479; GFNIAVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1480; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1481; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1482; GFNIAVX512-NEXT: retq 1483 %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) 1484 ret <4 x i64> %b 1485} 1486 1487define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { 1488; SSE2-LABEL: test_bitreverse_v64i8: 1489; SSE2: # %bb.0: 1490; SSE2-NEXT: movdqa %xmm0, %xmm5 1491; SSE2-NEXT: psrlw $4, %xmm5 1492; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1493; SSE2-NEXT: pand %xmm4, %xmm5 1494; SSE2-NEXT: pand %xmm4, %xmm0 1495; SSE2-NEXT: psllw $4, %xmm0 1496; SSE2-NEXT: por %xmm5, %xmm0 1497; SSE2-NEXT: movdqa %xmm0, %xmm6 1498; SSE2-NEXT: psrlw $2, %xmm6 1499; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1500; SSE2-NEXT: pand %xmm5, %xmm6 1501; SSE2-NEXT: pand %xmm5, %xmm0 1502; SSE2-NEXT: psllw $2, %xmm0 1503; SSE2-NEXT: por %xmm6, %xmm0 1504; SSE2-NEXT: movdqa %xmm0, %xmm7 1505; SSE2-NEXT: psrlw $1, %xmm7 1506; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1507; SSE2-NEXT: pand %xmm6, %xmm7 1508; SSE2-NEXT: pand %xmm6, %xmm0 1509; SSE2-NEXT: paddb %xmm0, %xmm0 1510; SSE2-NEXT: por %xmm7, %xmm0 1511; SSE2-NEXT: movdqa %xmm1, %xmm7 1512; SSE2-NEXT: psrlw $4, %xmm7 1513; SSE2-NEXT: pand %xmm4, %xmm7 1514; SSE2-NEXT: pand %xmm4, %xmm1 1515; SSE2-NEXT: psllw $4, %xmm1 1516; SSE2-NEXT: por %xmm7, %xmm1 1517; SSE2-NEXT: movdqa %xmm1, %xmm7 1518; SSE2-NEXT: psrlw $2, %xmm7 1519; SSE2-NEXT: pand %xmm5, %xmm7 1520; SSE2-NEXT: pand %xmm5, %xmm1 1521; SSE2-NEXT: psllw $2, %xmm1 1522; SSE2-NEXT: por %xmm7, %xmm1 1523; SSE2-NEXT: 
movdqa %xmm1, %xmm7 1524; SSE2-NEXT: psrlw $1, %xmm7 1525; SSE2-NEXT: pand %xmm6, %xmm7 1526; SSE2-NEXT: pand %xmm6, %xmm1 1527; SSE2-NEXT: paddb %xmm1, %xmm1 1528; SSE2-NEXT: por %xmm7, %xmm1 1529; SSE2-NEXT: movdqa %xmm2, %xmm7 1530; SSE2-NEXT: psrlw $4, %xmm7 1531; SSE2-NEXT: pand %xmm4, %xmm7 1532; SSE2-NEXT: pand %xmm4, %xmm2 1533; SSE2-NEXT: psllw $4, %xmm2 1534; SSE2-NEXT: por %xmm7, %xmm2 1535; SSE2-NEXT: movdqa %xmm2, %xmm7 1536; SSE2-NEXT: psrlw $2, %xmm7 1537; SSE2-NEXT: pand %xmm5, %xmm7 1538; SSE2-NEXT: pand %xmm5, %xmm2 1539; SSE2-NEXT: psllw $2, %xmm2 1540; SSE2-NEXT: por %xmm7, %xmm2 1541; SSE2-NEXT: movdqa %xmm2, %xmm7 1542; SSE2-NEXT: psrlw $1, %xmm7 1543; SSE2-NEXT: pand %xmm6, %xmm7 1544; SSE2-NEXT: pand %xmm6, %xmm2 1545; SSE2-NEXT: paddb %xmm2, %xmm2 1546; SSE2-NEXT: por %xmm7, %xmm2 1547; SSE2-NEXT: movdqa %xmm3, %xmm7 1548; SSE2-NEXT: psrlw $4, %xmm7 1549; SSE2-NEXT: pand %xmm4, %xmm7 1550; SSE2-NEXT: pand %xmm4, %xmm3 1551; SSE2-NEXT: psllw $4, %xmm3 1552; SSE2-NEXT: por %xmm7, %xmm3 1553; SSE2-NEXT: movdqa %xmm3, %xmm4 1554; SSE2-NEXT: psrlw $2, %xmm4 1555; SSE2-NEXT: pand %xmm5, %xmm4 1556; SSE2-NEXT: pand %xmm5, %xmm3 1557; SSE2-NEXT: psllw $2, %xmm3 1558; SSE2-NEXT: por %xmm4, %xmm3 1559; SSE2-NEXT: movdqa %xmm3, %xmm4 1560; SSE2-NEXT: psrlw $1, %xmm4 1561; SSE2-NEXT: pand %xmm6, %xmm4 1562; SSE2-NEXT: pand %xmm6, %xmm3 1563; SSE2-NEXT: paddb %xmm3, %xmm3 1564; SSE2-NEXT: por %xmm4, %xmm3 1565; SSE2-NEXT: retq 1566; 1567; SSSE3-LABEL: test_bitreverse_v64i8: 1568; SSSE3: # %bb.0: 1569; SSSE3-NEXT: movdqa %xmm0, %xmm5 1570; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1571; SSSE3-NEXT: pand %xmm8, %xmm0 1572; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1573; SSSE3-NEXT: movdqa %xmm9, %xmm6 1574; SSSE3-NEXT: pshufb %xmm0, %xmm6 1575; SSSE3-NEXT: psrlw $4, %xmm5 1576; SSSE3-NEXT: pand %xmm8, %xmm5 1577; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1578; SSSE3-NEXT: movdqa %xmm4, %xmm0 1579; SSSE3-NEXT: pshufb %xmm5, %xmm0 1580; SSSE3-NEXT: por %xmm6, %xmm0 1581; SSSE3-NEXT: movdqa %xmm1, %xmm5 1582; SSSE3-NEXT: pand %xmm8, %xmm5 1583; SSSE3-NEXT: movdqa %xmm9, %xmm6 1584; SSSE3-NEXT: pshufb %xmm5, %xmm6 1585; SSSE3-NEXT: psrlw $4, %xmm1 1586; SSSE3-NEXT: pand %xmm8, %xmm1 1587; SSSE3-NEXT: movdqa %xmm4, %xmm5 1588; SSSE3-NEXT: pshufb %xmm1, %xmm5 1589; SSSE3-NEXT: por %xmm6, %xmm5 1590; SSSE3-NEXT: movdqa %xmm2, %xmm1 1591; SSSE3-NEXT: pand %xmm8, %xmm1 1592; SSSE3-NEXT: movdqa %xmm9, %xmm7 1593; SSSE3-NEXT: pshufb %xmm1, %xmm7 1594; SSSE3-NEXT: psrlw $4, %xmm2 1595; SSSE3-NEXT: pand %xmm8, %xmm2 1596; SSSE3-NEXT: movdqa %xmm4, %xmm6 1597; SSSE3-NEXT: pshufb %xmm2, %xmm6 1598; SSSE3-NEXT: por %xmm7, %xmm6 1599; SSSE3-NEXT: movdqa %xmm3, %xmm1 1600; SSSE3-NEXT: pand %xmm8, %xmm1 1601; SSSE3-NEXT: pshufb %xmm1, %xmm9 1602; SSSE3-NEXT: psrlw $4, %xmm3 1603; SSSE3-NEXT: pand %xmm8, %xmm3 1604; SSSE3-NEXT: pshufb %xmm3, %xmm4 1605; SSSE3-NEXT: por %xmm9, %xmm4 1606; SSSE3-NEXT: movdqa %xmm5, %xmm1 1607; SSSE3-NEXT: movdqa %xmm6, %xmm2 1608; SSSE3-NEXT: movdqa %xmm4, %xmm3 1609; SSSE3-NEXT: retq 1610; 1611; AVX1-LABEL: test_bitreverse_v64i8: 1612; AVX1: # %bb.0: 1613; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1614; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1615; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 1616; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1617; AVX1-NEXT: 
vpshufb %xmm4, %xmm5, %xmm4 1618; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1619; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1620; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1621; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1622; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1623; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm4 1624; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1625; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1626; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1627; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1628; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 1629; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1630; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1631; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 1632; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1633; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1634; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1635; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1636; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1637; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1638; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1639; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1640; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1641; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1642; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1643; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1644; AVX1-NEXT: retq 1645; 1646; AVX2-LABEL: test_bitreverse_v64i8: 1647; AVX2: # %bb.0: 1648; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1649; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 1650; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1651; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1652; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1653; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1654; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1655; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 1656; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 1657; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 1658; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1659; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 1660; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1661; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1 1662; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 1663; AVX2-NEXT: retq 1664; 1665; AVX512F-LABEL: test_bitreverse_v64i8: 1666; AVX512F: # %bb.0: 1667; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1668; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1669; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 1670; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1671; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1672; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 1673; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm4 1674; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 1675; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 1676; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 1677; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1678; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 1679; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 1680; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 1681; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 1682; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1683; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0 1684; AVX512F-NEXT: retq 1685; 1686; AVX512BW-LABEL: test_bitreverse_v64i8: 1687; AVX512BW: # %bb.0: 1688; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1689; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 1690; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1691; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 1692; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 1693; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 1694; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1695; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 1696; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 1697; AVX512BW-NEXT: retq 1698; 1699; XOPAVX1-LABEL: test_bitreverse_v64i8: 1700; XOPAVX1: # %bb.0: 1701; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1702; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1703; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1704; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1705; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1706; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1707; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1708; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1709; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1710; XOPAVX1-NEXT: retq 1711; 1712; XOPAVX2-LABEL: test_bitreverse_v64i8: 1713; XOPAVX2: # %bb.0: 1714; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1715; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1716; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1717; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1718; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1719; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1720; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1721; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1722; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 1723; XOPAVX2-NEXT: retq 1724; 1725; GFNISSE-LABEL: test_bitreverse_v64i8: 1726; GFNISSE: # %bb.0: 1727; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 1728; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 1729; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 1730; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 1731; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 1732; GFNISSE-NEXT: retq 1733; 1734; GFNIAVX1-LABEL: test_bitreverse_v64i8: 1735; GFNIAVX1: # %bb.0: 1736; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1737; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] 1738; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2 1739; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm0 1740; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1741; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1742; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm2 1743; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm1, %xmm1 1744; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1745; GFNIAVX1-NEXT: retq 1746; 1747; GFNIAVX2-LABEL: test_bitreverse_v64i8: 1748; GFNIAVX2: # %bb.0: 1749; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1750; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 1751; 
GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 1752; GFNIAVX2-NEXT: retq 1753; 1754; GFNIAVX512F-LABEL: test_bitreverse_v64i8: 1755; GFNIAVX512F: # %bb.0: 1756; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1757; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 1758; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 1759; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 1760; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1761; GFNIAVX512F-NEXT: retq 1762; 1763; GFNIAVX512BW-LABEL: test_bitreverse_v64i8: 1764; GFNIAVX512BW: # %bb.0: 1765; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 1766; GFNIAVX512BW-NEXT: retq 1767 %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) 1768 ret <64 x i8> %b 1769} 1770 1771define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { 1772; SSE2-LABEL: test_bitreverse_v32i16: 1773; SSE2: # %bb.0: 1774; SSE2-NEXT: movdqa %xmm0, %xmm4 1775; SSE2-NEXT: psrlw $8, %xmm4 1776; SSE2-NEXT: psllw $8, %xmm0 1777; SSE2-NEXT: por %xmm4, %xmm0 1778; SSE2-NEXT: movdqa %xmm0, %xmm5 1779; SSE2-NEXT: psrlw $4, %xmm5 1780; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1781; SSE2-NEXT: pand %xmm4, %xmm5 1782; SSE2-NEXT: pand %xmm4, %xmm0 1783; SSE2-NEXT: psllw $4, %xmm0 1784; SSE2-NEXT: por %xmm5, %xmm0 1785; SSE2-NEXT: movdqa %xmm0, %xmm6 1786; SSE2-NEXT: psrlw $2, %xmm6 1787; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1788; SSE2-NEXT: pand %xmm5, %xmm6 1789; SSE2-NEXT: pand %xmm5, %xmm0 1790; SSE2-NEXT: psllw $2, %xmm0 1791; SSE2-NEXT: por %xmm6, %xmm0 1792; SSE2-NEXT: movdqa %xmm0, %xmm7 1793; SSE2-NEXT: psrlw $1, %xmm7 1794; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1795; SSE2-NEXT: pand %xmm6, %xmm7 1796; SSE2-NEXT: pand %xmm6, %xmm0 1797; SSE2-NEXT: paddb %xmm0, %xmm0 1798; SSE2-NEXT: por %xmm7, %xmm0 1799; SSE2-NEXT: movdqa %xmm1, %xmm7 1800; SSE2-NEXT: psrlw $8, %xmm7 1801; SSE2-NEXT: psllw $8, %xmm1 1802; SSE2-NEXT: por %xmm7, %xmm1 1803; SSE2-NEXT: movdqa %xmm1, %xmm7 1804; SSE2-NEXT: psrlw $4, %xmm7 1805; SSE2-NEXT: pand %xmm4, %xmm7 1806; SSE2-NEXT: pand %xmm4, %xmm1 1807; SSE2-NEXT: psllw $4, %xmm1 1808; SSE2-NEXT: por %xmm7, %xmm1 1809; SSE2-NEXT: movdqa %xmm1, %xmm7 1810; SSE2-NEXT: psrlw $2, %xmm7 1811; SSE2-NEXT: pand %xmm5, %xmm7 1812; SSE2-NEXT: pand %xmm5, %xmm1 1813; SSE2-NEXT: psllw $2, %xmm1 1814; SSE2-NEXT: por %xmm7, %xmm1 1815; SSE2-NEXT: movdqa %xmm1, %xmm7 1816; SSE2-NEXT: psrlw $1, %xmm7 1817; SSE2-NEXT: pand %xmm6, %xmm7 1818; SSE2-NEXT: pand %xmm6, %xmm1 1819; SSE2-NEXT: paddb %xmm1, %xmm1 1820; SSE2-NEXT: por %xmm7, %xmm1 1821; SSE2-NEXT: movdqa %xmm2, %xmm7 1822; SSE2-NEXT: psrlw $8, %xmm7 1823; SSE2-NEXT: psllw $8, %xmm2 1824; SSE2-NEXT: por %xmm7, %xmm2 1825; SSE2-NEXT: movdqa %xmm2, %xmm7 1826; SSE2-NEXT: psrlw $4, %xmm7 1827; SSE2-NEXT: pand %xmm4, %xmm7 1828; SSE2-NEXT: pand %xmm4, %xmm2 1829; SSE2-NEXT: psllw $4, %xmm2 1830; SSE2-NEXT: por %xmm7, %xmm2 1831; SSE2-NEXT: movdqa %xmm2, %xmm7 1832; SSE2-NEXT: psrlw $2, %xmm7 1833; SSE2-NEXT: pand %xmm5, %xmm7 1834; SSE2-NEXT: pand %xmm5, %xmm2 1835; SSE2-NEXT: psllw $2, %xmm2 1836; SSE2-NEXT: por %xmm7, %xmm2 1837; SSE2-NEXT: movdqa %xmm2, %xmm7 1838; SSE2-NEXT: psrlw $1, %xmm7 1839; SSE2-NEXT: pand %xmm6, %xmm7 1840; SSE2-NEXT: pand %xmm6, %xmm2 1841; SSE2-NEXT: paddb %xmm2, %xmm2 1842; SSE2-NEXT: por %xmm7, 
%xmm2 1843; SSE2-NEXT: movdqa %xmm3, %xmm7 1844; SSE2-NEXT: psrlw $8, %xmm7 1845; SSE2-NEXT: psllw $8, %xmm3 1846; SSE2-NEXT: por %xmm7, %xmm3 1847; SSE2-NEXT: movdqa %xmm3, %xmm7 1848; SSE2-NEXT: psrlw $4, %xmm7 1849; SSE2-NEXT: pand %xmm4, %xmm7 1850; SSE2-NEXT: pand %xmm4, %xmm3 1851; SSE2-NEXT: psllw $4, %xmm3 1852; SSE2-NEXT: por %xmm7, %xmm3 1853; SSE2-NEXT: movdqa %xmm3, %xmm4 1854; SSE2-NEXT: psrlw $2, %xmm4 1855; SSE2-NEXT: pand %xmm5, %xmm4 1856; SSE2-NEXT: pand %xmm5, %xmm3 1857; SSE2-NEXT: psllw $2, %xmm3 1858; SSE2-NEXT: por %xmm4, %xmm3 1859; SSE2-NEXT: movdqa %xmm3, %xmm4 1860; SSE2-NEXT: psrlw $1, %xmm4 1861; SSE2-NEXT: pand %xmm6, %xmm4 1862; SSE2-NEXT: pand %xmm6, %xmm3 1863; SSE2-NEXT: paddb %xmm3, %xmm3 1864; SSE2-NEXT: por %xmm4, %xmm3 1865; SSE2-NEXT: retq 1866; 1867; SSSE3-LABEL: test_bitreverse_v32i16: 1868; SSSE3: # %bb.0: 1869; SSSE3-NEXT: movdqa %xmm1, %xmm5 1870; SSSE3-NEXT: movdqa %xmm0, %xmm1 1871; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1872; SSSE3-NEXT: pshufb %xmm8, %xmm1 1873; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1874; SSSE3-NEXT: movdqa %xmm1, %xmm0 1875; SSSE3-NEXT: pand %xmm9, %xmm0 1876; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1877; SSSE3-NEXT: movdqa %xmm7, %xmm6 1878; SSSE3-NEXT: pshufb %xmm0, %xmm6 1879; SSSE3-NEXT: psrlw $4, %xmm1 1880; SSSE3-NEXT: pand %xmm9, %xmm1 1881; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1882; SSSE3-NEXT: movdqa %xmm4, %xmm0 1883; SSSE3-NEXT: pshufb %xmm1, %xmm0 1884; SSSE3-NEXT: por %xmm6, %xmm0 1885; SSSE3-NEXT: pshufb %xmm8, %xmm5 1886; SSSE3-NEXT: movdqa %xmm5, %xmm1 1887; SSSE3-NEXT: pand %xmm9, %xmm1 1888; SSSE3-NEXT: movdqa %xmm7, %xmm6 1889; SSSE3-NEXT: pshufb %xmm1, %xmm6 1890; SSSE3-NEXT: psrlw $4, %xmm5 1891; SSSE3-NEXT: pand %xmm9, %xmm5 1892; SSSE3-NEXT: movdqa %xmm4, %xmm1 1893; SSSE3-NEXT: pshufb %xmm5, %xmm1 1894; SSSE3-NEXT: por %xmm6, %xmm1 1895; SSSE3-NEXT: pshufb %xmm8, %xmm2 1896; SSSE3-NEXT: movdqa %xmm2, %xmm5 1897; SSSE3-NEXT: pand %xmm9, %xmm5 1898; SSSE3-NEXT: movdqa %xmm7, %xmm6 1899; SSSE3-NEXT: pshufb %xmm5, %xmm6 1900; SSSE3-NEXT: psrlw $4, %xmm2 1901; SSSE3-NEXT: pand %xmm9, %xmm2 1902; SSSE3-NEXT: movdqa %xmm4, %xmm5 1903; SSSE3-NEXT: pshufb %xmm2, %xmm5 1904; SSSE3-NEXT: por %xmm6, %xmm5 1905; SSSE3-NEXT: pshufb %xmm8, %xmm3 1906; SSSE3-NEXT: movdqa %xmm3, %xmm2 1907; SSSE3-NEXT: pand %xmm9, %xmm2 1908; SSSE3-NEXT: pshufb %xmm2, %xmm7 1909; SSSE3-NEXT: psrlw $4, %xmm3 1910; SSSE3-NEXT: pand %xmm9, %xmm3 1911; SSSE3-NEXT: pshufb %xmm3, %xmm4 1912; SSSE3-NEXT: por %xmm7, %xmm4 1913; SSSE3-NEXT: movdqa %xmm5, %xmm2 1914; SSSE3-NEXT: movdqa %xmm4, %xmm3 1915; SSSE3-NEXT: retq 1916; 1917; AVX1-LABEL: test_bitreverse_v32i16: 1918; AVX1: # %bb.0: 1919; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1920; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1921; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1922; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1923; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1924; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1925; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1926; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1927; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1928; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1929; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1930; AVX1-NEXT: vpor %xmm2, %xmm5, 
%xmm2 1931; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1932; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 1933; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1934; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1935; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1936; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 1937; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 1938; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1939; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1940; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1941; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1942; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1943; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1944; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1945; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1946; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1947; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1948; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 1949; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 1950; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1951; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 1952; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 1953; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 1954; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1955; AVX1-NEXT: retq 1956; 1957; AVX2-LABEL: test_bitreverse_v32i16: 1958; AVX2: # %bb.0: 1959; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1960; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1961; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1962; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 1963; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1964; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1965; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1966; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 1967; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1968; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 1969; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 1970; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1971; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 1972; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 1973; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 1974; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 1975; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 1976; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 1977; AVX2-NEXT: retq 1978; 1979; AVX512F-LABEL: test_bitreverse_v32i16: 1980; AVX512F: # %bb.0: 1981; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1982; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1983; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1984; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1985; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 1986; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1987; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1988; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1989; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 1990; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 1991; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 1992; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 1993; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 1994; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1995; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 1996; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 1997; AVX512F-NEXT: vpand %ymm3, 
%ymm0, %ymm0 1998; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 1999; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2000; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 2001; AVX512F-NEXT: retq 2002; 2003; AVX512BW-LABEL: test_bitreverse_v32i16: 2004; AVX512BW: # %bb.0: 2005; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 2006; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2007; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2008; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2009; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2010; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2011; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2012; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2013; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2014; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2015; AVX512BW-NEXT: retq 2016; 2017; XOPAVX1-LABEL: test_bitreverse_v32i16: 2018; XOPAVX1: # %bb.0: 2019; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2020; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 2021; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2022; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2023; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2024; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2025; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2026; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2027; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2028; XOPAVX1-NEXT: retq 2029; 2030; XOPAVX2-LABEL: test_bitreverse_v32i16: 2031; XOPAVX2: # %bb.0: 2032; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2033; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 2034; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2035; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2036; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2037; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2038; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2039; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2040; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2041; XOPAVX2-NEXT: retq 2042; 2043; GFNISSE-LABEL: test_bitreverse_v32i16: 2044; GFNISSE: # %bb.0: 2045; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2046; GFNISSE-NEXT: pshufb %xmm4, %xmm0 2047; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] 2048; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 2049; GFNISSE-NEXT: pshufb %xmm4, %xmm1 2050; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 2051; GFNISSE-NEXT: pshufb %xmm4, %xmm2 2052; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 2053; GFNISSE-NEXT: pshufb %xmm4, %xmm3 2054; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 2055; GFNISSE-NEXT: retq 2056; 2057; GFNIAVX1-LABEL: test_bitreverse_v32i16: 2058; GFNIAVX1: # %bb.0: 2059; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2060; GFNIAVX1-NEXT: 
vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2061; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2062; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 2063; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2064; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2065; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0 2066; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2067; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2068; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2069; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2070; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2071; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1 2072; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2073; GFNIAVX1-NEXT: retq 2074; 2075; GFNIAVX2-LABEL: test_bitreverse_v32i16: 2076; GFNIAVX2: # %bb.0: 2077; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2078; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2079; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2080; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2081; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2082; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2083; GFNIAVX2-NEXT: retq 2084; 2085; GFNIAVX512F-LABEL: test_bitreverse_v32i16: 2086; GFNIAVX512F: # %bb.0: 2087; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2088; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2089; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2090; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2091; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2092; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2093; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2094; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2095; GFNIAVX512F-NEXT: retq 2096; 2097; GFNIAVX512BW-LABEL: test_bitreverse_v32i16: 2098; GFNIAVX512BW: # %bb.0: 2099; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 2100; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 2101; GFNIAVX512BW-NEXT: retq 2102 %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) 2103 ret <32 x i16> %b 2104} 2105 2106define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { 2107; SSE2-LABEL: test_bitreverse_v16i32: 2108; SSE2: # %bb.0: 2109; SSE2-NEXT: pxor %xmm8, %xmm8 2110; SSE2-NEXT: movdqa %xmm0, %xmm5 2111; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] 2112; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] 2113; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] 2114; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] 2115; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 2116; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 2117; SSE2-NEXT: packuswb %xmm5, %xmm0 2118; SSE2-NEXT: movdqa %xmm0, %xmm6 2119; SSE2-NEXT: psrlw $4, %xmm6 2120; 
SSE2-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2121; SSE2-NEXT: pand %xmm5, %xmm6 2122; SSE2-NEXT: pand %xmm5, %xmm0 2123; SSE2-NEXT: psllw $4, %xmm0 2124; SSE2-NEXT: por %xmm6, %xmm0 2125; SSE2-NEXT: movdqa %xmm0, %xmm7 2126; SSE2-NEXT: psrlw $2, %xmm7 2127; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2128; SSE2-NEXT: pand %xmm6, %xmm7 2129; SSE2-NEXT: pand %xmm6, %xmm0 2130; SSE2-NEXT: psllw $2, %xmm0 2131; SSE2-NEXT: por %xmm7, %xmm0 2132; SSE2-NEXT: movdqa %xmm0, %xmm4 2133; SSE2-NEXT: psrlw $1, %xmm4 2134; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2135; SSE2-NEXT: pand %xmm7, %xmm4 2136; SSE2-NEXT: pand %xmm7, %xmm0 2137; SSE2-NEXT: paddb %xmm0, %xmm0 2138; SSE2-NEXT: por %xmm4, %xmm0 2139; SSE2-NEXT: movdqa %xmm1, %xmm4 2140; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] 2141; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2142; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2143; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 2144; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2145; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2146; SSE2-NEXT: packuswb %xmm4, %xmm1 2147; SSE2-NEXT: movdqa %xmm1, %xmm4 2148; SSE2-NEXT: psrlw $4, %xmm4 2149; SSE2-NEXT: pand %xmm5, %xmm4 2150; SSE2-NEXT: pand %xmm5, %xmm1 2151; SSE2-NEXT: psllw $4, %xmm1 2152; SSE2-NEXT: por %xmm4, %xmm1 2153; SSE2-NEXT: movdqa %xmm1, %xmm4 2154; SSE2-NEXT: psrlw $2, %xmm4 2155; SSE2-NEXT: pand %xmm6, %xmm4 2156; SSE2-NEXT: pand %xmm6, %xmm1 2157; SSE2-NEXT: psllw $2, %xmm1 2158; SSE2-NEXT: por %xmm4, %xmm1 2159; SSE2-NEXT: movdqa %xmm1, %xmm4 2160; SSE2-NEXT: psrlw $1, %xmm4 2161; SSE2-NEXT: pand %xmm7, %xmm4 2162; SSE2-NEXT: pand %xmm7, %xmm1 2163; SSE2-NEXT: paddb %xmm1, %xmm1 2164; SSE2-NEXT: por %xmm4, %xmm1 2165; SSE2-NEXT: movdqa %xmm2, %xmm4 2166; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] 2167; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2168; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2169; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] 2170; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 2171; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 2172; SSE2-NEXT: packuswb %xmm4, %xmm2 2173; SSE2-NEXT: movdqa %xmm2, %xmm4 2174; SSE2-NEXT: psrlw $4, %xmm4 2175; SSE2-NEXT: pand %xmm5, %xmm4 2176; SSE2-NEXT: pand %xmm5, %xmm2 2177; SSE2-NEXT: psllw $4, %xmm2 2178; SSE2-NEXT: por %xmm4, %xmm2 2179; SSE2-NEXT: movdqa %xmm2, %xmm4 2180; SSE2-NEXT: psrlw $2, %xmm4 2181; SSE2-NEXT: pand %xmm6, %xmm4 2182; SSE2-NEXT: pand %xmm6, %xmm2 2183; SSE2-NEXT: psllw $2, %xmm2 2184; SSE2-NEXT: por %xmm4, %xmm2 2185; SSE2-NEXT: movdqa %xmm2, %xmm4 2186; SSE2-NEXT: psrlw $1, %xmm4 2187; SSE2-NEXT: pand %xmm7, %xmm4 2188; SSE2-NEXT: pand %xmm7, %xmm2 2189; SSE2-NEXT: paddb %xmm2, %xmm2 2190; SSE2-NEXT: por %xmm4, %xmm2 2191; SSE2-NEXT: movdqa %xmm3, %xmm4 2192; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] 2193; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2194; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2195; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] 2196; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 2197; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 2198; SSE2-NEXT: packuswb %xmm4, %xmm3 2199; SSE2-NEXT: movdqa %xmm3, %xmm4 2200; SSE2-NEXT: psrlw $4, %xmm4 2201; SSE2-NEXT: pand %xmm5, %xmm4 2202; SSE2-NEXT: pand %xmm5, %xmm3 2203; SSE2-NEXT: psllw $4, %xmm3 2204; SSE2-NEXT: por %xmm4, %xmm3 2205; SSE2-NEXT: movdqa %xmm3, %xmm4 2206; SSE2-NEXT: psrlw $2, %xmm4 2207; SSE2-NEXT: pand %xmm6, %xmm4 2208; SSE2-NEXT: pand %xmm6, %xmm3 2209; SSE2-NEXT: psllw $2, %xmm3 2210; SSE2-NEXT: por %xmm4, %xmm3 2211; SSE2-NEXT: movdqa %xmm3, %xmm4 2212; SSE2-NEXT: psrlw $1, %xmm4 2213; SSE2-NEXT: pand %xmm7, %xmm4 2214; SSE2-NEXT: pand %xmm7, %xmm3 2215; SSE2-NEXT: paddb %xmm3, %xmm3 2216; SSE2-NEXT: por %xmm4, %xmm3 2217; SSE2-NEXT: retq 2218; 2219; SSSE3-LABEL: test_bitreverse_v16i32: 2220; SSSE3: # %bb.0: 2221; SSSE3-NEXT: movdqa %xmm1, %xmm5 2222; SSSE3-NEXT: movdqa %xmm0, %xmm1 2223; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2224; SSSE3-NEXT: pshufb %xmm8, %xmm1 2225; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2226; SSSE3-NEXT: movdqa %xmm1, %xmm0 2227; SSSE3-NEXT: pand %xmm9, %xmm0 2228; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2229; SSSE3-NEXT: movdqa %xmm7, %xmm6 2230; SSSE3-NEXT: pshufb %xmm0, %xmm6 2231; SSSE3-NEXT: psrlw $4, %xmm1 2232; SSSE3-NEXT: pand %xmm9, %xmm1 2233; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2234; SSSE3-NEXT: movdqa %xmm4, %xmm0 2235; SSSE3-NEXT: pshufb %xmm1, %xmm0 2236; SSSE3-NEXT: por %xmm6, %xmm0 2237; SSSE3-NEXT: pshufb %xmm8, %xmm5 2238; SSSE3-NEXT: movdqa %xmm5, %xmm1 2239; SSSE3-NEXT: pand %xmm9, %xmm1 2240; SSSE3-NEXT: movdqa %xmm7, %xmm6 2241; SSSE3-NEXT: pshufb %xmm1, %xmm6 2242; SSSE3-NEXT: psrlw $4, %xmm5 2243; SSSE3-NEXT: pand %xmm9, %xmm5 2244; SSSE3-NEXT: movdqa %xmm4, %xmm1 2245; SSSE3-NEXT: pshufb %xmm5, %xmm1 2246; SSSE3-NEXT: por %xmm6, %xmm1 2247; SSSE3-NEXT: pshufb %xmm8, %xmm2 2248; SSSE3-NEXT: movdqa %xmm2, %xmm5 2249; SSSE3-NEXT: pand %xmm9, %xmm5 2250; SSSE3-NEXT: movdqa %xmm7, %xmm6 2251; SSSE3-NEXT: pshufb %xmm5, %xmm6 2252; SSSE3-NEXT: psrlw $4, %xmm2 2253; SSSE3-NEXT: pand %xmm9, %xmm2 2254; SSSE3-NEXT: movdqa %xmm4, %xmm5 2255; SSSE3-NEXT: pshufb %xmm2, %xmm5 2256; SSSE3-NEXT: por %xmm6, %xmm5 2257; SSSE3-NEXT: pshufb %xmm8, %xmm3 2258; SSSE3-NEXT: movdqa %xmm3, %xmm2 2259; SSSE3-NEXT: pand %xmm9, %xmm2 2260; SSSE3-NEXT: pshufb %xmm2, %xmm7 2261; SSSE3-NEXT: psrlw $4, %xmm3 2262; SSSE3-NEXT: pand %xmm9, %xmm3 2263; SSSE3-NEXT: pshufb %xmm3, %xmm4 2264; SSSE3-NEXT: por %xmm7, %xmm4 2265; SSSE3-NEXT: movdqa %xmm5, %xmm2 2266; SSSE3-NEXT: movdqa %xmm4, %xmm3 2267; SSSE3-NEXT: retq 2268; 2269; AVX1-LABEL: test_bitreverse_v16i32: 2270; AVX1: # %bb.0: 2271; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2272; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2273; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2274; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2275; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2276; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2277; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2278; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2279; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2280; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2281; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2282; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2283; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2284; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2285; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2286; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2287; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2288; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2289; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2290; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2291; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2292; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2293; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2294; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2295; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2296; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2297; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2298; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2299; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2300; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2301; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2302; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2303; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2304; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2305; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2306; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2307; AVX1-NEXT: retq 2308; 2309; AVX2-LABEL: test_bitreverse_v16i32: 2310; AVX2: # %bb.0: 2311; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2312; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2313; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2314; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2315; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2316; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2317; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2318; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2319; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2320; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2321; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2322; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2323; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2324; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2325; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2326; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2327; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2328; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2329; AVX2-NEXT: retq 2330; 2331; AVX512F-LABEL: test_bitreverse_v16i32: 2332; AVX512F: # %bb.0: 2333; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2334; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2335; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2336; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2337; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 2338; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2339; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2340; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2341; 
AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 2342; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2343; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 2344; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2345; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 2346; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2347; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 2348; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2349; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 2350; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 2351; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2352; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 2353; AVX512F-NEXT: retq 2354; 2355; AVX512BW-LABEL: test_bitreverse_v16i32: 2356; AVX512BW: # %bb.0: 2357; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 2358; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2359; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2360; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2361; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2362; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2363; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2364; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2365; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2366; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2367; AVX512BW-NEXT: retq 2368; 2369; XOPAVX1-LABEL: test_bitreverse_v16i32: 2370; XOPAVX1: # %bb.0: 2371; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2372; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2373; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2374; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2375; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2376; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2377; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2378; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2379; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2380; XOPAVX1-NEXT: retq 2381; 2382; XOPAVX2-LABEL: test_bitreverse_v16i32: 2383; XOPAVX2: # %bb.0: 2384; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2385; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2386; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2387; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2388; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2389; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2390; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2391; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2392; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2393; XOPAVX2-NEXT: retq 2394; 2395; GFNISSE-LABEL: test_bitreverse_v16i32: 2396; GFNISSE: # %bb.0: 2397; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2398; GFNISSE-NEXT: pshufb %xmm4, %xmm0 2399; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] 2400; GFNISSE-NEXT: 
gf2p8affineqb $0, %xmm5, %xmm0 2401; GFNISSE-NEXT: pshufb %xmm4, %xmm1 2402; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 2403; GFNISSE-NEXT: pshufb %xmm4, %xmm2 2404; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 2405; GFNISSE-NEXT: pshufb %xmm4, %xmm3 2406; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 2407; GFNISSE-NEXT: retq 2408; 2409; GFNIAVX1-LABEL: test_bitreverse_v16i32: 2410; GFNIAVX1: # %bb.0: 2411; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2412; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2413; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2414; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] 2415; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2416; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2417; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0 2418; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2419; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2420; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2421; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2 2422; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2423; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1 2424; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2425; GFNIAVX1-NEXT: retq 2426; 2427; GFNIAVX2-LABEL: test_bitreverse_v16i32: 2428; GFNIAVX2: # %bb.0: 2429; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2430; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2431; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2432; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2433; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2434; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2435; GFNIAVX2-NEXT: retq 2436; 2437; GFNIAVX512F-LABEL: test_bitreverse_v16i32: 2438; GFNIAVX512F: # %bb.0: 2439; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2440; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2441; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2442; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] 2443; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2444; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2445; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2446; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2447; GFNIAVX512F-NEXT: retq 2448; 2449; GFNIAVX512BW-LABEL: test_bitreverse_v16i32: 2450; GFNIAVX512BW: # %bb.0: 2451; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 2452; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 2453; GFNIAVX512BW-NEXT: retq 2454 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) 2455 ret <16 x i32> %b 2456} 2457 2458define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { 2459; SSE2-LABEL: test_bitreverse_v8i64: 2460; SSE2: # %bb.0: 2461; SSE2-NEXT: pxor %xmm8, %xmm8 2462; SSE2-NEXT: movdqa %xmm0, %xmm5 2463; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] 2464; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] 2465; SSE2-NEXT: 
pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: psrlw $4, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: psrlw $2, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrlw $1, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrlw $4, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: psllw $4, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrlw $2, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrlw $1, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrlw $4, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: psllw $4, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrlw $2, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: psllw $2, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrlw $1, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm2
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrlw $4, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: psllw $4, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrlw $2, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: psllw $2, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrlw $1, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm3
; SSE2-NEXT: paddb %xmm3, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT: pshufb %xmm8, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm9, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm9, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: pshufb %xmm8, %xmm5
; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: pand %xmm9, %xmm1
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm1, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm5
; SSSE3-NEXT: pand %xmm9, %xmm5
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm5, %xmm1
; SSSE3-NEXT: por %xmm6, %xmm1
; SSSE3-NEXT: pshufb %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: pand %xmm9, %xmm5
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm5, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm2
; SSSE3-NEXT: pand %xmm9, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm5
; SSSE3-NEXT: pshufb %xmm2, %xmm5
; SSSE3-NEXT: por %xmm6, %xmm5
; SSSE3-NEXT: pshufb %xmm8, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pand %xmm9, %xmm2
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm3
; SSSE3-NEXT: pand %xmm9, %xmm3
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: por %xmm7, %xmm4
; SSSE3-NEXT: movdqa %xmm5, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm3
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_bitreverse_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_bitreverse_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT: retq
;
; GFNISSE-LABEL: test_bitreverse_v8i64:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNISSE-NEXT: pshufb %xmm4, %xmm0
; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
; GFNISSE-NEXT: pshufb %xmm4, %xmm1
; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
; GFNISSE-NEXT: pshufb %xmm4, %xmm2
; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
; GFNISSE-NEXT: pshufb %xmm4, %xmm3
; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: test_bitreverse_v8i64:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; GFNIAVX1-NEXT: retq
;
; GFNIAVX2-LABEL: test_bitreverse_v8i64:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v8i64:
; GFNIAVX512F: # %bb.0:
; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; GFNIAVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX512F-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; GFNIAVX512F-NEXT: retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v8i64:
; GFNIAVX512BW: # %bb.0:
; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: retq
  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
  ret <8 x i64> %b
}

;
; Constant Folding
;

define i32 @fold_bitreverse_i32() nounwind {
; ALL-LABEL: fold_bitreverse_i32:
; ALL: # %bb.0:
; ALL-NEXT: movl $16711935, %eax # imm = 0xFF00FF
; ALL-NEXT: retq
  %b = call i32 @llvm.bitreverse.i32(i32 4278255360)
  ret i32 %b
}

define <16 x i8> @fold_bitreverse_v16i8() nounwind {
; SSE-LABEL: fold_bitreverse_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; SSE-NEXT: retq
;
; AVX-LABEL: fold_bitreverse_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; AVX-NEXT: retq
;
; XOP-LABEL: fold_bitreverse_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; XOP-NEXT: retq
;
; GFNISSE-LABEL: fold_bitreverse_v16i8:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: fold_bitreverse_v16i8:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; GFNIAVX-NEXT: retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
  ret <16 x i8> %b
}

define <16 x i16> @fold_bitreverse_v16i16() nounwind {
; SSE-LABEL: fold_bitreverse_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
; SSE-NEXT: retq
;
; AVX-LABEL: fold_bitreverse_v16i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; AVX-NEXT: retq
;
; XOP-LABEL: fold_bitreverse_v16i16:
; XOP: # %bb.0:
; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; XOP-NEXT: retq
;
; GFNISSE-LABEL: fold_bitreverse_v16i16:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
; GFNISSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: fold_bitreverse_v16i16:
; GFNIAVX: # %bb.0:
; GFNIAVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; GFNIAVX-NEXT: retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
  ret <16 x i16> %b
}

define <16 x i32> @fold_bitreverse_v16i32() nounwind {
; SSE-LABEL: fold_bitreverse_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
; SSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
; SSE-NEXT: movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
; SSE-NEXT: retq
;
; AVX1-LABEL: fold_bitreverse_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX1-NEXT: retq
;
; AVX2-LABEL: fold_bitreverse_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX2-NEXT: retq
;
; AVX512-LABEL: fold_bitreverse_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX512-NEXT: retq
;
; XOP-LABEL: fold_bitreverse_v16i32:
; XOP: # %bb.0:
; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; XOP-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; XOP-NEXT: retq
;
; GFNISSE-LABEL: fold_bitreverse_v16i32:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
; GFNISSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
; GFNISSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
; GFNISSE-NEXT: movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: fold_bitreverse_v16i32:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; GFNIAVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; GFNIAVX1-NEXT: retq
;
; GFNIAVX2-LABEL: fold_bitreverse_v16i32:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512-LABEL: fold_bitreverse_v16i32:
; GFNIAVX512: # %bb.0:
; GFNIAVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; GFNIAVX512-NEXT: retq
  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
  ret <16 x i32> %b
}

declare i8 @llvm.bitreverse.i8(i8) readnone
declare i16 @llvm.bitreverse.i16(i16) readnone
declare i32 @llvm.bitreverse.i32(i32) readnone
declare i64 @llvm.bitreverse.i64(i64) readnone

declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone

declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone

declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone