; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=F16C
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+f16c,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=F16C
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+f16c,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=F16C
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512

;
; Half to Float
;

define float @cvt_i16_to_f32(i16 %a0) nounwind {
; AVX-LABEL: cvt_i16_to_f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpinsrw $0, %edi, %xmm0, %xmm0
; AVX-NEXT:    jmp __extendhfsf2@PLT # TAILCALL
;
; F16C-LABEL: cvt_i16_to_f32:
; F16C:       # %bb.0:
; F16C-NEXT:    movzwl %di, %eax
; F16C-NEXT:    vmovd %eax, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_i16_to_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movzwl %di, %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast i16 %a0 to half
  %2 = fpext half %1 to float
  ret float %2
}

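; With plain AVX there is no half<->float conversion instruction, so the
; scalar case above lowers to a tail call to the compiler-rt helper
; __extendhfsf2, while +f16c and +avx512f zero-extend the i16 bit pattern
; into %xmm0 and convert it in-register with a single vcvtph2ps.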
define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX-LABEL: cvt_4i16_to_4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrq $48, %rdx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_4i16_to_4f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_4i16_to_4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = fpext <4 x half> %1 to <4 x float>
  ret <4 x float> %2
}

define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX-LABEL: cvt_8i16_to_4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrq $48, %rdx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8i16_to_4f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}

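; The two tests above reach the same F16C/AVX512 lowering: vcvtph2ps with an
; xmm destination only reads the low four half values, so the shufflevector
; that extracts the low half of the <8 x i16> input folds away. The
; plain-AVX fallback splits the 64-bit scalar with shrl/shrq, extends each
; half with a __extendhfsf2 libcall, and reassembles the result with
; vinsertps.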
define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
; AVX-LABEL: cvt_8i16_to_8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $56, %rsp
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    addq $56, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8i16_to_8f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = fpext <8 x half> %1 to <8 x float>
  ret <8 x float> %2
}

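; Eight elements still take a single instruction under F16C/AVX512
; (vcvtph2ps xmm -> ymm). The libcall fallback isolates each lane with
; vpsrld/vpsrlq/vpsrldq/vpermilps, converts it, and joins the two
; four-float halves with vinsertf128.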
define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_16i16_to_16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $104, %rsp
; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    addq $104, %rsp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_16i16_to_16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $104, %rsp
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    addq $104, %rsp
; AVX2-NEXT:    retq
;
; F16C-LABEL: cvt_16i16_to_16f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm2
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm1
; F16C-NEXT:    vmovaps %ymm2, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_16i16_to_16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %ymm0, %zmm0
; AVX512-NEXT:    retq
  %1 = bitcast <16 x i16> %a0 to <16 x half>
  %2 = fpext <16 x half> %1 to <16 x float>
  ret <16 x float> %2
}

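; For sixteen elements F16C issues two xmm -> ymm vcvtph2ps after splitting
; the source, whereas AVX512F converts the whole vector with one
; ymm -> zmm vcvtph2ps. Note the vzeroupper the AVX1/AVX2 paths insert
; before each libcall while the upper ymm half may still be live.
;
; The *_constrained tests below exercise the same widenings through the
; strict-FP llvm.experimental.constrained.fpext intrinsics.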
define <2 x float> @cvt_2i16_to_2f32_constrained(<2 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_2i16_to_2f32_constrained:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $40, %rsp
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    addq $40, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_2i16_to_2f32_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_2i16_to_2f32_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast <2 x i16> %a0 to <2 x half>
  %2 = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <2 x float> %2
}
declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) strictfp

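; In the strict <2 x half> case, F16C/AVX512 first widen the two i16
; elements with vpmovzxdq so the extra lanes fed to vcvtph2ps are zero;
; presumably this keeps stale lane data (e.g. a signaling-NaN bit pattern)
; from raising spurious exceptions under fpexcept.strict.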
define <4 x float> @cvt_4i16_to_4f32_constrained(<4 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_4i16_to_4f32_constrained:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrq $48, %rdx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_4i16_to_4f32_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_4i16_to_4f32_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <4 x float> %2
}
declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) strictfp

define <8 x float> @cvt_8i16_to_8f32_constrained(<8 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_8i16_to_8f32_constrained:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $56, %rsp
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    addq $56, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8i16_to_8f32_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_8f32_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <8 x float> %2
}
declare <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half>, metadata) strictfp

define <16 x float> @cvt_16i16_to_16f32_constrained(<16 x i16> %a0) nounwind strictfp {
; AVX1-LABEL: cvt_16i16_to_16f32_constrained:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $104, %rsp
; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    addq $104, %rsp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_16i16_to_16f32_constrained:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $104, %rsp
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    addq $104, %rsp
; AVX2-NEXT:    retq
;
; F16C-LABEL: cvt_16i16_to_16f32_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm1
; F16C-NEXT:    vcvtph2ps %xmm1, %ymm1
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_16i16_to_16f32_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %ymm0, %zmm0
; AVX512-NEXT:    retq
  %1 = bitcast <16 x i16> %a0 to <16 x half>
  %2 = call <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <16 x float> %2
}
declare <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half>, metadata) strictfp

;
; Half to Float (Load)
;

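; The tests below repeat the conversions with the source loaded from
; memory, following the pattern (shown for the 4-wide case):
;   %1 = load <4 x i16>, ptr %a0
;   %2 = bitcast <4 x i16> %1 to <4 x half>
;   %3 = fpext <4 x half> %2 to <4 x float>
; so that F16C/AVX512 can fold the load into the conversion and emit
; vcvtph2ps (%rdi), %xmm0 directly.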
define float @load_cvt_i16_to_f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_i16_to_f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT:    jmp __extendhfsf2@PLT # TAILCALL
;
; F16C-LABEL: load_cvt_i16_to_f32:
; F16C:       # %bb.0:
; F16C-NEXT:    movzwl (%rdi), %eax
; F16C-NEXT:    vmovd %eax, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_i16_to_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movzwl (%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = load i16, ptr %a0
  %2 = bitcast i16 %1 to half
  %3 = fpext half %2 to float
  ret float %3
}

define <4 x float> @load_cvt_4i16_to_4f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_4i16_to_4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vpinsrw $0, 6(%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 2(%rdi), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: load_cvt_4i16_to_4f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps (%rdi), %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_4i16_to_4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps (%rdi), %xmm0
; AVX512-NEXT:    retq
  %1 = load <4 x i16>, ptr %a0
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}

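; Without F16C, the 4-wide load case above picks each half straight from
; memory with vpinsrw at offsets 0, 2, 4 and 6 of %rdi rather than loading
; the whole vector and shifting lanes out as the register variants do.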
define <4 x float> @load_cvt_8i16_to_4f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_8i16_to_4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    movq (%rdi), %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrq $48, %rdx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: load_cvt_8i16_to_4f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps (%rdi), %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_8i16_to_4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps (%rdi), %xmm0
; AVX512-NEXT:    retq
  %1 = load <8 x i16>, ptr %a0
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x i16> %2 to <4 x half>
  %4 = fpext <4 x half> %3 to <4 x float>
  ret <4 x float> %4
}

define <8 x float> @load_cvt_8i16_to_8f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_8i16_to_8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $48, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 8(%rdi), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    addq $48, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: load_cvt_8i16_to_8f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps (%rdi), %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_8i16_to_8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps (%rdi), %ymm0
; AVX512-NEXT:    retq
  %1 = load <8 x i16>, ptr %a0
  %2 = bitcast <8 x i16> %1 to <8 x half>
  %3 = fpext <8 x half> %2 to <8 x float>
  ret <8 x float> %3
}

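; The 8-wide load folds into a single vcvtph2ps (%rdi), %ymm0 under
; F16C/AVX512. The libcall fallback keeps the pointer live in %rbx across
; the calls so it can re-load individual elements (8(%rdi), 12(%rbx), ...).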
define <16 x float> @load_cvt_16i16_to_16f32(ptr %a0) nounwind {
; AVX1-LABEL: load_cvt_16i16_to_16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $80, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vpinsrw $0, 8(%rdi), %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa (%rbx), %xmm1
; AVX1-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps 16(%rbx), %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpinsrw $0, 24(%rbx), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpinsrw $0, 28(%rbx), %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpinsrw $0, 20(%rbx), %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    addq $80, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_cvt_16i16_to_16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $80, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vpinsrw $0, 8(%rdi), %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa (%rbx), %xmm1
; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovaps 16(%rbx), %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpinsrw $0, 24(%rbx), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpinsrw $0, 28(%rbx), %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
xmm1[0,1,2],xmm0[0] 1166; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1167; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1168; AVX2-NEXT: callq __extendhfsf2@PLT 1169; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1170; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1171; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 1172; AVX2-NEXT: callq __extendhfsf2@PLT 1173; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1174; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] 1175; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1176; AVX2-NEXT: vpinsrw $0, 20(%rbx), %xmm0, %xmm0 1177; AVX2-NEXT: callq __extendhfsf2@PLT 1178; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1179; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] 1180; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1181; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1182; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0 1183; AVX2-NEXT: callq __extendhfsf2@PLT 1184; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1185; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1186; AVX2-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload 1187; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1188; AVX2-NEXT: addq $80, %rsp 1189; AVX2-NEXT: popq %rbx 1190; AVX2-NEXT: retq 1191; 1192; F16C-LABEL: load_cvt_16i16_to_16f32: 1193; F16C: # %bb.0: 1194; F16C-NEXT: vcvtph2ps (%rdi), %ymm0 1195; F16C-NEXT: vcvtph2ps 16(%rdi), %ymm1 1196; F16C-NEXT: retq 1197; 1198; AVX512-LABEL: load_cvt_16i16_to_16f32: 1199; AVX512: # %bb.0: 1200; AVX512-NEXT: vcvtph2ps (%rdi), %zmm0 1201; AVX512-NEXT: retq 1202 %1 = load <16 x i16>, ptr %a0 1203 %2 = bitcast <16 x i16> %1 to <16 x half> 1204 %3 = fpext <16 x half> %2 to <16 x float> 1205 ret <16 x float> %3 1206} 1207 1208define <4 x float> @load_cvt_4i16_to_4f32_constrained(ptr %a0) nounwind strictfp { 1209; AVX-LABEL: load_cvt_4i16_to_4f32_constrained: 1210; AVX: # %bb.0: 1211; AVX-NEXT: subq $72, %rsp 1212; AVX-NEXT: vpinsrw $0, 6(%rdi), %xmm0, %xmm0 1213; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1214; AVX-NEXT: vpinsrw $0, 4(%rdi), %xmm0, %xmm0 1215; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1216; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 1217; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1218; AVX-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0 1219; AVX-NEXT: callq __extendhfsf2@PLT 1220; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1221; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1222; AVX-NEXT: callq __extendhfsf2@PLT 1223; AVX-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1224; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] 1225; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1226; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1227; AVX-NEXT: callq __extendhfsf2@PLT 1228; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 1229; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] 1230; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1231; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1232; AVX-NEXT: callq __extendhfsf2@PLT 1233; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1234; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1235; AVX-NEXT: addq $72, %rsp 1236; AVX-NEXT: retq 
1237; 1238; F16C-LABEL: load_cvt_4i16_to_4f32_constrained: 1239; F16C: # %bb.0: 1240; F16C-NEXT: vcvtph2ps (%rdi), %xmm0 1241; F16C-NEXT: retq 1242; 1243; AVX512-LABEL: load_cvt_4i16_to_4f32_constrained: 1244; AVX512: # %bb.0: 1245; AVX512-NEXT: vcvtph2ps (%rdi), %xmm0 1246; AVX512-NEXT: retq 1247 %1 = load <4 x i16>, ptr %a0 1248 %2 = bitcast <4 x i16> %1 to <4 x half> 1249 %3 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %2, metadata !"fpexcept.strict") strictfp 1250 ret <4 x float> %3 1251} 1252 1253define <4 x float> @load_cvt_8i16_to_4f32_constrained(ptr %a0) nounwind strictfp { 1254; AVX-LABEL: load_cvt_8i16_to_4f32_constrained: 1255; AVX: # %bb.0: 1256; AVX-NEXT: subq $72, %rsp 1257; AVX-NEXT: movq (%rdi), %rax 1258; AVX-NEXT: movq %rax, %rcx 1259; AVX-NEXT: movq %rax, %rdx 1260; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1261; AVX-NEXT: # kill: def $eax killed $eax killed $rax 1262; AVX-NEXT: shrl $16, %eax 1263; AVX-NEXT: shrq $32, %rcx 1264; AVX-NEXT: shrq $48, %rdx 1265; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1 1266; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1267; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 1268; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1269; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 1270; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1271; AVX-NEXT: callq __extendhfsf2@PLT 1272; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1273; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1274; AVX-NEXT: callq __extendhfsf2@PLT 1275; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 1276; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] 1277; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1278; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1279; AVX-NEXT: callq __extendhfsf2@PLT 1280; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 1281; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] 1282; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1283; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1284; AVX-NEXT: callq __extendhfsf2@PLT 1285; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1286; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1287; AVX-NEXT: addq $72, %rsp 1288; AVX-NEXT: retq 1289; 1290; F16C-LABEL: load_cvt_8i16_to_4f32_constrained: 1291; F16C: # %bb.0: 1292; F16C-NEXT: vcvtph2ps (%rdi), %xmm0 1293; F16C-NEXT: retq 1294; 1295; AVX512-LABEL: load_cvt_8i16_to_4f32_constrained: 1296; AVX512: # %bb.0: 1297; AVX512-NEXT: vcvtph2ps (%rdi), %xmm0 1298; AVX512-NEXT: retq 1299 %1 = load <8 x i16>, ptr %a0 1300 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1301 %3 = bitcast <4 x i16> %2 to <4 x half> 1302 %4 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %3, metadata !"fpexcept.strict") strictfp 1303 ret <4 x float> %4 1304} 1305 1306; 1307; Half to Double 1308; 1309 1310define double @cvt_i16_to_f64(i16 %a0) nounwind { 1311; AVX-LABEL: cvt_i16_to_f64: 1312; AVX: # %bb.0: 1313; AVX-NEXT: pushq %rax 1314; AVX-NEXT: vpinsrw $0, %edi, %xmm0, %xmm0 1315; AVX-NEXT: callq __extendhfsf2@PLT 1316; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1317; AVX-NEXT: popq %rax 1318; AVX-NEXT: retq 1319; 1320; F16C-LABEL: cvt_i16_to_f64: 1321; F16C: # %bb.0: 1322; F16C-NEXT: movzwl %di, %eax 1323; F16C-NEXT: vmovd %eax, %xmm0 1324; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1325; F16C-NEXT: vcvtss2sd 
%xmm0, %xmm0, %xmm0 1326; F16C-NEXT: retq 1327; 1328; AVX512-LABEL: cvt_i16_to_f64: 1329; AVX512: # %bb.0: 1330; AVX512-NEXT: movzwl %di, %eax 1331; AVX512-NEXT: vmovd %eax, %xmm0 1332; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1333; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1334; AVX512-NEXT: retq 1335 %1 = bitcast i16 %a0 to half 1336 %2 = fpext half %1 to double 1337 ret double %2 1338} 1339 1340define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind { 1341; AVX-LABEL: cvt_2i16_to_2f64: 1342; AVX: # %bb.0: 1343; AVX-NEXT: subq $40, %rsp 1344; AVX-NEXT: vmovd %xmm0, %eax 1345; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1346; AVX-NEXT: shrl $16, %eax 1347; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 1348; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1349; AVX-NEXT: callq __extendhfsf2@PLT 1350; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1351; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1352; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1353; AVX-NEXT: callq __extendhfsf2@PLT 1354; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1355; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 1356; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1357; AVX-NEXT: addq $40, %rsp 1358; AVX-NEXT: retq 1359; 1360; F16C-LABEL: cvt_2i16_to_2f64: 1361; F16C: # %bb.0: 1362; F16C-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1363; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1364; F16C-NEXT: vcvtps2pd %xmm0, %xmm0 1365; F16C-NEXT: retq 1366; 1367; AVX512-LABEL: cvt_2i16_to_2f64: 1368; AVX512: # %bb.0: 1369; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1370; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1371; AVX512-NEXT: vcvtps2pd %xmm0, %xmm0 1372; AVX512-NEXT: retq 1373 %1 = bitcast <2 x i16> %a0 to <2 x half> 1374 %2 = fpext <2 x half> %1 to <2 x double> 1375 ret <2 x double> %2 1376} 1377 1378define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind { 1379; AVX-LABEL: cvt_4i16_to_4f64: 1380; AVX: # %bb.0: 1381; AVX-NEXT: subq $72, %rsp 1382; AVX-NEXT: vmovq %xmm0, %rax 1383; AVX-NEXT: movq %rax, %rcx 1384; AVX-NEXT: movl %eax, %edx 1385; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1386; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1387; AVX-NEXT: shrq $48, %rax 1388; AVX-NEXT: shrq $32, %rcx 1389; AVX-NEXT: shrl $16, %edx 1390; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0 1391; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1392; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 1393; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1394; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1395; AVX-NEXT: callq __extendhfsf2@PLT 1396; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1397; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1398; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1399; AVX-NEXT: callq __extendhfsf2@PLT 1400; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1401; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1402; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1403; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1404; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1405; AVX-NEXT: callq __extendhfsf2@PLT 1406; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1407; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1408; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1409; AVX-NEXT: callq __extendhfsf2@PLT 1410; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1411; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1412; AVX-NEXT: vmovlhps 
{{.*#+}} xmm0 = xmm1[0],xmm0[0] 1413; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload 1414; AVX-NEXT: addq $72, %rsp 1415; AVX-NEXT: retq 1416; 1417; F16C-LABEL: cvt_4i16_to_4f64: 1418; F16C: # %bb.0: 1419; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1420; F16C-NEXT: vcvtps2pd %xmm0, %ymm0 1421; F16C-NEXT: retq 1422; 1423; AVX512-LABEL: cvt_4i16_to_4f64: 1424; AVX512: # %bb.0: 1425; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1426; AVX512-NEXT: vcvtps2pd %xmm0, %ymm0 1427; AVX512-NEXT: retq 1428 %1 = bitcast <4 x i16> %a0 to <4 x half> 1429 %2 = fpext <4 x half> %1 to <4 x double> 1430 ret <4 x double> %2 1431} 1432 1433define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind { 1434; AVX-LABEL: cvt_8i16_to_2f64: 1435; AVX: # %bb.0: 1436; AVX-NEXT: subq $40, %rsp 1437; AVX-NEXT: vmovd %xmm0, %eax 1438; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1439; AVX-NEXT: shrl $16, %eax 1440; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 1441; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1442; AVX-NEXT: callq __extendhfsf2@PLT 1443; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1444; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1445; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1446; AVX-NEXT: callq __extendhfsf2@PLT 1447; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1448; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 1449; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1450; AVX-NEXT: addq $40, %rsp 1451; AVX-NEXT: retq 1452; 1453; F16C-LABEL: cvt_8i16_to_2f64: 1454; F16C: # %bb.0: 1455; F16C-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1456; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1457; F16C-NEXT: vcvtps2pd %xmm0, %xmm0 1458; F16C-NEXT: retq 1459; 1460; AVX512-LABEL: cvt_8i16_to_2f64: 1461; AVX512: # %bb.0: 1462; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1463; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1464; AVX512-NEXT: vcvtps2pd %xmm0, %xmm0 1465; AVX512-NEXT: retq 1466 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1> 1467 %2 = bitcast <2 x i16> %1 to <2 x half> 1468 %3 = fpext <2 x half> %2 to <2 x double> 1469 ret <2 x double> %3 1470} 1471 1472define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind { 1473; AVX-LABEL: cvt_8i16_to_4f64: 1474; AVX: # %bb.0: 1475; AVX-NEXT: subq $72, %rsp 1476; AVX-NEXT: vmovq %xmm0, %rax 1477; AVX-NEXT: movq %rax, %rcx 1478; AVX-NEXT: movl %eax, %edx 1479; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1480; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1481; AVX-NEXT: shrq $48, %rax 1482; AVX-NEXT: shrq $32, %rcx 1483; AVX-NEXT: shrl $16, %edx 1484; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0 1485; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1486; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 1487; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1488; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1489; AVX-NEXT: callq __extendhfsf2@PLT 1490; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1491; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1492; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1493; AVX-NEXT: callq __extendhfsf2@PLT 1494; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1495; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1496; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1497; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1498; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1499; AVX-NEXT: callq __extendhfsf2@PLT 1500; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1501; AVX-NEXT: 
vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1502; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1503; AVX-NEXT: callq __extendhfsf2@PLT 1504; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1505; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1506; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1507; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload 1508; AVX-NEXT: addq $72, %rsp 1509; AVX-NEXT: retq 1510; 1511; F16C-LABEL: cvt_8i16_to_4f64: 1512; F16C: # %bb.0: 1513; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1514; F16C-NEXT: vcvtps2pd %xmm0, %ymm0 1515; F16C-NEXT: retq 1516; 1517; AVX512-LABEL: cvt_8i16_to_4f64: 1518; AVX512: # %bb.0: 1519; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1520; AVX512-NEXT: vcvtps2pd %xmm0, %ymm0 1521; AVX512-NEXT: retq 1522 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1523 %2 = bitcast <4 x i16> %1 to <4 x half> 1524 %3 = fpext <4 x half> %2 to <4 x double> 1525 ret <4 x double> %3 1526} 1527 1528define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { 1529; AVX-LABEL: cvt_8i16_to_8f64: 1530; AVX: # %bb.0: 1531; AVX-NEXT: subq $88, %rsp 1532; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1533; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 1534; AVX-NEXT: callq __extendhfsf2@PLT 1535; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1536; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1537; AVX-NEXT: vpermilps $245, (%rsp), %xmm0 # 16-byte Folded Reload 1538; AVX-NEXT: # xmm0 = mem[1,1,3,3] 1539; AVX-NEXT: callq __extendhfsf2@PLT 1540; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1541; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1542; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1543; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1544; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1545; AVX-NEXT: callq __extendhfsf2@PLT 1546; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1547; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1548; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 1549; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 1550; AVX-NEXT: callq __extendhfsf2@PLT 1551; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1552; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1553; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1554; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 1555; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1556; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 1557; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1558; AVX-NEXT: vzeroupper 1559; AVX-NEXT: callq __extendhfsf2@PLT 1560; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1561; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1562; AVX-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload 1563; AVX-NEXT: # xmm0 = mem[3,3,3,3] 1564; AVX-NEXT: callq __extendhfsf2@PLT 1565; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1566; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1567; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1568; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1569; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 1570; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1571; AVX-NEXT: callq __extendhfsf2@PLT 1572; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1573; 
AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1574; AVX-NEXT: vpermilps $78, (%rsp), %xmm0 # 16-byte Folded Reload 1575; AVX-NEXT: # xmm0 = mem[2,3,0,1] 1576; AVX-NEXT: callq __extendhfsf2@PLT 1577; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1578; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1579; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1580; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 1581; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1582; AVX-NEXT: addq $88, %rsp 1583; AVX-NEXT: retq 1584; 1585; F16C-LABEL: cvt_8i16_to_8f64: 1586; F16C: # %bb.0: 1587; F16C-NEXT: vcvtph2ps %xmm0, %ymm1 1588; F16C-NEXT: vcvtps2pd %xmm1, %ymm0 1589; F16C-NEXT: vextractf128 $1, %ymm1, %xmm1 1590; F16C-NEXT: vcvtps2pd %xmm1, %ymm1 1591; F16C-NEXT: retq 1592; 1593; AVX512-LABEL: cvt_8i16_to_8f64: 1594; AVX512: # %bb.0: 1595; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0 1596; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0 1597; AVX512-NEXT: retq 1598 %1 = bitcast <8 x i16> %a0 to <8 x half> 1599 %2 = fpext <8 x half> %1 to <8 x double> 1600 ret <8 x double> %2 1601} 1602 1603define <2 x double> @cvt_2i16_to_2f64_constrained(<2 x i16> %a0) nounwind strictfp { 1604; AVX-LABEL: cvt_2i16_to_2f64_constrained: 1605; AVX: # %bb.0: 1606; AVX-NEXT: subq $40, %rsp 1607; AVX-NEXT: vmovd %xmm0, %eax 1608; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1609; AVX-NEXT: shrl $16, %eax 1610; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 1611; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1612; AVX-NEXT: callq __extendhfsf2@PLT 1613; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1614; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1615; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1616; AVX-NEXT: callq __extendhfsf2@PLT 1617; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1618; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 1619; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1620; AVX-NEXT: addq $40, %rsp 1621; AVX-NEXT: retq 1622; 1623; F16C-LABEL: cvt_2i16_to_2f64_constrained: 1624; F16C: # %bb.0: 1625; F16C-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1626; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1627; F16C-NEXT: vcvtps2pd %xmm0, %xmm0 1628; F16C-NEXT: retq 1629; 1630; AVX512-LABEL: cvt_2i16_to_2f64_constrained: 1631; AVX512: # %bb.0: 1632; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1633; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1634; AVX512-NEXT: vcvtps2pd %xmm0, %xmm0 1635; AVX512-NEXT: retq 1636 %1 = bitcast <2 x i16> %a0 to <2 x half> 1637 %2 = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half> %1, metadata !"fpexcept.strict") strictfp 1638 ret <2 x double> %2 1639} 1640declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half>, metadata) strictfp 1641 1642define <4 x double> @cvt_4i16_to_4f64_constrained(<4 x i16> %a0) nounwind strictfp { 1643; AVX-LABEL: cvt_4i16_to_4f64_constrained: 1644; AVX: # %bb.0: 1645; AVX-NEXT: subq $72, %rsp 1646; AVX-NEXT: vmovq %xmm0, %rax 1647; AVX-NEXT: movq %rax, %rcx 1648; AVX-NEXT: movl %eax, %edx 1649; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1650; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1651; AVX-NEXT: shrq $48, %rax 1652; AVX-NEXT: shrq $32, %rcx 1653; AVX-NEXT: shrl $16, %edx 1654; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0 1655; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1656; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 1657; AVX-NEXT: vmovdqa 
%xmm0, (%rsp) # 16-byte Spill 1658; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1659; AVX-NEXT: callq __extendhfsf2@PLT 1660; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1661; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1662; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1663; AVX-NEXT: callq __extendhfsf2@PLT 1664; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1665; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1666; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1667; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1668; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1669; AVX-NEXT: callq __extendhfsf2@PLT 1670; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1671; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1672; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1673; AVX-NEXT: callq __extendhfsf2@PLT 1674; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1675; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1676; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1677; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload 1678; AVX-NEXT: addq $72, %rsp 1679; AVX-NEXT: retq 1680; 1681; F16C-LABEL: cvt_4i16_to_4f64_constrained: 1682; F16C: # %bb.0: 1683; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1684; F16C-NEXT: vcvtps2pd %xmm0, %ymm0 1685; F16C-NEXT: retq 1686; 1687; AVX512-LABEL: cvt_4i16_to_4f64_constrained: 1688; AVX512: # %bb.0: 1689; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1690; AVX512-NEXT: vcvtps2pd %xmm0, %ymm0 1691; AVX512-NEXT: retq 1692 %1 = bitcast <4 x i16> %a0 to <4 x half> 1693 %2 = call <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half> %1, metadata !"fpexcept.strict") strictfp 1694 ret <4 x double> %2 1695} 1696declare <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half>, metadata) strictfp 1697 1698define <8 x double> @cvt_8i16_to_8f64_constrained(<8 x i16> %a0) nounwind strictfp { 1699; AVX-LABEL: cvt_8i16_to_8f64_constrained: 1700; AVX: # %bb.0: 1701; AVX-NEXT: subq $88, %rsp 1702; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1703; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 1704; AVX-NEXT: callq __extendhfsf2@PLT 1705; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1706; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1707; AVX-NEXT: vpermilps $245, (%rsp), %xmm0 # 16-byte Folded Reload 1708; AVX-NEXT: # xmm0 = mem[1,1,3,3] 1709; AVX-NEXT: callq __extendhfsf2@PLT 1710; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1711; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1712; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1713; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1714; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1715; AVX-NEXT: callq __extendhfsf2@PLT 1716; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1717; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1718; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 1719; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 1720; AVX-NEXT: callq __extendhfsf2@PLT 1721; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1722; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1723; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1724; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 1725; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1726; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 1727; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = 
xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1728; AVX-NEXT: vzeroupper 1729; AVX-NEXT: callq __extendhfsf2@PLT 1730; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1731; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1732; AVX-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload 1733; AVX-NEXT: # xmm0 = mem[3,3,3,3] 1734; AVX-NEXT: callq __extendhfsf2@PLT 1735; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1736; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1737; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1738; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1739; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 1740; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1741; AVX-NEXT: callq __extendhfsf2@PLT 1742; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1743; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1744; AVX-NEXT: vpermilps $78, (%rsp), %xmm0 # 16-byte Folded Reload 1745; AVX-NEXT: # xmm0 = mem[2,3,0,1] 1746; AVX-NEXT: callq __extendhfsf2@PLT 1747; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1748; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1749; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1750; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 1751; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1752; AVX-NEXT: addq $88, %rsp 1753; AVX-NEXT: retq 1754; 1755; F16C-LABEL: cvt_8i16_to_8f64_constrained: 1756; F16C: # %bb.0: 1757; F16C-NEXT: vcvtph2ps %xmm0, %ymm0 1758; F16C-NEXT: vextractf128 $1, %ymm0, %xmm1 1759; F16C-NEXT: vcvtps2pd %xmm1, %ymm1 1760; F16C-NEXT: vcvtps2pd %xmm0, %ymm0 1761; F16C-NEXT: retq 1762; 1763; AVX512-LABEL: cvt_8i16_to_8f64_constrained: 1764; AVX512: # %bb.0: 1765; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0 1766; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0 1767; AVX512-NEXT: retq 1768 %1 = bitcast <8 x i16> %a0 to <8 x half> 1769 %2 = call <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16(<8 x half> %1, metadata !"fpexcept.strict") strictfp 1770 ret <8 x double> %2 1771} 1772declare <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16(<8 x half>, metadata) strictfp 1773 1774; 1775; Half to Double (Load) 1776; 1777 1778define double @load_cvt_i16_to_f64(ptr %a0) nounwind { 1779; AVX-LABEL: load_cvt_i16_to_f64: 1780; AVX: # %bb.0: 1781; AVX-NEXT: pushq %rax 1782; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 1783; AVX-NEXT: callq __extendhfsf2@PLT 1784; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1785; AVX-NEXT: popq %rax 1786; AVX-NEXT: retq 1787; 1788; F16C-LABEL: load_cvt_i16_to_f64: 1789; F16C: # %bb.0: 1790; F16C-NEXT: movzwl (%rdi), %eax 1791; F16C-NEXT: vmovd %eax, %xmm0 1792; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1793; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1794; F16C-NEXT: retq 1795; 1796; AVX512-LABEL: load_cvt_i16_to_f64: 1797; AVX512: # %bb.0: 1798; AVX512-NEXT: movzwl (%rdi), %eax 1799; AVX512-NEXT: vmovd %eax, %xmm0 1800; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1801; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1802; AVX512-NEXT: retq 1803 %1 = load i16, ptr %a0 1804 %2 = bitcast i16 %1 to half 1805 %3 = fpext half %2 to double 1806 ret double %3 1807} 1808 1809define <2 x double> @load_cvt_2i16_to_2f64(ptr %a0) nounwind { 1810; AVX-LABEL: load_cvt_2i16_to_2f64: 1811; AVX: # %bb.0: 1812; AVX-NEXT: subq $40, %rsp 1813; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 1814; AVX-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1815; AVX-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0 1816; AVX-NEXT: callq __extendhfsf2@PLT 1817; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1818; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1819; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1820; AVX-NEXT: callq __extendhfsf2@PLT 1821; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1822; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 1823; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1824; AVX-NEXT: addq $40, %rsp 1825; AVX-NEXT: retq 1826; 1827; F16C-LABEL: load_cvt_2i16_to_2f64: 1828; F16C: # %bb.0: 1829; F16C-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1830; F16C-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1831; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1832; F16C-NEXT: vcvtps2pd %xmm0, %xmm0 1833; F16C-NEXT: retq 1834; 1835; AVX512-LABEL: load_cvt_2i16_to_2f64: 1836; AVX512: # %bb.0: 1837; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1838; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1839; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1840; AVX512-NEXT: vcvtps2pd %xmm0, %xmm0 1841; AVX512-NEXT: retq 1842 %1 = load <2 x i16>, ptr %a0 1843 %2 = bitcast <2 x i16> %1 to <2 x half> 1844 %3 = fpext <2 x half> %2 to <2 x double> 1845 ret <2 x double> %3 1846} 1847 1848define <4 x double> @load_cvt_4i16_to_4f64(ptr %a0) nounwind { 1849; AVX-LABEL: load_cvt_4i16_to_4f64: 1850; AVX: # %bb.0: 1851; AVX-NEXT: subq $72, %rsp 1852; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 1853; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1854; AVX-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0 1855; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1856; AVX-NEXT: vpinsrw $0, 4(%rdi), %xmm0, %xmm0 1857; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1858; AVX-NEXT: vpinsrw $0, 6(%rdi), %xmm0, %xmm0 1859; AVX-NEXT: callq __extendhfsf2@PLT 1860; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1861; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1862; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1863; AVX-NEXT: callq __extendhfsf2@PLT 1864; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1865; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1866; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1867; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1868; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1869; AVX-NEXT: callq __extendhfsf2@PLT 1870; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1871; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1872; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1873; AVX-NEXT: callq __extendhfsf2@PLT 1874; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1875; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1876; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1877; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload 1878; AVX-NEXT: addq $72, %rsp 1879; AVX-NEXT: retq 1880; 1881; F16C-LABEL: load_cvt_4i16_to_4f64: 1882; F16C: # %bb.0: 1883; F16C-NEXT: vcvtph2ps (%rdi), %xmm0 1884; F16C-NEXT: vcvtps2pd %xmm0, %ymm0 1885; F16C-NEXT: retq 1886; 1887; AVX512-LABEL: load_cvt_4i16_to_4f64: 1888; AVX512: # %bb.0: 1889; AVX512-NEXT: vcvtph2ps (%rdi), %xmm0 1890; AVX512-NEXT: vcvtps2pd %xmm0, %ymm0 1891; AVX512-NEXT: retq 1892 %1 = load <4 x i16>, ptr %a0 1893 %2 = bitcast <4 x i16> %1 to <4 x half> 1894 %3 = fpext <4 x half> %2 to <4 x double> 1895 ret <4 x double> %3 1896} 1897 1898define <4 x double> 
@load_cvt_8i16_to_4f64(ptr %a0) nounwind { 1899; AVX-LABEL: load_cvt_8i16_to_4f64: 1900; AVX: # %bb.0: 1901; AVX-NEXT: subq $72, %rsp 1902; AVX-NEXT: movq (%rdi), %rax 1903; AVX-NEXT: movq %rax, %rcx 1904; AVX-NEXT: movl %eax, %edx 1905; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1906; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1907; AVX-NEXT: shrq $48, %rax 1908; AVX-NEXT: shrq $32, %rcx 1909; AVX-NEXT: shrl $16, %edx 1910; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0 1911; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1912; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 1913; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1914; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1915; AVX-NEXT: callq __extendhfsf2@PLT 1916; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1917; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1918; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1919; AVX-NEXT: callq __extendhfsf2@PLT 1920; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1921; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1922; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1923; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1924; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1925; AVX-NEXT: callq __extendhfsf2@PLT 1926; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1927; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1928; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1929; AVX-NEXT: callq __extendhfsf2@PLT 1930; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1931; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1932; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1933; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload 1934; AVX-NEXT: addq $72, %rsp 1935; AVX-NEXT: retq 1936; 1937; F16C-LABEL: load_cvt_8i16_to_4f64: 1938; F16C: # %bb.0: 1939; F16C-NEXT: vcvtph2ps (%rdi), %xmm0 1940; F16C-NEXT: vcvtps2pd %xmm0, %ymm0 1941; F16C-NEXT: retq 1942; 1943; AVX512-LABEL: load_cvt_8i16_to_4f64: 1944; AVX512: # %bb.0: 1945; AVX512-NEXT: vcvtph2ps (%rdi), %xmm0 1946; AVX512-NEXT: vcvtps2pd %xmm0, %ymm0 1947; AVX512-NEXT: retq 1948 %1 = load <8 x i16>, ptr %a0 1949 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1950 %3 = bitcast <4 x i16> %2 to <4 x half> 1951 %4 = fpext <4 x half> %3 to <4 x double> 1952 ret <4 x double> %4 1953} 1954 1955define <8 x double> @load_cvt_8i16_to_8f64(ptr %a0) nounwind { 1956; AVX1-LABEL: load_cvt_8i16_to_8f64: 1957; AVX1: # %bb.0: 1958; AVX1-NEXT: pushq %rbx 1959; AVX1-NEXT: subq $80, %rsp 1960; AVX1-NEXT: movq %rdi, %rbx 1961; AVX1-NEXT: vmovdqa (%rdi), %xmm0 1962; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1963; AVX1-NEXT: vpinsrw $0, 4(%rdi), %xmm0, %xmm0 1964; AVX1-NEXT: callq __extendhfsf2@PLT 1965; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1966; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1967; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1968; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 1969; AVX1-NEXT: callq __extendhfsf2@PLT 1970; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1971; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1972; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1973; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1974; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1975; AVX1-NEXT: callq __extendhfsf2@PLT 1976; AVX1-NEXT: vcvtss2sd %xmm0, 
%xmm0, %xmm0 1977; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1978; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1979; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 1980; AVX1-NEXT: callq __extendhfsf2@PLT 1981; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1982; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 1983; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1984; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 1985; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1986; AVX1-NEXT: vpinsrw $0, 12(%rbx), %xmm0, %xmm0 1987; AVX1-NEXT: vzeroupper 1988; AVX1-NEXT: callq __extendhfsf2@PLT 1989; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1990; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1991; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1992; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1993; AVX1-NEXT: callq __extendhfsf2@PLT 1994; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1995; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 1996; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1997; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1998; AVX1-NEXT: vpinsrw $0, 8(%rbx), %xmm0, %xmm0 1999; AVX1-NEXT: callq __extendhfsf2@PLT 2000; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2001; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2002; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2003; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2004; AVX1-NEXT: callq __extendhfsf2@PLT 2005; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2006; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2007; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2008; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload 2009; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2010; AVX1-NEXT: addq $80, %rsp 2011; AVX1-NEXT: popq %rbx 2012; AVX1-NEXT: retq 2013; 2014; AVX2-LABEL: load_cvt_8i16_to_8f64: 2015; AVX2: # %bb.0: 2016; AVX2-NEXT: pushq %rbx 2017; AVX2-NEXT: subq $80, %rsp 2018; AVX2-NEXT: movq %rdi, %rbx 2019; AVX2-NEXT: vmovdqa (%rdi), %xmm0 2020; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2021; AVX2-NEXT: vpinsrw $0, 4(%rdi), %xmm0, %xmm0 2022; AVX2-NEXT: callq __extendhfsf2@PLT 2023; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2024; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2025; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2026; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0 2027; AVX2-NEXT: callq __extendhfsf2@PLT 2028; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2029; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2030; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2031; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2032; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2033; AVX2-NEXT: callq __extendhfsf2@PLT 2034; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2035; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2036; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2037; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 2038; AVX2-NEXT: callq __extendhfsf2@PLT 2039; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2040; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 2041; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2042; AVX2-NEXT: vinsertf128 $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 2043; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2044; AVX2-NEXT: vpinsrw $0, 12(%rbx), %xmm0, %xmm0 2045; AVX2-NEXT: vzeroupper 2046; AVX2-NEXT: callq __extendhfsf2@PLT 2047; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2048; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2049; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2050; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2051; AVX2-NEXT: callq __extendhfsf2@PLT 2052; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2053; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 2054; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2055; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2056; AVX2-NEXT: vpinsrw $0, 8(%rbx), %xmm0, %xmm0 2057; AVX2-NEXT: callq __extendhfsf2@PLT 2058; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2059; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2060; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2061; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2062; AVX2-NEXT: callq __extendhfsf2@PLT 2063; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2064; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2065; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2066; AVX2-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload 2067; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2068; AVX2-NEXT: addq $80, %rsp 2069; AVX2-NEXT: popq %rbx 2070; AVX2-NEXT: retq 2071; 2072; F16C-LABEL: load_cvt_8i16_to_8f64: 2073; F16C: # %bb.0: 2074; F16C-NEXT: vcvtph2ps (%rdi), %ymm1 2075; F16C-NEXT: vcvtps2pd %xmm1, %ymm0 2076; F16C-NEXT: vextractf128 $1, %ymm1, %xmm1 2077; F16C-NEXT: vcvtps2pd %xmm1, %ymm1 2078; F16C-NEXT: retq 2079; 2080; AVX512-LABEL: load_cvt_8i16_to_8f64: 2081; AVX512: # %bb.0: 2082; AVX512-NEXT: vcvtph2ps (%rdi), %ymm0 2083; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0 2084; AVX512-NEXT: retq 2085 %1 = load <8 x i16>, ptr %a0 2086 %2 = bitcast <8 x i16> %1 to <8 x half> 2087 %3 = fpext <8 x half> %2 to <8 x double> 2088 ret <8 x double> %3 2089} 2090 2091; 2092; Float to Half 2093; 2094 2095define i16 @cvt_f32_to_i16(float %a0) nounwind { 2096; AVX-LABEL: cvt_f32_to_i16: 2097; AVX: # %bb.0: 2098; AVX-NEXT: pushq %rax 2099; AVX-NEXT: callq __truncsfhf2@PLT 2100; AVX-NEXT: vpextrw $0, %xmm0, %eax 2101; AVX-NEXT: # kill: def $ax killed $ax killed $eax 2102; AVX-NEXT: popq %rcx 2103; AVX-NEXT: retq 2104; 2105; F16C-LABEL: cvt_f32_to_i16: 2106; F16C: # %bb.0: 2107; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2108; F16C-NEXT: vmovd %xmm0, %eax 2109; F16C-NEXT: # kill: def $ax killed $ax killed $eax 2110; F16C-NEXT: retq 2111; 2112; AVX512-LABEL: cvt_f32_to_i16: 2113; AVX512: # %bb.0: 2114; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2115; AVX512-NEXT: vmovd %xmm0, %eax 2116; AVX512-NEXT: # kill: def $ax killed $ax killed $eax 2117; AVX512-NEXT: retq 2118 %1 = fptrunc float %a0 to half 2119 %2 = bitcast half %1 to i16 2120 ret i16 %2 2121} 2122 2123define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind { 2124; AVX-LABEL: cvt_4f32_to_4i16: 2125; AVX: # %bb.0: 2126; AVX-NEXT: subq $72, %rsp 2127; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2128; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 2129; AVX-NEXT: callq __truncsfhf2@PLT 2130; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2131; AVX-NEXT: 
vmovaps (%rsp), %xmm0 # 16-byte Reload 2132; AVX-NEXT: callq __truncsfhf2@PLT 2133; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2134; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 2135; AVX-NEXT: # xmm0 = mem[1,0] 2136; AVX-NEXT: callq __truncsfhf2@PLT 2137; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2138; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload 2139; AVX-NEXT: # xmm0 = mem[3,3,3,3] 2140; AVX-NEXT: callq __truncsfhf2@PLT 2141; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2142; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2143; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2144; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 2145; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 2146; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 2147; AVX-NEXT: addq $72, %rsp 2148; AVX-NEXT: retq 2149; 2150; F16C-LABEL: cvt_4f32_to_4i16: 2151; F16C: # %bb.0: 2152; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2153; F16C-NEXT: retq 2154; 2155; AVX512-LABEL: cvt_4f32_to_4i16: 2156; AVX512: # %bb.0: 2157; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2158; AVX512-NEXT: retq 2159 %1 = fptrunc <4 x float> %a0 to <4 x half> 2160 %2 = bitcast <4 x half> %1 to <4 x i16> 2161 ret <4 x i16> %2 2162} 2163 2164define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind { 2165; AVX-LABEL: cvt_4f32_to_8i16_undef: 2166; AVX: # %bb.0: 2167; AVX-NEXT: subq $72, %rsp 2168; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2169; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 2170; AVX-NEXT: callq __truncsfhf2@PLT 2171; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2172; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2173; AVX-NEXT: callq __truncsfhf2@PLT 2174; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2175; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 2176; AVX-NEXT: # xmm0 = mem[1,0] 2177; AVX-NEXT: callq __truncsfhf2@PLT 2178; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2179; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload 2180; AVX-NEXT: # xmm0 = mem[3,3,3,3] 2181; AVX-NEXT: callq __truncsfhf2@PLT 2182; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2183; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2184; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2185; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 2186; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 2187; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 2188; AVX-NEXT: addq $72, %rsp 2189; AVX-NEXT: retq 2190; 2191; F16C-LABEL: cvt_4f32_to_8i16_undef: 2192; F16C: # %bb.0: 2193; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2194; F16C-NEXT: retq 2195; 2196; AVX512-LABEL: cvt_4f32_to_8i16_undef: 2197; AVX512: # %bb.0: 2198; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2199; AVX512-NEXT: retq 2200 %1 = fptrunc <4 x float> %a0 to <4 x half> 2201 %2 = bitcast <4 x half> %1 to <4 x i16> 2202 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2203 ret <8 x i16> %3 2204} 2205 2206define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind { 2207; AVX-LABEL: cvt_4f32_to_8i16_zero: 2208; AVX: 
# %bb.0: 2209; AVX-NEXT: subq $72, %rsp 2210; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2211; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 2212; AVX-NEXT: callq __truncsfhf2@PLT 2213; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2214; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2215; AVX-NEXT: callq __truncsfhf2@PLT 2216; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2217; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 2218; AVX-NEXT: # xmm0 = mem[1,0] 2219; AVX-NEXT: callq __truncsfhf2@PLT 2220; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2221; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload 2222; AVX-NEXT: # xmm0 = mem[3,3,3,3] 2223; AVX-NEXT: callq __truncsfhf2@PLT 2224; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2225; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2226; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2227; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 2228; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 2229; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 2230; AVX-NEXT: addq $72, %rsp 2231; AVX-NEXT: retq 2232; 2233; F16C-LABEL: cvt_4f32_to_8i16_zero: 2234; F16C: # %bb.0: 2235; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2236; F16C-NEXT: retq 2237; 2238; AVX512-LABEL: cvt_4f32_to_8i16_zero: 2239; AVX512: # %bb.0: 2240; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2241; AVX512-NEXT: retq 2242 %1 = fptrunc <4 x float> %a0 to <4 x half> 2243 %2 = bitcast <4 x half> %1 to <4 x i16> 2244 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2245 ret <8 x i16> %3 2246} 2247 2248define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind { 2249; AVX-LABEL: cvt_8f32_to_8i16: 2250; AVX: # %bb.0: 2251; AVX-NEXT: subq $88, %rsp 2252; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2253; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 2254; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2255; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 2256; AVX-NEXT: vzeroupper 2257; AVX-NEXT: callq __truncsfhf2@PLT 2258; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2259; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2260; AVX-NEXT: # xmm0 = mem[1,0] 2261; AVX-NEXT: callq __truncsfhf2@PLT 2262; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2263; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 2264; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 2265; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2266; AVX-NEXT: callq __truncsfhf2@PLT 2267; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2268; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2269; AVX-NEXT: # xmm0 = mem[1,1,3,3] 2270; AVX-NEXT: callq __truncsfhf2@PLT 2271; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2272; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2273; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2274; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 2275; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2276; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Folded Reload 2277; AVX-NEXT: # xmm0 = mem[3,3,3,3] 2278; AVX-NEXT: callq __truncsfhf2@PLT 2279; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2280; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2281; AVX-NEXT: # xmm0 = mem[1,0] 2282; AVX-NEXT: callq __truncsfhf2@PLT 2283; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2284; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 2285; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 2286; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2287; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2288; AVX-NEXT: vzeroupper 2289; AVX-NEXT: callq __truncsfhf2@PLT 2290; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2291; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2292; AVX-NEXT: # xmm0 = mem[1,1,3,3] 2293; AVX-NEXT: callq __truncsfhf2@PLT 2294; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2295; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2296; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2297; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 2298; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2299; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 2300; AVX-NEXT: addq $88, %rsp 2301; AVX-NEXT: retq 2302; 2303; F16C-LABEL: cvt_8f32_to_8i16: 2304; F16C: # %bb.0: 2305; F16C-NEXT: vcvtps2ph $4, %ymm0, %xmm0 2306; F16C-NEXT: vzeroupper 2307; F16C-NEXT: retq 2308; 2309; AVX512-LABEL: cvt_8f32_to_8i16: 2310; AVX512: # %bb.0: 2311; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0 2312; AVX512-NEXT: vzeroupper 2313; AVX512-NEXT: retq 2314 %1 = fptrunc <8 x float> %a0 to <8 x half> 2315 %2 = bitcast <8 x half> %1 to <8 x i16> 2316 ret <8 x i16> %2 2317} 2318 2319define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { 2320; AVX1-LABEL: cvt_16f32_to_16i16: 2321; AVX1: # %bb.0: 2322; AVX1-NEXT: subq $120, %rsp 2323; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2324; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2325; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 2326; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2327; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 2328; AVX1-NEXT: vzeroupper 2329; AVX1-NEXT: callq __truncsfhf2@PLT 2330; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2331; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2332; AVX1-NEXT: # xmm0 = mem[1,0] 2333; AVX1-NEXT: callq __truncsfhf2@PLT 2334; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2335; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 2336; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 2337; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2338; AVX1-NEXT: callq __truncsfhf2@PLT 2339; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2340; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2341; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 2342; AVX1-NEXT: callq __truncsfhf2@PLT 2343; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2344; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2345; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2346; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 2347; AVX1-NEXT: vmovdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2348; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2349; AVX1-NEXT: # xmm0 = mem[3,3,3,3] 2350; AVX1-NEXT: callq __truncsfhf2@PLT 2351; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2352; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2353; AVX1-NEXT: # xmm0 = mem[1,0] 2354; AVX1-NEXT: callq __truncsfhf2@PLT 2355; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2356; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 2357; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 2358; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2359; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2360; AVX1-NEXT: vzeroupper 2361; AVX1-NEXT: callq __truncsfhf2@PLT 2362; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2363; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2364; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 2365; AVX1-NEXT: callq __truncsfhf2@PLT 2366; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2367; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2368; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2369; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 2370; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2371; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] 2372; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2373; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2374; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2375; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2376; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 2377; AVX1-NEXT: vzeroupper 2378; AVX1-NEXT: callq __truncsfhf2@PLT 2379; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2380; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2381; AVX1-NEXT: # xmm0 = mem[1,0] 2382; AVX1-NEXT: callq __truncsfhf2@PLT 2383; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2384; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 2385; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 2386; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2387; AVX1-NEXT: callq __truncsfhf2@PLT 2388; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2389; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2390; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 2391; AVX1-NEXT: callq __truncsfhf2@PLT 2392; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2393; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2394; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2395; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 2396; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2397; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2398; AVX1-NEXT: # xmm0 = mem[3,3,3,3] 2399; AVX1-NEXT: callq __truncsfhf2@PLT 2400; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2401; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2402; AVX1-NEXT: # xmm0 = mem[1,0] 2403; AVX1-NEXT: callq __truncsfhf2@PLT 2404; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2405; 
AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 2406; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 2407; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2408; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2409; AVX1-NEXT: vzeroupper 2410; AVX1-NEXT: callq __truncsfhf2@PLT 2411; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2412; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2413; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 2414; AVX1-NEXT: callq __truncsfhf2@PLT 2415; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2416; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2417; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2418; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 2419; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2420; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] 2421; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 2422; AVX1-NEXT: addq $120, %rsp 2423; AVX1-NEXT: retq 2424; 2425; AVX2-LABEL: cvt_16f32_to_16i16: 2426; AVX2: # %bb.0: 2427; AVX2-NEXT: subq $120, %rsp 2428; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2429; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2430; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 2431; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2432; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 2433; AVX2-NEXT: vzeroupper 2434; AVX2-NEXT: callq __truncsfhf2@PLT 2435; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2436; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2437; AVX2-NEXT: # xmm0 = mem[1,0] 2438; AVX2-NEXT: callq __truncsfhf2@PLT 2439; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2440; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 2441; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 2442; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2443; AVX2-NEXT: callq __truncsfhf2@PLT 2444; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2445; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2446; AVX2-NEXT: # xmm0 = mem[1,1,3,3] 2447; AVX2-NEXT: callq __truncsfhf2@PLT 2448; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2449; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2450; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2451; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 2452; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2453; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2454; AVX2-NEXT: # xmm0 = mem[3,3,3,3] 2455; AVX2-NEXT: callq __truncsfhf2@PLT 2456; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2457; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2458; AVX2-NEXT: # xmm0 = mem[1,0] 2459; AVX2-NEXT: callq __truncsfhf2@PLT 2460; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2461; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 2462; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 2463; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2464; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: addq $120, %rsp
; AVX2-NEXT: retq
;
; F16C-LABEL: cvt_16f32_to_16i16:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %ymm1, %xmm1
; F16C-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_16f32_to_16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0
; AVX512-NEXT: retq
  %1 = fptrunc <16 x float> %a0 to <16 x half>
  %2 = bitcast <16 x half> %1 to <16 x i16>
  ret <16 x i16> %2
}

;
; Float to Half (Store)
;

define void @store_cvt_f32_to_i16(float %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_f32_to_i16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbx
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
;
; F16C-LABEL: store_cvt_f32_to_i16:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vmovd %xmm0, %eax
; F16C-NEXT: movw %ax, (%rdi)
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_f32_to_i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: movw %ax, (%rdi)
; AVX512-NEXT: retq
  %1 = fptrunc float %a0 to half
  %2 = bitcast half %1 to i16
  store i16 %2, ptr %a1
  ret void
}

define void @store_cvt_4f32_to_4i16(<4 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_4f32_to_4i16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[3,3,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpextrw $0, %xmm0, 6(%rbx)
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpextrw $0, %xmm0, 4(%rbx)
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpextrw $0, %xmm0, 2(%rbx)
; AVX-NEXT: addq $64, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
;
; F16C-LABEL: store_cvt_4f32_to_4i16:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm0, (%rdi)
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_4f32_to_4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %xmm0, (%rdi)
; AVX512-NEXT: retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  store <4 x i16> %2, ptr %a1
  ret void
}

define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_4f32_to_8i16_undef:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[3,3,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX-NEXT: vmovaps %xmm0, (%rbx)
; AVX-NEXT: addq $64, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
;
; F16C-LABEL: store_cvt_4f32_to_8i16_undef:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vmovaps %xmm0, (%rdi)
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_4f32_to_8i16_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, (%rdi)
; AVX512-NEXT: retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, ptr %a1
  ret void
}

define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_4f32_to_8i16_zero:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[3,3,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX-NEXT: vmovaps %xmm0, (%rbx)
; AVX-NEXT: addq $64, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
;
; F16C-LABEL: store_cvt_4f32_to_8i16_zero:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vmovaps %xmm0, (%rdi)
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_4f32_to_8i16_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, (%rdi)
; AVX512-NEXT: retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, ptr %a1
  ret void
}

define void @store_cvt_8f32_to_8i16(<8 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_8f32_to_8i16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $80, %rsp
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,1,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[3,3,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,1,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovdqa %xmm0, (%rbx)
; AVX-NEXT: addq $80, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
;
; F16C-LABEL: store_cvt_8f32_to_8i16:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; F16C-NEXT: vzeroupper
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_8f32_to_8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = fptrunc <8 x float> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  store <8 x i16> %2, ptr %a1
  ret void
}

define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind {
; AVX1-LABEL: store_cvt_16f32_to_16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $112, %rsp
; AVX1-NEXT: movq %rdi, %rbx
; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[3,3,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[3,3,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT: vmovaps %ymm0, (%rbx)
; AVX1-NEXT: addq $112, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_16f32_to_16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $112, %rsp
; AVX2-NEXT: movq %rdi, %rbx
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovdqa %ymm0, (%rbx)
; AVX2-NEXT: addq $112, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; F16C-LABEL: store_cvt_16f32_to_16i16:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi)
; F16C-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; F16C-NEXT: vzeroupper
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_16f32_to_16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %zmm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = fptrunc <16 x float> %a0 to <16 x half>
  %2 = bitcast <16 x half> %1 to <16 x i16>
  store <16 x i16> %2, ptr %a1
  ret void
}

;
; Double to Half
;

define i16 @cvt_f64_to_i16(double %a0) nounwind {
; AVX-LABEL: cvt_f64_to_i16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rax
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vpextrw $0, %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: popq %rcx
; AVX-NEXT: retq
;
; F16C-LABEL: cvt_f64_to_i16:
; F16C: # %bb.0:
; F16C-NEXT: pushq %rax
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vpextrw $0, %xmm0, %eax
; F16C-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-NEXT: popq %rcx
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_f64_to_i16:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rax
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vpextrw $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: popq %rcx
; AVX512-NEXT: retq
  %1 = fptrunc double %a0 to half
  %2 = bitcast half %1 to i16
  ret i16 %2
}

define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
; AVX-LABEL: cvt_2f64_to_2i16:
; AVX: # %bb.0:
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: cvt_2f64_to_2i16:
; F16C: # %bb.0:
; F16C-NEXT: subq $40, %rsp
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: addq $40, %rsp
; F16C-NEXT: retq
  %1 = fptrunc <2 x double> %a0 to <2 x half>
  %2 = bitcast <2 x half> %1 to <2 x i16>
  ret <2 x i16> %2
}

define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX1-LABEL: cvt_4f64_to_4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $88, %rsp
; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT: addq $88, %rsp
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4f64_to_4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $88, %rsp
; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX2-NEXT: addq $88, %rsp
; AVX2-NEXT: retq
;
; F16C-LABEL: cvt_4f64_to_4i16:
; F16C: # %bb.0:
; F16C-NEXT: subq $72, %rsp
; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
; F16C-NEXT: addq $72, %rsp
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_4f64_to_4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $72, %rsp
; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX512-NEXT: addq $72, %rsp
; AVX512-NEXT: retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  ret <4 x i16> %2
}

define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX1-LABEL: cvt_4f64_to_8i16_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $88, %rsp
; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT: addq $88, %rsp
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4f64_to_8i16_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $88, %rsp
; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX2-NEXT: addq $88, %rsp
; AVX2-NEXT: retq
;
; F16C-LABEL: cvt_4f64_to_8i16_undef:
; F16C: # %bb.0:
; F16C-NEXT: subq $72, %rsp
; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
; F16C-NEXT: addq $72, %rsp
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_4f64_to_8i16_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $72, %rsp
; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX512-NEXT: addq $72, %rsp
; AVX512-NEXT: retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %3
}

define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX1-LABEL: cvt_4f64_to_8i16_zero:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $88, %rsp
; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT: addq $88, %rsp
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4f64_to_8i16_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $88, %rsp
; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX2-NEXT: addq $88, %rsp
; AVX2-NEXT: retq
;
; F16C-LABEL: cvt_4f64_to_8i16_zero:
; F16C: # %bb.0:
; F16C-NEXT: subq $72, %rsp
; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
; F16C-NEXT: addq $72, %rsp
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_4f64_to_8i16_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $72, %rsp
; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
; AVX512-NEXT: addq $72, %rsp
; AVX512-NEXT: retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %3
}

define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX-LABEL: cvt_8f64_to_8i16:
; AVX: # %bb.0:
; AVX-NEXT: subq $104, %rsp
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $104, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: cvt_8f64_to_8i16:
; F16C: # %bb.0:
; F16C-NEXT: subq $104, %rsp
; F16C-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT: vextractf128 $1, %ymm1, %xmm0
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; F16C-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = xmm0[0],mem[0]
; F16C-NEXT: addq $104, %rsp
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_8f64_to_8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $120, %rsp
; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vzeroupper
3644; AVX512-NEXT: callq __truncdfhf2@PLT 3645; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3646; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3647; AVX512-NEXT: # xmm0 = mem[1,0] 3648; AVX512-NEXT: callq __truncdfhf2@PLT 3649; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 3650; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3651; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3652; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 3653; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3654; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 3655; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3656; AVX512-NEXT: vzeroupper 3657; AVX512-NEXT: callq __truncdfhf2@PLT 3658; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3659; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3660; AVX512-NEXT: # xmm0 = mem[1,0] 3661; AVX512-NEXT: callq __truncdfhf2@PLT 3662; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 3663; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3664; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 3665; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 3666; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3667; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3668; AVX512-NEXT: vzeroupper 3669; AVX512-NEXT: callq __truncdfhf2@PLT 3670; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3671; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3672; AVX512-NEXT: # xmm0 = mem[1,0] 3673; AVX512-NEXT: callq __truncdfhf2@PLT 3674; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3675; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3676; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 3677; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3678; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3679; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] 3680; AVX512-NEXT: addq $120, %rsp 3681; AVX512-NEXT: retq 3682 %1 = fptrunc <8 x double> %a0 to <8 x half> 3683 %2 = bitcast <8 x half> %1 to <8 x i16> 3684 ret <8 x i16> %2 3685} 3686 3687; 3688; Double to Half (Store) 3689; 3690 3691define void @store_cvt_f64_to_i16(double %a0, ptr %a1) nounwind { 3700; AVX-LABEL: store_cvt_f64_to_i16: 3701; AVX: # %bb.0: 3702; AVX-NEXT: pushq %rbx 3703; AVX-NEXT: movq %rdi, %rbx 3704; AVX-NEXT: callq __truncdfhf2@PLT 3705; AVX-NEXT: vpextrw $0, %xmm0, (%rbx) 3706; AVX-NEXT: popq %rbx 3707; AVX-NEXT: retq 3708; 3709; F16C-LABEL: store_cvt_f64_to_i16: 3710; F16C: # %bb.0: 3711; F16C-NEXT: pushq %rbx 3712; F16C-NEXT: movq %rdi, %rbx 3713; F16C-NEXT: callq __truncdfhf2@PLT 3714; F16C-NEXT: vpextrw $0, %xmm0, (%rbx) 3715; F16C-NEXT: popq %rbx 3716; F16C-NEXT: retq 3717; 3718; AVX512-LABEL: store_cvt_f64_to_i16: 3719; AVX512: # %bb.0: 3720; AVX512-NEXT: pushq %rbx 3721; AVX512-NEXT: movq %rdi, %rbx 3722; AVX512-NEXT: callq __truncdfhf2@PLT 3723; AVX512-NEXT: vpextrw
$0, %xmm0, (%rbx) 3724; AVX512-NEXT: popq %rbx 3725; AVX512-NEXT: retq 3726 %1 = fptrunc double %a0 to half 3727 %2 = bitcast half %1 to i16 3728 store i16 %2, ptr %a1 3729 ret void 3730} 3731 3732define void @store_cvt_2f64_to_2i16(<2 x double> %a0, ptr %a1) nounwind { 3733; AVX-LABEL: store_cvt_2f64_to_2i16: 3734; AVX: # %bb.0: 3735; AVX-NEXT: pushq %rbx 3736; AVX-NEXT: subq $32, %rsp 3737; AVX-NEXT: movq %rdi, %rbx 3738; AVX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3739; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3740; AVX-NEXT: callq __truncdfhf2@PLT 3741; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3742; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 3743; AVX-NEXT: callq __truncdfhf2@PLT 3744; AVX-NEXT: vpextrw $0, %xmm0, (%rbx) 3745; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3746; AVX-NEXT: vpextrw $0, %xmm0, 2(%rbx) 3747; AVX-NEXT: addq $32, %rsp 3748; AVX-NEXT: popq %rbx 3749; AVX-NEXT: retq 3750; 3751; F16C-LABEL: store_cvt_2f64_to_2i16: 3752; F16C: # %bb.0: 3753; F16C-NEXT: pushq %rbx 3754; F16C-NEXT: subq $32, %rsp 3755; F16C-NEXT: movq %rdi, %rbx 3756; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3757; F16C-NEXT: callq __truncdfhf2@PLT 3758; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3759; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 3760; F16C-NEXT: # xmm0 = mem[1,0] 3761; F16C-NEXT: callq __truncdfhf2@PLT 3762; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3763; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3764; F16C-NEXT: vmovd %xmm0, (%rbx) 3765; F16C-NEXT: addq $32, %rsp 3766; F16C-NEXT: popq %rbx 3767; F16C-NEXT: retq 3768; 3769; AVX512-LABEL: store_cvt_2f64_to_2i16: 3770; AVX512: # %bb.0: 3771; AVX512-NEXT: pushq %rbx 3772; AVX512-NEXT: subq $32, %rsp 3773; AVX512-NEXT: movq %rdi, %rbx 3774; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3775; AVX512-NEXT: callq __truncdfhf2@PLT 3776; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3777; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 3778; AVX512-NEXT: # xmm0 = mem[1,0] 3779; AVX512-NEXT: callq __truncdfhf2@PLT 3780; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3781; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3782; AVX512-NEXT: vmovd %xmm0, (%rbx) 3783; AVX512-NEXT: addq $32, %rsp 3784; AVX512-NEXT: popq %rbx 3785; AVX512-NEXT: retq 3786 %1 = fptrunc <2 x double> %a0 to <2 x half> 3787 %2 = bitcast <2 x half> %1 to <2 x i16> 3788 store <2 x i16> %2, ptr %a1 3789 ret void 3790} 3791 3792define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind { 3793; AVX1-LABEL: store_cvt_4f64_to_4i16: 3794; AVX1: # %bb.0: 3795; AVX1-NEXT: pushq %rbx 3796; AVX1-NEXT: subq $80, %rsp 3797; AVX1-NEXT: movq %rdi, %rbx 3798; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3799; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3800; AVX1-NEXT: vzeroupper 3801; AVX1-NEXT: callq __truncdfhf2@PLT 3802; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3803; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3804; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3805; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3806; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3807; AVX1-NEXT: vzeroupper 3808; AVX1-NEXT: callq __truncdfhf2@PLT 3809; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill 3810; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3811; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3812; AVX1-NEXT: vzeroupper 3813; AVX1-NEXT: callq __truncdfhf2@PLT 3814; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3815; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3816; AVX1-NEXT: callq __truncdfhf2@PLT 3817; AVX1-NEXT: vpextrw $0, %xmm0, 4(%rbx) 3818; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 3819; AVX1-NEXT: vpextrw $0, %xmm0, (%rbx) 3820; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3821; AVX1-NEXT: vpextrw $0, %xmm0, 6(%rbx) 3822; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3823; AVX1-NEXT: vpextrw $0, %xmm0, 2(%rbx) 3824; AVX1-NEXT: addq $80, %rsp 3825; AVX1-NEXT: popq %rbx 3826; AVX1-NEXT: retq 3827; 3828; AVX2-LABEL: store_cvt_4f64_to_4i16: 3829; AVX2: # %bb.0: 3830; AVX2-NEXT: pushq %rbx 3831; AVX2-NEXT: subq $80, %rsp 3832; AVX2-NEXT: movq %rdi, %rbx 3833; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3834; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3835; AVX2-NEXT: vzeroupper 3836; AVX2-NEXT: callq __truncdfhf2@PLT 3837; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3838; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3839; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3840; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3841; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3842; AVX2-NEXT: vzeroupper 3843; AVX2-NEXT: callq __truncdfhf2@PLT 3844; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3845; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3846; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3847; AVX2-NEXT: vzeroupper 3848; AVX2-NEXT: callq __truncdfhf2@PLT 3849; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3850; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3851; AVX2-NEXT: callq __truncdfhf2@PLT 3852; AVX2-NEXT: vpextrw $0, %xmm0, 4(%rbx) 3853; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 3854; AVX2-NEXT: vpextrw $0, %xmm0, (%rbx) 3855; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3856; AVX2-NEXT: vpextrw $0, %xmm0, 6(%rbx) 3857; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3858; AVX2-NEXT: vpextrw $0, %xmm0, 2(%rbx) 3859; AVX2-NEXT: addq $80, %rsp 3860; AVX2-NEXT: popq %rbx 3861; AVX2-NEXT: retq 3862; 3863; F16C-LABEL: store_cvt_4f64_to_4i16: 3864; F16C: # %bb.0: 3865; F16C-NEXT: pushq %rbx 3866; F16C-NEXT: subq $64, %rsp 3867; F16C-NEXT: movq %rdi, %rbx 3868; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3869; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0 3870; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3871; F16C-NEXT: vzeroupper 3872; F16C-NEXT: callq __truncdfhf2@PLT 3873; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3874; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 3875; F16C-NEXT: # xmm0 = mem[1,0] 3876; F16C-NEXT: callq __truncdfhf2@PLT 3877; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3878; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3879; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3880; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3881; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3882; F16C-NEXT: vzeroupper 3883; F16C-NEXT: callq __truncdfhf2@PLT 3884; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 
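; NOTE: even with F16C there is no packed f64->f16 instruction (vcvtps2ph only handles f32 sources), so each double still goes through the __truncdfhf2 libcall and the four half results are knit back together with vpunpcklwd/vpunpckldq before the final store.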
3885; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3886; F16C-NEXT: # xmm0 = mem[1,0] 3887; F16C-NEXT: callq __truncdfhf2@PLT 3888; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 3889; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3890; F16C-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3891; F16C-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 3892; F16C-NEXT: vmovq %xmm0, (%rbx) 3893; F16C-NEXT: addq $64, %rsp 3894; F16C-NEXT: popq %rbx 3895; F16C-NEXT: retq 3896; 3897; AVX512-LABEL: store_cvt_4f64_to_4i16: 3898; AVX512: # %bb.0: 3899; AVX512-NEXT: pushq %rbx 3900; AVX512-NEXT: subq $64, %rsp 3901; AVX512-NEXT: movq %rdi, %rbx 3902; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3903; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3904; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3905; AVX512-NEXT: vzeroupper 3906; AVX512-NEXT: callq __truncdfhf2@PLT 3907; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3908; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 3909; AVX512-NEXT: # xmm0 = mem[1,0] 3910; AVX512-NEXT: callq __truncdfhf2@PLT 3911; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3912; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3913; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3914; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3915; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3916; AVX512-NEXT: vzeroupper 3917; AVX512-NEXT: callq __truncdfhf2@PLT 3918; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3919; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3920; AVX512-NEXT: # xmm0 = mem[1,0] 3921; AVX512-NEXT: callq __truncdfhf2@PLT 3922; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 3923; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3924; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3925; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 3926; AVX512-NEXT: vmovq %xmm0, (%rbx) 3927; AVX512-NEXT: addq $64, %rsp 3928; AVX512-NEXT: popq %rbx 3929; AVX512-NEXT: retq 3930 %1 = fptrunc <4 x double> %a0 to <4 x half> 3931 %2 = bitcast <4 x half> %1 to <4 x i16> 3932 store <4 x i16> %2, ptr %a1 3933 ret void 3934} 3935 3936define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind { 3937; AVX1-LABEL: store_cvt_4f64_to_8i16_undef: 3938; AVX1: # %bb.0: 3939; AVX1-NEXT: pushq %rbx 3940; AVX1-NEXT: subq $80, %rsp 3941; AVX1-NEXT: movq %rdi, %rbx 3942; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3943; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3944; AVX1-NEXT: vzeroupper 3945; AVX1-NEXT: callq __truncdfhf2@PLT 3946; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3947; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3948; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3949; AVX1-NEXT: vzeroupper 3950; AVX1-NEXT: callq __truncdfhf2@PLT 3951; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3952; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3953; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3954; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3955; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3956; AVX1-NEXT: vzeroupper 3957; AVX1-NEXT: callq 
__truncdfhf2@PLT 3958; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3959; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 3960; AVX1-NEXT: callq __truncdfhf2@PLT 3961; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3962; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 3963; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3964; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 3965; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 3966; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 3967; AVX1-NEXT: vmovaps %xmm0, (%rbx) 3968; AVX1-NEXT: addq $80, %rsp 3969; AVX1-NEXT: popq %rbx 3970; AVX1-NEXT: retq 3971; 3972; AVX2-LABEL: store_cvt_4f64_to_8i16_undef: 3973; AVX2: # %bb.0: 3974; AVX2-NEXT: pushq %rbx 3975; AVX2-NEXT: subq $80, %rsp 3976; AVX2-NEXT: movq %rdi, %rbx 3977; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3978; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3979; AVX2-NEXT: vzeroupper 3980; AVX2-NEXT: callq __truncdfhf2@PLT 3981; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3982; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3983; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3984; AVX2-NEXT: vzeroupper 3985; AVX2-NEXT: callq __truncdfhf2@PLT 3986; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3987; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3988; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3989; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3990; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3991; AVX2-NEXT: vzeroupper 3992; AVX2-NEXT: callq __truncdfhf2@PLT 3993; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3994; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 3995; AVX2-NEXT: callq __truncdfhf2@PLT 3996; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3997; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 3998; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3999; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 4000; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 4001; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 4002; AVX2-NEXT: vmovaps %xmm0, (%rbx) 4003; AVX2-NEXT: addq $80, %rsp 4004; AVX2-NEXT: popq %rbx 4005; AVX2-NEXT: retq 4006; 4007; F16C-LABEL: store_cvt_4f64_to_8i16_undef: 4008; F16C: # %bb.0: 4009; F16C-NEXT: pushq %rbx 4010; F16C-NEXT: subq $64, %rsp 4011; F16C-NEXT: movq %rdi, %rbx 4012; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4013; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0 4014; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4015; F16C-NEXT: vzeroupper 4016; F16C-NEXT: callq __truncdfhf2@PLT 4017; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4018; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4019; F16C-NEXT: # xmm0 = mem[1,0] 4020; F16C-NEXT: callq __truncdfhf2@PLT 4021; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4022; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4023; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4024; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4025; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed 
$ymm0 4026; F16C-NEXT: vzeroupper 4027; F16C-NEXT: callq __truncdfhf2@PLT 4028; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4029; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4030; F16C-NEXT: # xmm0 = mem[1,0] 4031; F16C-NEXT: callq __truncdfhf2@PLT 4032; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4033; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4034; F16C-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4035; F16C-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero 4036; F16C-NEXT: vmovaps %xmm0, (%rbx) 4037; F16C-NEXT: addq $64, %rsp 4038; F16C-NEXT: popq %rbx 4039; F16C-NEXT: retq 4040; 4041; AVX512-LABEL: store_cvt_4f64_to_8i16_undef: 4042; AVX512: # %bb.0: 4043; AVX512-NEXT: pushq %rbx 4044; AVX512-NEXT: subq $64, %rsp 4045; AVX512-NEXT: movq %rdi, %rbx 4046; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4047; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 4048; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4049; AVX512-NEXT: vzeroupper 4050; AVX512-NEXT: callq __truncdfhf2@PLT 4051; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4052; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4053; AVX512-NEXT: # xmm0 = mem[1,0] 4054; AVX512-NEXT: callq __truncdfhf2@PLT 4055; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4056; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4057; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4058; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4059; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4060; AVX512-NEXT: vzeroupper 4061; AVX512-NEXT: callq __truncdfhf2@PLT 4062; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4063; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4064; AVX512-NEXT: # xmm0 = mem[1,0] 4065; AVX512-NEXT: callq __truncdfhf2@PLT 4066; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4067; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4068; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4069; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4070; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4071; AVX512-NEXT: callq __truncdfhf2@PLT 4072; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 4073; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4074; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] 4075; AVX512-NEXT: vmovaps %xmm0, (%rbx) 4076; AVX512-NEXT: addq $64, %rsp 4077; AVX512-NEXT: popq %rbx 4078; AVX512-NEXT: retq 4079 %1 = fptrunc <4 x double> %a0 to <4 x half> 4080 %2 = bitcast <4 x half> %1 to <4 x i16> 4081 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 4082 store <8 x i16> %3, ptr %a1 4083 ret void 4084} 4085 4086define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind { 4087; AVX1-LABEL: store_cvt_4f64_to_8i16_zero: 4088; AVX1: # %bb.0: 4089; AVX1-NEXT: pushq %rbx 4090; AVX1-NEXT: subq $80, %rsp 4091; AVX1-NEXT: movq %rdi, %rbx 4092; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 4093; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4094; AVX1-NEXT: vzeroupper 4095; AVX1-NEXT: callq __truncdfhf2@PLT 4096; AVX1-NEXT: vmovapd %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4097; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4098; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4099; AVX1-NEXT: vzeroupper 4100; AVX1-NEXT: callq __truncdfhf2@PLT 4101; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4102; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4103; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 4104; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 4105; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4106; AVX1-NEXT: vzeroupper 4107; AVX1-NEXT: callq __truncdfhf2@PLT 4108; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4109; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 4110; AVX1-NEXT: callq __truncdfhf2@PLT 4111; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4112; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4113; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4114; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 4115; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 4116; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 4117; AVX1-NEXT: vmovaps %xmm0, (%rbx) 4118; AVX1-NEXT: addq $80, %rsp 4119; AVX1-NEXT: popq %rbx 4120; AVX1-NEXT: retq 4121; 4122; AVX2-LABEL: store_cvt_4f64_to_8i16_zero: 4123; AVX2: # %bb.0: 4124; AVX2-NEXT: pushq %rbx 4125; AVX2-NEXT: subq $80, %rsp 4126; AVX2-NEXT: movq %rdi, %rbx 4127; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 4128; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4129; AVX2-NEXT: vzeroupper 4130; AVX2-NEXT: callq __truncdfhf2@PLT 4131; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4132; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4133; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4134; AVX2-NEXT: vzeroupper 4135; AVX2-NEXT: callq __truncdfhf2@PLT 4136; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4137; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 4138; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 4139; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 4140; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4141; AVX2-NEXT: vzeroupper 4142; AVX2-NEXT: callq __truncdfhf2@PLT 4143; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4144; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 4145; AVX2-NEXT: callq __truncdfhf2@PLT 4146; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4147; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4148; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4149; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 4150; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 4151; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 4152; AVX2-NEXT: vmovaps %xmm0, (%rbx) 4153; AVX2-NEXT: addq $80, %rsp 4154; AVX2-NEXT: popq %rbx 4155; AVX2-NEXT: retq 4156; 4157; F16C-LABEL: store_cvt_4f64_to_8i16_zero: 4158; F16C: # %bb.0: 4159; F16C-NEXT: pushq %rbx 4160; F16C-NEXT: subq $64, %rsp 4161; F16C-NEXT: movq %rdi, %rbx 4162; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4163; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0 4164; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4165; F16C-NEXT: vzeroupper 4166; F16C-NEXT: callq __truncdfhf2@PLT 4167; F16C-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4168; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4169; F16C-NEXT: # xmm0 = mem[1,0] 4170; F16C-NEXT: callq __truncdfhf2@PLT 4171; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4172; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4173; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4174; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4175; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4176; F16C-NEXT: vzeroupper 4177; F16C-NEXT: callq __truncdfhf2@PLT 4178; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4179; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4180; F16C-NEXT: # xmm0 = mem[1,0] 4181; F16C-NEXT: callq __truncdfhf2@PLT 4182; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4183; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4184; F16C-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4185; F16C-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero 4186; F16C-NEXT: vmovaps %xmm0, (%rbx) 4187; F16C-NEXT: addq $64, %rsp 4188; F16C-NEXT: popq %rbx 4189; F16C-NEXT: retq 4190; 4191; AVX512-LABEL: store_cvt_4f64_to_8i16_zero: 4192; AVX512: # %bb.0: 4193; AVX512-NEXT: pushq %rbx 4194; AVX512-NEXT: subq $64, %rsp 4195; AVX512-NEXT: movq %rdi, %rbx 4196; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4197; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 4198; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4199; AVX512-NEXT: vzeroupper 4200; AVX512-NEXT: callq __truncdfhf2@PLT 4201; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4202; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4203; AVX512-NEXT: # xmm0 = mem[1,0] 4204; AVX512-NEXT: callq __truncdfhf2@PLT 4205; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4206; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4207; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4208; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4209; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4210; AVX512-NEXT: vzeroupper 4211; AVX512-NEXT: callq __truncdfhf2@PLT 4212; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4213; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4214; AVX512-NEXT: # xmm0 = mem[1,0] 4215; AVX512-NEXT: callq __truncdfhf2@PLT 4216; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4217; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4218; AVX512-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4219; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero 4220; AVX512-NEXT: vmovaps %xmm0, (%rbx) 4221; AVX512-NEXT: addq $64, %rsp 4222; AVX512-NEXT: popq %rbx 4223; AVX512-NEXT: retq 4224 %1 = fptrunc <4 x double> %a0 to <4 x half> 4225 %2 = bitcast <4 x half> %1 to <4 x i16> 4226 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 4227 store <8 x i16> %3, ptr %a1 4228 ret void 4229} 4230 4231define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind { 4232; AVX-LABEL: store_cvt_8f64_to_8i16: 4233; AVX: # %bb.0: 4234; AVX-NEXT: pushq %rbx 4235; AVX-NEXT: 
subq $96, %rsp 4236; AVX-NEXT: movq %rdi, %rbx 4237; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4238; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4239; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 4240; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4241; AVX-NEXT: vzeroupper 4242; AVX-NEXT: callq __truncdfhf2@PLT 4243; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4244; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4245; AVX-NEXT: # xmm0 = mem[1,0] 4246; AVX-NEXT: callq __truncdfhf2@PLT 4247; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4248; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4249; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4250; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4251; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4252; AVX-NEXT: vzeroupper 4253; AVX-NEXT: callq __truncdfhf2@PLT 4254; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4255; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4256; AVX-NEXT: # xmm0 = mem[1,0] 4257; AVX-NEXT: callq __truncdfhf2@PLT 4258; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4259; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4260; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4261; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4262; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4263; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4264; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 4265; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4266; AVX-NEXT: vzeroupper 4267; AVX-NEXT: callq __truncdfhf2@PLT 4268; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4269; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4270; AVX-NEXT: # xmm0 = mem[1,0] 4271; AVX-NEXT: callq __truncdfhf2@PLT 4272; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4273; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4274; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4275; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4276; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4277; AVX-NEXT: vzeroupper 4278; AVX-NEXT: callq __truncdfhf2@PLT 4279; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4280; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4281; AVX-NEXT: # xmm0 = mem[1,0] 4282; AVX-NEXT: callq __truncdfhf2@PLT 4283; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4284; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4285; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4286; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4287; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4288; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 4289; AVX-NEXT: vmovdqa %xmm0, (%rbx) 4290; AVX-NEXT: addq $96, %rsp 4291; AVX-NEXT: popq %rbx 4292; AVX-NEXT: retq 4293; 4294; F16C-LABEL: store_cvt_8f64_to_8i16: 4295; F16C: # %bb.0: 4296; F16C-NEXT: pushq %rbx 4297; F16C-NEXT: subq $96, %rsp 4298; F16C-NEXT: movq %rdi, %rbx 4299; F16C-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4300; F16C-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4301; F16C-NEXT: vextractf128 $1, %ymm1, %xmm0 4302; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4303; F16C-NEXT: vzeroupper 4304; F16C-NEXT: callq __truncdfhf2@PLT 4305; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4306; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4307; F16C-NEXT: # xmm0 = mem[1,0] 4308; F16C-NEXT: callq __truncdfhf2@PLT 4309; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4310; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4311; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4312; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4313; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4314; F16C-NEXT: vzeroupper 4315; F16C-NEXT: callq __truncdfhf2@PLT 4316; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4317; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4318; F16C-NEXT: # xmm0 = mem[1,0] 4319; F16C-NEXT: callq __truncdfhf2@PLT 4320; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4321; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4322; F16C-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4323; F16C-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4324; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4325; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4326; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0 4327; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4328; F16C-NEXT: vzeroupper 4329; F16C-NEXT: callq __truncdfhf2@PLT 4330; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4331; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4332; F16C-NEXT: # xmm0 = mem[1,0] 4333; F16C-NEXT: callq __truncdfhf2@PLT 4334; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4335; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4336; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4337; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4338; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4339; F16C-NEXT: vzeroupper 4340; F16C-NEXT: callq __truncdfhf2@PLT 4341; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4342; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4343; F16C-NEXT: # xmm0 = mem[1,0] 4344; F16C-NEXT: callq __truncdfhf2@PLT 4345; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4346; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4347; F16C-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4348; F16C-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4349; F16C-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4350; F16C-NEXT: # xmm0 = xmm0[0],mem[0] 4351; F16C-NEXT: vmovdqa %xmm0, (%rbx) 4352; F16C-NEXT: addq $96, %rsp 4353; F16C-NEXT: popq %rbx 4354; F16C-NEXT: retq 4355; 4356; AVX512-LABEL: store_cvt_8f64_to_8i16: 4357; AVX512: # %bb.0: 4358; AVX512-NEXT: pushq %rbx 4359; AVX512-NEXT: subq $112, %rsp 4360; AVX512-NEXT: movq %rdi, %rbx 4361; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4362; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 4363; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4364; 
AVX512-NEXT: vzeroupper 4365; AVX512-NEXT: callq __truncdfhf2@PLT 4366; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4367; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4368; AVX512-NEXT: # xmm0 = mem[1,0] 4369; AVX512-NEXT: callq __truncdfhf2@PLT 4370; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4371; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4372; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4373; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 4374; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm0 4375; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4376; AVX512-NEXT: vzeroupper 4377; AVX512-NEXT: callq __truncdfhf2@PLT 4378; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4379; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4380; AVX512-NEXT: # xmm0 = mem[1,0] 4381; AVX512-NEXT: callq __truncdfhf2@PLT 4382; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4383; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4384; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4385; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4386; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4387; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 4388; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 4389; AVX512-NEXT: vzeroupper 4390; AVX512-NEXT: callq __truncdfhf2@PLT 4391; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4392; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4393; AVX512-NEXT: # xmm0 = mem[1,0] 4394; AVX512-NEXT: callq __truncdfhf2@PLT 4395; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4396; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4397; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 4398; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 4399; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 4400; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4401; AVX512-NEXT: vzeroupper 4402; AVX512-NEXT: callq __truncdfhf2@PLT 4403; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4404; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4405; AVX512-NEXT: # xmm0 = mem[1,0] 4406; AVX512-NEXT: callq __truncdfhf2@PLT 4407; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4408; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4409; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4410; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4411; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4412; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] 4413; AVX512-NEXT: vmovdqa %xmm0, (%rbx) 4414; AVX512-NEXT: addq $112, %rsp 4415; AVX512-NEXT: popq %rbx 4416; AVX512-NEXT: retq 4417 %1 = fptrunc <8 x double> %a0 to <8 x half> 4418 %2 = bitcast <8 x half> %1 to <8 x i16> 4419 store <8 x i16> %2, ptr %a1 4420 ret void 4421} 4422 4423define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind { 4424; AVX1-LABEL: store_cvt_32f32_to_32f16: 4425; AVX1: # %bb.0: 4426; AVX1-NEXT: pushq %rbx 4427; AVX1-NEXT: subq 
$176, %rsp 4428; AVX1-NEXT: movq %rdi, %rbx 4429; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4430; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4431; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4432; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4433; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 4434; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4435; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 4436; AVX1-NEXT: vzeroupper 4437; AVX1-NEXT: callq __truncsfhf2@PLT 4438; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4439; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4440; AVX1-NEXT: # xmm0 = mem[1,0] 4441; AVX1-NEXT: callq __truncsfhf2@PLT 4442; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4443; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4444; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 4445; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4446; AVX1-NEXT: callq __truncsfhf2@PLT 4447; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4448; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4449; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4450; AVX1-NEXT: callq __truncsfhf2@PLT 4451; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4452; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4453; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4454; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4455; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4456; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4457; AVX1-NEXT: # xmm0 = mem[3,3,3,3] 4458; AVX1-NEXT: callq __truncsfhf2@PLT 4459; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4460; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4461; AVX1-NEXT: # xmm0 = mem[1,0] 4462; AVX1-NEXT: callq __truncsfhf2@PLT 4463; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4464; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4465; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 4466; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4467; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4468; AVX1-NEXT: vzeroupper 4469; AVX1-NEXT: callq __truncsfhf2@PLT 4470; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4471; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4472; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4473; AVX1-NEXT: callq __truncsfhf2@PLT 4474; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4475; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4476; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4477; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4478; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4479; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] 4480; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4481; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4482; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 4483; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4484; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 
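; NOTE: without F16C each f32 lane is shuffled down to element 0 and truncated via the __truncsfhf2 libcall, so this 32-element store expands into 32 calls with spills/reloads in between; the F16C and AVX512 versions below cover the same store with a few vcvtps2ph instructions.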
4485; AVX1-NEXT: vzeroupper 4486; AVX1-NEXT: callq __truncsfhf2@PLT 4487; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4488; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4489; AVX1-NEXT: # xmm0 = mem[1,0] 4490; AVX1-NEXT: callq __truncsfhf2@PLT 4491; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4492; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4493; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 4494; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4495; AVX1-NEXT: callq __truncsfhf2@PLT 4496; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4497; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4498; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4499; AVX1-NEXT: callq __truncsfhf2@PLT 4500; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4501; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4502; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4503; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4504; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4505; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4506; AVX1-NEXT: # xmm0 = mem[3,3,3,3] 4507; AVX1-NEXT: callq __truncsfhf2@PLT 4508; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4509; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4510; AVX1-NEXT: # xmm0 = mem[1,0] 4511; AVX1-NEXT: callq __truncsfhf2@PLT 4512; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4513; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4514; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 4515; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4516; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4517; AVX1-NEXT: vzeroupper 4518; AVX1-NEXT: callq __truncsfhf2@PLT 4519; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4520; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4521; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4522; AVX1-NEXT: callq __truncsfhf2@PLT 4523; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4524; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4525; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4526; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4527; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4528; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] 4529; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 4530; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4531; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4532; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 4533; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4534; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 4535; AVX1-NEXT: vzeroupper 4536; AVX1-NEXT: callq __truncsfhf2@PLT 4537; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4538; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4539; AVX1-NEXT: # xmm0 = mem[1,0] 4540; AVX1-NEXT: callq __truncsfhf2@PLT 4541; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4542; AVX1-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4543; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4544; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4545; AVX1-NEXT: callq __truncsfhf2@PLT 4546; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4547; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4548; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4549; AVX1-NEXT: callq __truncsfhf2@PLT 4550; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4551; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4552; AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4553; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4554; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4555; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4556; AVX1-NEXT: # xmm0 = mem[3,3,3,3] 4557; AVX1-NEXT: callq __truncsfhf2@PLT 4558; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4559; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4560; AVX1-NEXT: # xmm0 = mem[1,0] 4561; AVX1-NEXT: callq __truncsfhf2@PLT 4562; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4563; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4564; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4565; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4566; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4567; AVX1-NEXT: vzeroupper 4568; AVX1-NEXT: callq __truncsfhf2@PLT 4569; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4570; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4571; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4572; AVX1-NEXT: callq __truncsfhf2@PLT 4573; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4574; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4575; AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4576; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4577; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4578; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] 4579; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4580; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4581; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 4582; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4583; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 4584; AVX1-NEXT: vzeroupper 4585; AVX1-NEXT: callq __truncsfhf2@PLT 4586; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4587; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4588; AVX1-NEXT: # xmm0 = mem[1,0] 4589; AVX1-NEXT: callq __truncsfhf2@PLT 4590; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4591; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4592; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4593; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4594; AVX1-NEXT: callq __truncsfhf2@PLT 4595; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4596; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4597; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4598; AVX1-NEXT: 
callq __truncsfhf2@PLT 4599; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4600; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4601; AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4602; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4603; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4604; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4605; AVX1-NEXT: # xmm0 = mem[3,3,3,3] 4606; AVX1-NEXT: callq __truncsfhf2@PLT 4607; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4608; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4609; AVX1-NEXT: # xmm0 = mem[1,0] 4610; AVX1-NEXT: callq __truncsfhf2@PLT 4611; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4612; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4613; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4614; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4615; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4616; AVX1-NEXT: vzeroupper 4617; AVX1-NEXT: callq __truncsfhf2@PLT 4618; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4619; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4620; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4621; AVX1-NEXT: callq __truncsfhf2@PLT 4622; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4623; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4624; AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4625; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4626; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4627; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] 4628; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 4629; AVX1-NEXT: vmovaps %ymm0, 32(%rbx) 4630; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4631; AVX1-NEXT: vmovaps %ymm0, (%rbx) 4632; AVX1-NEXT: addq $176, %rsp 4633; AVX1-NEXT: popq %rbx 4634; AVX1-NEXT: vzeroupper 4635; AVX1-NEXT: retq 4636; 4637; AVX2-LABEL: store_cvt_32f32_to_32f16: 4638; AVX2: # %bb.0: 4639; AVX2-NEXT: pushq %rbx 4640; AVX2-NEXT: subq $176, %rsp 4641; AVX2-NEXT: movq %rdi, %rbx 4642; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4643; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4644; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4645; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4646; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 4647; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4648; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 4649; AVX2-NEXT: vzeroupper 4650; AVX2-NEXT: callq __truncsfhf2@PLT 4651; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4652; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4653; AVX2-NEXT: # xmm0 = mem[1,0] 4654; AVX2-NEXT: callq __truncsfhf2@PLT 4655; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4656; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4657; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 4658; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4659; AVX2-NEXT: callq __truncsfhf2@PLT 4660; AVX2-NEXT: 
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovdqa %ymm0, 32(%rbx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, (%rbx)
; AVX2-NEXT: addq $176, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; F16C-LABEL: store_cvt_32f32_to_32f16:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %ymm3, 48(%rdi)
; F16C-NEXT: vcvtps2ph $4, %ymm2, 32(%rdi)
; F16C-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi)
; F16C-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; F16C-NEXT: vzeroupper
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_32f32_to_32f16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %zmm1, 32(%rdi)
; AVX512-NEXT: vcvtps2ph $4, %zmm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = fptrunc <32 x float> %a0 to <32 x half>
  store <32 x half> %1, ptr %a1
  ret void
}

define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
; AVX-LABEL: fptosi_2f16_to_4i32:
; AVX: # %bb.0:
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: fptosi_2f16_to_4i32:
; F16C: # %bb.0:
; F16C-NEXT: vpextrw $0, %xmm0, %eax
; F16C-NEXT: movzwl %ax, %eax
; F16C-NEXT: vmovd %eax, %xmm1
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; F16C-NEXT: vpsrld $16, %xmm0, %xmm0
; F16C-NEXT: vpextrw $0, %xmm0, %eax
; F16C-NEXT: movzwl %ax, %eax
; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; F16C-NEXT: vcvttps2dq %xmm0, %xmm0
; F16C-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; F16C-NEXT: retq
;
; AVX512-LABEL: fptosi_2f16_to_4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpextrw $0, %xmm0, %eax
; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: vmovd %eax, %xmm1
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512-NEXT: vpextrw $0, %xmm0, %eax
; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: vmovd %eax, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-NEXT: retq
  %cvt = fptosi <2 x half> %a to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %ext
}