; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=F16C
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+f16c,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=F16C
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+f16c,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=F16C
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512

;
; Half to Float
;
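; These tests exercise fpext from half to float at scalar and vector widths.
; As the check lines below show, without F16C the extends are scalarized into
; __extendhfsf2 libcalls, while F16C and AVX512F lower them to vcvtph2ps.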

define float @cvt_i16_to_f32(i16 %a0) nounwind {
; AVX-LABEL: cvt_i16_to_f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpinsrw $0, %edi, %xmm0, %xmm0
; AVX-NEXT:    jmp __extendhfsf2@PLT # TAILCALL
;
; F16C-LABEL: cvt_i16_to_f32:
; F16C:       # %bb.0:
; F16C-NEXT:    movzwl %di, %eax
; F16C-NEXT:    vmovd %eax, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_i16_to_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movzwl %di, %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast i16 %a0 to half
  %2 = fpext half %1 to float
  ret float %2
}

define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX-LABEL: cvt_4i16_to_4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrq $48, %rdx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_4i16_to_4f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_4i16_to_4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = fpext <4 x half> %1 to <4 x float>
  ret <4 x float> %2
}

define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX-LABEL: cvt_8i16_to_4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrq $48, %rdx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8i16_to_4f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}

define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
; AVX-LABEL: cvt_8i16_to_8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $56, %rsp
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    addq $56, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8i16_to_8f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = fpext <8 x half> %1 to <8 x float>
  ret <8 x float> %2
}

define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_16i16_to_16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $104, %rsp
; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    addq $104, %rsp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_16i16_to_16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $104, %rsp
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    addq $104, %rsp
; AVX2-NEXT:    retq
;
; F16C-LABEL: cvt_16i16_to_16f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm2
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm1
; F16C-NEXT:    vmovaps %ymm2, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_16i16_to_16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %ymm0, %zmm0
; AVX512-NEXT:    retq
  %1 = bitcast <16 x i16> %a0 to <16 x half>
  %2 = fpext <16 x half> %1 to <16 x float>
  ret <16 x float> %2
}

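; The *_constrained variants below exercise the same conversions through the
; strict-FP intrinsic @llvm.experimental.constrained.fpext; the expected
; codegen largely matches the non-strict versions above.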
define <2 x float> @cvt_2i16_to_2f32_constrained(<2 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_2i16_to_2f32_constrained:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $40, %rsp
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    addq $40, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_2i16_to_2f32_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_2i16_to_2f32_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast <2 x i16> %a0 to <2 x half>
  %2 = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <2 x float> %2
}
declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) strictfp

define <4 x float> @cvt_4i16_to_4f32_constrained(<4 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_4i16_to_4f32_constrained:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrq $48, %rdx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_4i16_to_4f32_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_4i16_to_4f32_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <4 x float> %2
}
declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) strictfp

define <8 x float> @cvt_8i16_to_8f32_constrained(<8 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_8i16_to_8f32_constrained:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $56, %rsp
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    addq $56, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8i16_to_8f32_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_8f32_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <8 x float> %2
}
declare <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half>, metadata) strictfp

define <16 x float> @cvt_16i16_to_16f32_constrained(<16 x i16> %a0) nounwind strictfp {
; AVX1-LABEL: cvt_16i16_to_16f32_constrained:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $104, %rsp
; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    addq $104, %rsp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_16i16_to_16f32_constrained:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $104, %rsp
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    addq $104, %rsp
; AVX2-NEXT:    retq
;
; F16C-LABEL: cvt_16i16_to_16f32_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm1
; F16C-NEXT:    vcvtph2ps %xmm1, %ymm1
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_16i16_to_16f32_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %ymm0, %zmm0
; AVX512-NEXT:    retq
  %1 = bitcast <16 x i16> %a0 to <16 x half>
  %2 = call <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <16 x float> %2
}
declare <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half>, metadata) strictfp

;
; Half to Float (Load)
;
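; Same conversions as above, but with the half values loaded from memory.
; As the check lines show, with F16C or AVX512F the load folds directly into
; vcvtph2ps.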

define float @load_cvt_i16_to_f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_i16_to_f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT:    jmp __extendhfsf2@PLT # TAILCALL
;
; F16C-LABEL: load_cvt_i16_to_f32:
; F16C:       # %bb.0:
; F16C-NEXT:    movzwl (%rdi), %eax
; F16C-NEXT:    vmovd %eax, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_i16_to_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movzwl (%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = load i16, ptr %a0
  %2 = bitcast i16 %1 to half
  %3 = fpext half %2 to float
  ret float %3
}

define <4 x float> @load_cvt_4i16_to_4f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_4i16_to_4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vpinsrw $0, 6(%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 2(%rdi), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: load_cvt_4i16_to_4f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps (%rdi), %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_4i16_to_4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps (%rdi), %xmm0
; AVX512-NEXT:    retq
  %1 = load <4 x i16>, ptr %a0
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}

define <4 x float> @load_cvt_8i16_to_4f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_8i16_to_4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    movq (%rdi), %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrq $48, %rdx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: load_cvt_8i16_to_4f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps (%rdi), %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_8i16_to_4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps (%rdi), %xmm0
; AVX512-NEXT:    retq
  %1 = load <8 x i16>, ptr %a0
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x i16> %2 to <4 x half>
  %4 = fpext <4 x half> %3 to <4 x float>
  ret <4 x float> %4
}

define <8 x float> @load_cvt_8i16_to_8f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_8i16_to_8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $48, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 8(%rdi), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    addq $48, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: load_cvt_8i16_to_8f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps (%rdi), %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_8i16_to_8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps (%rdi), %ymm0
; AVX512-NEXT:    retq
  %1 = load <8 x i16>, ptr %a0
  %2 = bitcast <8 x i16> %1 to <8 x half>
  %3 = fpext <8 x half> %2 to <8 x float>
  ret <8 x float> %3
}

define <16 x float> @load_cvt_16i16_to_16f32(ptr %a0) nounwind {
; AVX1-LABEL: load_cvt_16i16_to_16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $80, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vpinsrw $0, 8(%rdi), %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa (%rbx), %xmm1
; AVX1-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps 16(%rbx), %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpinsrw $0, 24(%rbx), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpinsrw $0, 28(%rbx), %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpinsrw $0, 20(%rbx), %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    addq $80, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_cvt_16i16_to_16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $80, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vpinsrw $0, 8(%rdi), %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa (%rbx), %xmm1
; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovaps 16(%rbx), %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1121; AVX2-NEXT:    callq __extendhfsf2@PLT
1122; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1123; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1124; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1125; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1126; AVX2-NEXT:    callq __extendhfsf2@PLT
1127; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1128; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1129; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
1130; AVX2-NEXT:    callq __extendhfsf2@PLT
1131; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1132; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
1133; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1134; AVX2-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
1135; AVX2-NEXT:    callq __extendhfsf2@PLT
1136; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1137; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
1138; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1139; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1140; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
1141; AVX2-NEXT:    callq __extendhfsf2@PLT
1142; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1143; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1144; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
1145; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1146; AVX2-NEXT:    vpinsrw $0, 24(%rbx), %xmm0, %xmm0
1147; AVX2-NEXT:    vzeroupper
1148; AVX2-NEXT:    callq __extendhfsf2@PLT
1149; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1150; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1151; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1152; AVX2-NEXT:    callq __extendhfsf2@PLT
1153; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1154; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
1155; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1156; AVX2-NEXT:    vpinsrw $0, 28(%rbx), %xmm0, %xmm0
1157; AVX2-NEXT:    callq __extendhfsf2@PLT
1158; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1159; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
1160; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1161; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1162; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1163; AVX2-NEXT:    callq __extendhfsf2@PLT
1164; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1165; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1166; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1167; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1168; AVX2-NEXT:    callq __extendhfsf2@PLT
1169; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1170; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1171; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
1172; AVX2-NEXT:    callq __extendhfsf2@PLT
1173; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1174; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
1175; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1176; AVX2-NEXT:    vpinsrw $0, 20(%rbx), %xmm0, %xmm0
1177; AVX2-NEXT:    callq __extendhfsf2@PLT
1178; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1179; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
1180; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1181; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1182; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
1183; AVX2-NEXT:    callq __extendhfsf2@PLT
1184; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1185; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1186; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
1187; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1188; AVX2-NEXT:    addq $80, %rsp
1189; AVX2-NEXT:    popq %rbx
1190; AVX2-NEXT:    retq
1191;
1192; F16C-LABEL: load_cvt_16i16_to_16f32:
1193; F16C:       # %bb.0:
1194; F16C-NEXT:    vcvtph2ps (%rdi), %ymm0
1195; F16C-NEXT:    vcvtph2ps 16(%rdi), %ymm1
1196; F16C-NEXT:    retq
1197;
1198; AVX512-LABEL: load_cvt_16i16_to_16f32:
1199; AVX512:       # %bb.0:
1200; AVX512-NEXT:    vcvtph2ps (%rdi), %zmm0
1201; AVX512-NEXT:    retq
1202  %1 = load <16 x i16>, ptr %a0
1203  %2 = bitcast <16 x i16> %1 to <16 x half>
1204  %3 = fpext <16 x half> %2 to <16 x float>
1205  ret <16 x float> %3
1206}
1207
1208define <4 x float> @load_cvt_4i16_to_4f32_constrained(ptr %a0) nounwind strictfp {
1209; AVX-LABEL: load_cvt_4i16_to_4f32_constrained:
1210; AVX:       # %bb.0:
1211; AVX-NEXT:    subq $72, %rsp
1212; AVX-NEXT:    vpinsrw $0, 6(%rdi), %xmm0, %xmm0
1213; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1214; AVX-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm0
1215; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1216; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
1217; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1218; AVX-NEXT:    vpinsrw $0, 2(%rdi), %xmm0, %xmm0
1219; AVX-NEXT:    callq __extendhfsf2@PLT
1220; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1221; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
1222; AVX-NEXT:    callq __extendhfsf2@PLT
1223; AVX-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1224; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
1225; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1226; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1227; AVX-NEXT:    callq __extendhfsf2@PLT
1228; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1229; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
1230; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1231; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1232; AVX-NEXT:    callq __extendhfsf2@PLT
1233; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1234; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1235; AVX-NEXT:    addq $72, %rsp
1236; AVX-NEXT:    retq
1237;
1238; F16C-LABEL: load_cvt_4i16_to_4f32_constrained:
1239; F16C:       # %bb.0:
1240; F16C-NEXT:    vcvtph2ps (%rdi), %xmm0
1241; F16C-NEXT:    retq
1242;
1243; AVX512-LABEL: load_cvt_4i16_to_4f32_constrained:
1244; AVX512:       # %bb.0:
1245; AVX512-NEXT:    vcvtph2ps (%rdi), %xmm0
1246; AVX512-NEXT:    retq
1247  %1 = load <4 x i16>, ptr %a0
1248  %2 = bitcast <4 x i16> %1 to <4 x half>
1249  %3 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %2, metadata !"fpexcept.strict") strictfp
1250  ret <4 x float> %3
1251}
1252
1253define <4 x float> @load_cvt_8i16_to_4f32_constrained(ptr %a0) nounwind strictfp {
1254; AVX-LABEL: load_cvt_8i16_to_4f32_constrained:
1255; AVX:       # %bb.0:
1256; AVX-NEXT:    subq $72, %rsp
1257; AVX-NEXT:    movq (%rdi), %rax
1258; AVX-NEXT:    movq %rax, %rcx
1259; AVX-NEXT:    movq %rax, %rdx
1260; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1261; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
1262; AVX-NEXT:    shrl $16, %eax
1263; AVX-NEXT:    shrq $32, %rcx
1264; AVX-NEXT:    shrq $48, %rdx
1265; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
1266; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1267; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm1
1268; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1269; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
1270; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1271; AVX-NEXT:    callq __extendhfsf2@PLT
1272; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1273; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1274; AVX-NEXT:    callq __extendhfsf2@PLT
1275; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1276; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
1277; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1278; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1279; AVX-NEXT:    callq __extendhfsf2@PLT
1280; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1281; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
1282; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1283; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1284; AVX-NEXT:    callq __extendhfsf2@PLT
1285; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1286; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1287; AVX-NEXT:    addq $72, %rsp
1288; AVX-NEXT:    retq
1289;
1290; F16C-LABEL: load_cvt_8i16_to_4f32_constrained:
1291; F16C:       # %bb.0:
1292; F16C-NEXT:    vcvtph2ps (%rdi), %xmm0
1293; F16C-NEXT:    retq
1294;
1295; AVX512-LABEL: load_cvt_8i16_to_4f32_constrained:
1296; AVX512:       # %bb.0:
1297; AVX512-NEXT:    vcvtph2ps (%rdi), %xmm0
1298; AVX512-NEXT:    retq
1299  %1 = load <8 x i16>, ptr %a0
1300  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1301  %3 = bitcast <4 x i16> %2 to <4 x half>
1302  %4 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %3, metadata !"fpexcept.strict") strictfp
1303  ret <4 x float> %4
1304}
1305
1306;
1307; Half to Double
1308;
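; Note: without F16C, each half element is extended to float through the
; __extendhfsf2 libcall and then widened with vcvtss2sd; F16C and AVX512
; lower these conversions to vcvtph2ps followed by vcvtps2pd.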

define double @cvt_i16_to_f64(i16 %a0) nounwind {
; AVX-LABEL: cvt_i16_to_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rax
; AVX-NEXT:    vpinsrw $0, %edi, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    popq %rax
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_i16_to_f64:
; F16C:       # %bb.0:
; F16C-NEXT:    movzwl %di, %eax
; F16C-NEXT:    vmovd %eax, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_i16_to_f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movzwl %di, %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast i16 %a0 to half
  %2 = fpext half %1 to double
  ret double %2
}

define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; AVX-LABEL: cvt_2i16_to_2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $40, %rsp
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT:    addq $40, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_2i16_to_2f64:
; F16C:       # %bb.0:
; F16C-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    vcvtps2pd %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_2i16_to_2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vcvtps2pd %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast <2 x i16> %a0 to <2 x half>
  %2 = fpext <2 x half> %1 to <2 x double>
  ret <2 x double> %2
}

define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; AVX-LABEL: cvt_4i16_to_4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movl %eax, %edx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    shrq $48, %rax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrl $16, %edx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_4i16_to_4f64:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    vcvtps2pd %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_4i16_to_4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vcvtps2pd %xmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = fpext <4 x half> %1 to <4 x double>
  ret <4 x double> %2
}

define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
; AVX-LABEL: cvt_8i16_to_2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $40, %rsp
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT:    addq $40, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8i16_to_2f64:
; F16C:       # %bb.0:
; F16C-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    vcvtps2pd %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vcvtps2pd %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i16> %1 to <2 x half>
  %3 = fpext <2 x half> %2 to <2 x double>
  ret <2 x double> %3
}

define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; AVX-LABEL: cvt_8i16_to_4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movl %eax, %edx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    shrq $48, %rax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrl $16, %edx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8i16_to_4f64:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    vcvtps2pd %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vcvtps2pd %xmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x double>
  ret <4 x double> %3
}

define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX-LABEL: cvt_8i16_to_8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $88, %rsp
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $245, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $78, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    addq $88, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8i16_to_8f64:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm1
; F16C-NEXT:    vcvtps2pd %xmm1, %ymm0
; F16C-NEXT:    vextractf128 $1, %ymm1, %xmm1
; F16C-NEXT:    vcvtps2pd %xmm1, %ymm1
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %ymm0
; AVX512-NEXT:    vcvtps2pd %ymm0, %zmm0
; AVX512-NEXT:    retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = fpext <8 x half> %1 to <8 x double>
  ret <8 x double> %2
}

define <2 x double> @cvt_2i16_to_2f64_constrained(<2 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_2i16_to_2f64_constrained:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $40, %rsp
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT:    addq $40, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_2i16_to_2f64_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    vcvtps2pd %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_2i16_to_2f64_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vcvtps2pd %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast <2 x i16> %a0 to <2 x half>
  %2 = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <2 x double> %2
}
declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half>, metadata) strictfp

define <4 x double> @cvt_4i16_to_4f64_constrained(<4 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_4i16_to_4f64_constrained:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movl %eax, %edx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    shrq $48, %rax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrl $16, %edx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_4i16_to_4f64_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    vcvtps2pd %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_4i16_to_4f64_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vcvtps2pd %xmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = call <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <4 x double> %2
}
declare <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half>, metadata) strictfp

define <8 x double> @cvt_8i16_to_8f64_constrained(<8 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_8i16_to_8f64_constrained:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $88, %rsp
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $245, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $78, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    addq $88, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8i16_to_8f64_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm0
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm1
; F16C-NEXT:    vcvtps2pd %xmm1, %ymm1
; F16C-NEXT:    vcvtps2pd %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_8f64_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %ymm0
; AVX512-NEXT:    vcvtps2pd %ymm0, %zmm0
; AVX512-NEXT:    retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = call <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16(<8 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <8 x double> %2
}
declare <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16(<8 x half>, metadata) strictfp

;
; Half to Double (Load)
;
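; Note: the load variants can fold the memory operand, so F16C and AVX512
; convert straight from (%rdi) with vcvtph2ps, while plain AVX reloads
; individual elements with vpinsrw before each __extendhfsf2 call.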

define double @load_cvt_i16_to_f64(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_i16_to_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rax
; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    popq %rax
; AVX-NEXT:    retq
;
; F16C-LABEL: load_cvt_i16_to_f64:
; F16C:       # %bb.0:
; F16C-NEXT:    movzwl (%rdi), %eax
; F16C-NEXT:    vmovd %eax, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_i16_to_f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movzwl (%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = load i16, ptr %a0
  %2 = bitcast i16 %1 to half
  %3 = fpext half %2 to double
  ret double %3
}

define <2 x double> @load_cvt_2i16_to_2f64(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_2i16_to_2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $40, %rsp
; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 2(%rdi), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    addq $40, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: load_cvt_2i16_to_2f64:
; F16C:       # %bb.0:
; F16C-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; F16C-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    vcvtps2pd %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_2i16_to_2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vcvtps2pd %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = load <2 x i16>, ptr %a0
  %2 = bitcast <2 x i16> %1 to <2 x half>
  %3 = fpext <2 x half> %2 to <2 x double>
  ret <2 x double> %3
}

define <4 x double> @load_cvt_4i16_to_4f64(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_4i16_to_4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 2(%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 6(%rdi), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: load_cvt_4i16_to_4f64:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps (%rdi), %xmm0
; F16C-NEXT:    vcvtps2pd %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_4i16_to_4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps (%rdi), %xmm0
; AVX512-NEXT:    vcvtps2pd %xmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = load <4 x i16>, ptr %a0
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x double>
  ret <4 x double> %3
}

define <4 x double> @load_cvt_8i16_to_4f64(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_8i16_to_4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    movq (%rdi), %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movl %eax, %edx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    shrq $48, %rax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrl $16, %edx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: load_cvt_8i16_to_4f64:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps (%rdi), %xmm0
; F16C-NEXT:    vcvtps2pd %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_8i16_to_4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps (%rdi), %xmm0
; AVX512-NEXT:    vcvtps2pd %xmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = load <8 x i16>, ptr %a0
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x i16> %2 to <4 x half>
  %4 = fpext <4 x half> %3 to <4 x double>
  ret <4 x double> %4
}

define <8 x double> @load_cvt_8i16_to_8f64(ptr %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $80, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpinsrw $0, 8(%rbx), %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    addq $80, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_cvt_8i16_to_8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $80, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpinsrw $0, 8(%rbx), %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    addq $80, %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    retq
;
; F16C-LABEL: load_cvt_8i16_to_8f64:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps (%rdi), %ymm1
; F16C-NEXT:    vcvtps2pd %xmm1, %ymm0
; F16C-NEXT:    vextractf128 $1, %ymm1, %xmm1
; F16C-NEXT:    vcvtps2pd %xmm1, %ymm1
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_8i16_to_8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps (%rdi), %ymm0
; AVX512-NEXT:    vcvtps2pd %ymm0, %zmm0
; AVX512-NEXT:    retq
  %1 = load <8 x i16>, ptr %a0
  %2 = bitcast <8 x i16> %1 to <8 x half>
  %3 = fpext <8 x half> %2 to <8 x double>
  ret <8 x double> %3
}

;
; Float to Half
;
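; Note: truncation mirrors the extension tests above; plain AVX goes through
; the __truncsfhf2 libcall one element at a time, while F16C and AVX512
; lower to a single vcvtps2ph.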
2094
2095define i16 @cvt_f32_to_i16(float %a0) nounwind {
2096; AVX-LABEL: cvt_f32_to_i16:
2097; AVX:       # %bb.0:
2098; AVX-NEXT:    pushq %rax
2099; AVX-NEXT:    callq __truncsfhf2@PLT
2100; AVX-NEXT:    vpextrw $0, %xmm0, %eax
2101; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
2102; AVX-NEXT:    popq %rcx
2103; AVX-NEXT:    retq
2104;
2105; F16C-LABEL: cvt_f32_to_i16:
2106; F16C:       # %bb.0:
2107; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2108; F16C-NEXT:    vmovd %xmm0, %eax
2109; F16C-NEXT:    # kill: def $ax killed $ax killed $eax
2110; F16C-NEXT:    retq
2111;
2112; AVX512-LABEL: cvt_f32_to_i16:
2113; AVX512:       # %bb.0:
2114; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2115; AVX512-NEXT:    vmovd %xmm0, %eax
2116; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
2117; AVX512-NEXT:    retq
2118  %1 = fptrunc float %a0 to half
2119  %2 = bitcast half %1 to i16
2120  ret i16 %2
2121}
2122
2123define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
2124; AVX-LABEL: cvt_4f32_to_4i16:
2125; AVX:       # %bb.0:
2126; AVX-NEXT:    subq $72, %rsp
2127; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
2128; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
2129; AVX-NEXT:    callq __truncsfhf2@PLT
2130; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2131; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2132; AVX-NEXT:    callq __truncsfhf2@PLT
2133; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2134; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
2135; AVX-NEXT:    # xmm0 = mem[1,0]
2136; AVX-NEXT:    callq __truncsfhf2@PLT
2137; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2138; AVX-NEXT:    vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
2139; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
2140; AVX-NEXT:    callq __truncsfhf2@PLT
2141; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2142; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2143; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2144; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2145; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
2146; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
2147; AVX-NEXT:    addq $72, %rsp
2148; AVX-NEXT:    retq
2149;
2150; F16C-LABEL: cvt_4f32_to_4i16:
2151; F16C:       # %bb.0:
2152; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2153; F16C-NEXT:    retq
2154;
2155; AVX512-LABEL: cvt_4f32_to_4i16:
2156; AVX512:       # %bb.0:
2157; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2158; AVX512-NEXT:    retq
2159  %1 = fptrunc <4 x float> %a0 to <4 x half>
2160  %2 = bitcast <4 x half> %1 to <4 x i16>
2161  ret <4 x i16> %2
2162}
2163
2164define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
2165; AVX-LABEL: cvt_4f32_to_8i16_undef:
2166; AVX:       # %bb.0:
2167; AVX-NEXT:    subq $72, %rsp
2168; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
2169; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
2170; AVX-NEXT:    callq __truncsfhf2@PLT
2171; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2172; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2173; AVX-NEXT:    callq __truncsfhf2@PLT
2174; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2175; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
2176; AVX-NEXT:    # xmm0 = mem[1,0]
2177; AVX-NEXT:    callq __truncsfhf2@PLT
2178; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2179; AVX-NEXT:    vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
2180; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
2181; AVX-NEXT:    callq __truncsfhf2@PLT
2182; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2183; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2184; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2185; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2186; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
2187; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
2188; AVX-NEXT:    addq $72, %rsp
2189; AVX-NEXT:    retq
2190;
2191; F16C-LABEL: cvt_4f32_to_8i16_undef:
2192; F16C:       # %bb.0:
2193; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2194; F16C-NEXT:    retq
2195;
2196; AVX512-LABEL: cvt_4f32_to_8i16_undef:
2197; AVX512:       # %bb.0:
2198; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2199; AVX512-NEXT:    retq
2200  %1 = fptrunc <4 x float> %a0 to <4 x half>
2201  %2 = bitcast <4 x half> %1 to <4 x i16>
2202  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2203  ret <8 x i16> %3
2204}

define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
; AVX-LABEL: cvt_4f32_to_8i16_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_4f32_to_8i16_zero:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_4f32_to_8i16_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %3
}

define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
; AVX-LABEL: cvt_8f32_to_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $88, %rsp
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    addq $88, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8f32_to_8i16:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8f32_to_8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = fptrunc <8 x float> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  ret <8 x i16> %2
}
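; In the 8-float case above, F16C/AVX512 convert a full ymm with a single
; vcvtps2ph (immediate $4 requests MXCSR-controlled rounding), while plain AVX
; makes one __truncsfhf2 libcall per element. The 16-float test below shows
; the split at the next level: F16C needs two ymm conversions recombined with
; vinsertf128/vinserti128, whereas AVX512 converts the whole zmm at once.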

define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
; AVX1-LABEL: cvt_16f32_to_16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $120, %rsp
; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    addq $120, %rsp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_16f32_to_16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $120, %rsp
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    addq $120, %rsp
; AVX2-NEXT:    retq
;
; F16C-LABEL: cvt_16f32_to_16i16:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
; F16C-NEXT:    vcvtps2ph $4, %ymm1, %xmm1
; F16C-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_16f32_to_16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = fptrunc <16 x float> %a0 to <16 x half>
  %2 = bitcast <16 x half> %1 to <16 x i16>
  ret <16 x i16> %2
}

;
; Float to Half (Store)
;
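; These variants store the truncated halves instead of returning them. As a
; rough source-level sketch (an illustration, not part of the test), C code
; such as
;   void f(float x, _Float16 *p) { *p = (_Float16)x; }
; produces the fptrunc+bitcast+store pattern below. With F16C the vector
; stores fold into vcvtps2ph's memory-destination form; without it, each
; element goes through a __truncsfhf2 libcall and a vpextrw to memory.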

define void @store_cvt_f32_to_i16(float %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_f32_to_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: store_cvt_f32_to_i16:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT:    vmovd %xmm0, %eax
; F16C-NEXT:    movw %ax, (%rdi)
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_f32_to_i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    movw %ax, (%rdi)
; AVX512-NEXT:    retq
  %1 = fptrunc float %a0 to half
  %2 = bitcast half %1 to i16
  store i16 %2, ptr %a1
  ret void
}

define void @store_cvt_4f32_to_4i16(<4 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_4f32_to_4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpextrw $0, %xmm0, 6(%rbx)
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpextrw $0, %xmm0, 4(%rbx)
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpextrw $0, %xmm0, 2(%rbx)
; AVX-NEXT:    addq $64, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: store_cvt_4f32_to_4i16:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %xmm0, (%rdi)
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_4f32_to_4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  store <4 x i16> %2, ptr %a1
  ret void
}

define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_4f32_to_8i16_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX-NEXT:    vmovaps %xmm0, (%rbx)
; AVX-NEXT:    addq $64, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: store_cvt_4f32_to_8i16_undef:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rdi)
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_4f32_to_8i16_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, ptr %a1
  ret void
}

define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_4f32_to_8i16_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX-NEXT:    vmovaps %xmm0, (%rbx)
; AVX-NEXT:    addq $64, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: store_cvt_4f32_to_8i16_zero:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rdi)
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_4f32_to_8i16_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, ptr %a1
  ret void
}

define void @store_cvt_8f32_to_8i16(<8 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_8f32_to_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $80, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rbx)
; AVX-NEXT:    addq $80, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: store_cvt_8f32_to_8i16:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %ymm0, (%rdi)
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_8f32_to_8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = fptrunc <8 x float> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  store <8 x i16> %2, ptr %a1
  ret void
}

define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind {
; AVX1-LABEL: store_cvt_16f32_to_16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $112, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovaps %ymm0, (%rbx)
; AVX1-NEXT:    addq $112, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: store_cvt_16f32_to_16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $112, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovdqa %ymm0, (%rbx)
; AVX2-NEXT:    addq $112, %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; F16C-LABEL: store_cvt_16f32_to_16i16:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %ymm1, 16(%rdi)
; F16C-NEXT:    vcvtps2ph $4, %ymm0, (%rdi)
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_16f32_to_16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %zmm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = fptrunc <16 x float> %a0 to <16 x half>
  %2 = bitcast <16 x half> %1 to <16 x i16>
  store <16 x i16> %2, ptr %a1
  ret void
}

;
; Double to Half
;
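; None of the configurations tested here has a direct double-to-half
; instruction (F16C only converts between fp32 and fp16), and narrowing
; through float would double-round incorrectly, so every prefix calls the
; __truncdfhf2 libcall once per element. A rough source-level sketch (an
; illustration, not part of the test):
;   _Float16 g(double x) { return (_Float16)x; }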

define i16 @cvt_f64_to_i16(double %a0) nounwind {
; AVX-LABEL: cvt_f64_to_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rax
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vpextrw $0, %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    popq %rcx
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_f64_to_i16:
; F16C:       # %bb.0:
; F16C-NEXT:    pushq %rax
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vpextrw $0, %xmm0, %eax
; F16C-NEXT:    # kill: def $ax killed $ax killed $eax
; F16C-NEXT:    popq %rcx
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_f64_to_i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rax
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vpextrw $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    popq %rcx
; AVX512-NEXT:    retq
  %1 = fptrunc double %a0 to half
  %2 = bitcast half %1 to i16
  ret i16 %2
}

define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
; AVX-LABEL: cvt_2f64_to_2i16:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $40, %rsp
; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    addq $40, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_2f64_to_2i16:
; F16C:       # %bb.0:
; F16C-NEXT:    subq $40, %rsp
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    addq $40, %rsp
; F16C-NEXT:    retq
  %1 = fptrunc <2 x double> %a0 to <2 x half>
  %2 = bitcast <2 x half> %1 to <2 x i16>
  ret <2 x i16> %2
}
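; In the 2-element case above, the halves come back from separate
; __truncdfhf2 calls and are re-interleaved with vpunpcklwd; the wider
; variants below repeat the same pattern for each 128-bit half of the input.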
3114
3115define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
3116; AVX1-LABEL: cvt_4f64_to_4i16:
3117; AVX1:       # %bb.0:
3118; AVX1-NEXT:    subq $88, %rsp
3119; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3120; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3121; AVX1-NEXT:    vzeroupper
3122; AVX1-NEXT:    callq __truncdfhf2@PLT
3123; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3124; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3125; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3126; AVX1-NEXT:    vzeroupper
3127; AVX1-NEXT:    callq __truncdfhf2@PLT
3128; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3129; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3130; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3131; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3132; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3133; AVX1-NEXT:    vzeroupper
3134; AVX1-NEXT:    callq __truncdfhf2@PLT
3135; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3136; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
3137; AVX1-NEXT:    callq __truncdfhf2@PLT
3138; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3139; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
3140; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3141; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
3142; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
3143; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
3144; AVX1-NEXT:    addq $88, %rsp
3145; AVX1-NEXT:    retq
3146;
3147; AVX2-LABEL: cvt_4f64_to_4i16:
3148; AVX2:       # %bb.0:
3149; AVX2-NEXT:    subq $88, %rsp
3150; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3151; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3152; AVX2-NEXT:    vzeroupper
3153; AVX2-NEXT:    callq __truncdfhf2@PLT
3154; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3155; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3156; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3157; AVX2-NEXT:    vzeroupper
3158; AVX2-NEXT:    callq __truncdfhf2@PLT
3159; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3160; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3161; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3162; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3163; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3164; AVX2-NEXT:    vzeroupper
3165; AVX2-NEXT:    callq __truncdfhf2@PLT
3166; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3167; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
3168; AVX2-NEXT:    callq __truncdfhf2@PLT
3169; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3170; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
3171; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3172; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
3173; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
3174; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
3175; AVX2-NEXT:    addq $88, %rsp
3176; AVX2-NEXT:    retq
3177;
3178; F16C-LABEL: cvt_4f64_to_4i16:
3179; F16C:       # %bb.0:
3180; F16C-NEXT:    subq $72, %rsp
3181; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3182; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
3183; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3184; F16C-NEXT:    vzeroupper
3185; F16C-NEXT:    callq __truncdfhf2@PLT
3186; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3187; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3188; F16C-NEXT:    # xmm0 = mem[1,0]
3189; F16C-NEXT:    callq __truncdfhf2@PLT
3190; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3191; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3192; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3193; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3194; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3195; F16C-NEXT:    vzeroupper
3196; F16C-NEXT:    callq __truncdfhf2@PLT
3197; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3198; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3199; F16C-NEXT:    # xmm0 = mem[1,0]
3200; F16C-NEXT:    callq __truncdfhf2@PLT
3201; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3202; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3203; F16C-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3204; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
3205; F16C-NEXT:    addq $72, %rsp
3206; F16C-NEXT:    retq
3207;
3208; AVX512-LABEL: cvt_4f64_to_4i16:
3209; AVX512:       # %bb.0:
3210; AVX512-NEXT:    subq $72, %rsp
3211; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3212; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3213; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3214; AVX512-NEXT:    vzeroupper
3215; AVX512-NEXT:    callq __truncdfhf2@PLT
3216; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3217; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3218; AVX512-NEXT:    # xmm0 = mem[1,0]
3219; AVX512-NEXT:    callq __truncdfhf2@PLT
3220; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3221; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3222; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3223; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3224; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3225; AVX512-NEXT:    vzeroupper
3226; AVX512-NEXT:    callq __truncdfhf2@PLT
3227; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3228; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3229; AVX512-NEXT:    # xmm0 = mem[1,0]
3230; AVX512-NEXT:    callq __truncdfhf2@PLT
3231; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3232; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3233; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3234; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3235; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3236; AVX512-NEXT:    callq __truncdfhf2@PLT
3237; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
3238; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3239; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
3240; AVX512-NEXT:    addq $72, %rsp
3241; AVX512-NEXT:    retq
3242  %1 = fptrunc <4 x double> %a0 to <4 x half>
3243  %2 = bitcast <4 x half> %1 to <4 x i16>
3244  ret <4 x i16> %2
3245}
3246
define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX1-LABEL: cvt_4f64_to_8i16_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $88, %rsp
; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT:    addq $88, %rsp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_4f64_to_8i16_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $88, %rsp
; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX2-NEXT:    addq $88, %rsp
; AVX2-NEXT:    retq
;
; F16C-LABEL: cvt_4f64_to_8i16_undef:
; F16C:       # %bb.0:
; F16C-NEXT:    subq $72, %rsp
; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
; F16C-NEXT:    addq $72, %rsp
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_4f64_to_8i16_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    subq $72, %rsp
; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX512-NEXT:    addq $72, %rsp
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %3
}

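; The _zero variant must instead produce zeroed upper lanes: AVX512 can no
; longer splat and uses the same vinsertps-with-zeros tail as the other
; prefixes.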
define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX1-LABEL: cvt_4f64_to_8i16_zero:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $88, %rsp
; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT:    addq $88, %rsp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_4f64_to_8i16_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $88, %rsp
; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX2-NEXT:    addq $88, %rsp
; AVX2-NEXT:    retq
;
; F16C-LABEL: cvt_4f64_to_8i16_zero:
; F16C:       # %bb.0:
; F16C-NEXT:    subq $72, %rsp
; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
; F16C-NEXT:    addq $72, %rsp
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_4f64_to_8i16_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    subq $72, %rsp
; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
; AVX512-NEXT:    addq $72, %rsp
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %3
}

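; Even with F16C there is no double-to-half instruction, so all prefixes make
; eight __truncdfhf2 libcalls and reassemble the halves with
; vpunpcklwd/vpunpckldq/vpunpcklqdq; AVX512 feeds them from a single zmm spill.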
define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX-LABEL: cvt_8f64_to_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $104, %rsp
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    addq $104, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8f64_to_8i16:
; F16C:       # %bb.0:
; F16C-NEXT:    subq $104, %rsp
; F16C-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT:    vextractf128 $1, %ymm1, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; F16C-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0]
; F16C-NEXT:    addq $104, %rsp
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8f64_to_8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    subq $120, %rsp
; AVX512-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX512-NEXT:    addq $120, %rsp
; AVX512-NEXT:    retq
  %1 = fptrunc <8 x double> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  ret <8 x i16> %2
}

;
; Double to Half (Store)
;

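; The store variants repeat the conversions above but write the resulting i16
; lanes through the pointer argument instead of returning them. A rough C
; equivalent of the scalar test below (a sketch, assuming a host compiler with
; _Float16 support) is:
;
;   void store_cvt_f64_to_i16(double a0, _Float16 *a1) {
;     *a1 = (_Float16)a0; // lowers to the __truncdfhf2 libcall checked here
;   }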
define void @store_cvt_f64_to_i16(double %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_f64_to_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: store_cvt_f64_to_i16:
; F16C:       # %bb.0:
; F16C-NEXT:    pushq %rbx
; F16C-NEXT:    movq %rdi, %rbx
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT:    popq %rbx
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_f64_to_i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbx
; AVX512-NEXT:    movq %rdi, %rbx
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rbx)
; AVX512-NEXT:    popq %rbx
; AVX512-NEXT:    retq
  %1 = fptrunc double %a0 to half
  %2 = bitcast half %1 to i16
  store i16 %2, ptr %a1
  ret void
}

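; For two elements, plain AVX stores each half separately with vpextrw at
; offsets 0 and 2, while F16C/AVX512 pack the pair with vpunpcklwd and store a
; single dword with vmovd.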
define void @store_cvt_2f64_to_2i16(<2 x double> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_2f64_to_2i16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $32, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpextrw $0, %xmm0, 2(%rbx)
; AVX-NEXT:    addq $32, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: store_cvt_2f64_to_2i16:
; F16C:       # %bb.0:
; F16C-NEXT:    pushq %rbx
; F16C-NEXT:    subq $32, %rsp
; F16C-NEXT:    movq %rdi, %rbx
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vmovd %xmm0, (%rbx)
; F16C-NEXT:    addq $32, %rsp
; F16C-NEXT:    popq %rbx
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_2f64_to_2i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbx
; AVX512-NEXT:    subq $32, %rsp
; AVX512-NEXT:    movq %rdi, %rbx
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovd %xmm0, (%rbx)
; AVX512-NEXT:    addq $32, %rsp
; AVX512-NEXT:    popq %rbx
; AVX512-NEXT:    retq
  %1 = fptrunc <2 x double> %a0 to <2 x half>
  %2 = bitcast <2 x half> %1 to <2 x i16>
  store <2 x i16> %2, ptr %a1
  ret void
}

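; The four-element store follows the same pattern: four scalar vpextrw stores
; on AVX1/AVX2 versus a single packed vmovq on F16C/AVX512.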
define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind {
; AVX1-LABEL: store_cvt_4f64_to_4i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $80, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vpextrw $0, %xmm0, 4(%rbx)
; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpextrw $0, %xmm0, (%rbx)
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpextrw $0, %xmm0, 6(%rbx)
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpextrw $0, %xmm0, 2(%rbx)
; AVX1-NEXT:    addq $80, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    retq
;
; AVX2-LABEL: store_cvt_4f64_to_4i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $80, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vpextrw $0, %xmm0, 4(%rbx)
; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpextrw $0, %xmm0, (%rbx)
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpextrw $0, %xmm0, 6(%rbx)
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpextrw $0, %xmm0, 2(%rbx)
; AVX2-NEXT:    addq $80, %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    retq
;
; F16C-LABEL: store_cvt_4f64_to_4i16:
; F16C:       # %bb.0:
; F16C-NEXT:    pushq %rbx
; F16C-NEXT:    subq $64, %rsp
; F16C-NEXT:    movq %rdi, %rbx
; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; F16C-NEXT:    vmovq %xmm0, (%rbx)
; F16C-NEXT:    addq $64, %rsp
; F16C-NEXT:    popq %rbx
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_4f64_to_4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbx
; AVX512-NEXT:    subq $64, %rsp
; AVX512-NEXT:    movq %rdi, %rbx
; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT:    vmovq %xmm0, (%rbx)
; AVX512-NEXT:    addq $64, %rsp
; AVX512-NEXT:    popq %rbx
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  store <4 x i16> %2, ptr %a1
  ret void
}

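; The widened store variants below mirror cvt_4f64_to_8i16_undef/_zero, but
; finish with a full 16-byte vmovaps store of the assembled vector.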
define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind {
; AVX1-LABEL: store_cvt_4f64_to_8i16_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $80, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT:    vmovaps %xmm0, (%rbx)
; AVX1-NEXT:    addq $80, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    retq
;
; AVX2-LABEL: store_cvt_4f64_to_8i16_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $80, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX2-NEXT:    vmovaps %xmm0, (%rbx)
; AVX2-NEXT:    addq $80, %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    retq
;
; F16C-LABEL: store_cvt_4f64_to_8i16_undef:
; F16C:       # %bb.0:
; F16C-NEXT:    pushq %rbx
; F16C-NEXT:    subq $64, %rsp
; F16C-NEXT:    movq %rdi, %rbx
; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
; F16C-NEXT:    vmovaps %xmm0, (%rbx)
; F16C-NEXT:    addq $64, %rsp
; F16C-NEXT:    popq %rbx
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_4f64_to_8i16_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbx
; AVX512-NEXT:    subq $64, %rsp
; AVX512-NEXT:    movq %rdi, %rbx
; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX512-NEXT:    vmovaps %xmm0, (%rbx)
; AVX512-NEXT:    addq $64, %rsp
; AVX512-NEXT:    popq %rbx
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, ptr %a1
  ret void
}

define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind {
; AVX1-LABEL: store_cvt_4f64_to_8i16_zero:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $80, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT:    vmovaps %xmm0, (%rbx)
; AVX1-NEXT:    addq $80, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    retq
;
; AVX2-LABEL: store_cvt_4f64_to_8i16_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $80, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX2-NEXT:    vmovaps %xmm0, (%rbx)
; AVX2-NEXT:    addq $80, %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    retq
;
; F16C-LABEL: store_cvt_4f64_to_8i16_zero:
; F16C:       # %bb.0:
; F16C-NEXT:    pushq %rbx
; F16C-NEXT:    subq $64, %rsp
; F16C-NEXT:    movq %rdi, %rbx
; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
; F16C-NEXT:    vmovaps %xmm0, (%rbx)
; F16C-NEXT:    addq $64, %rsp
; F16C-NEXT:    popq %rbx
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_4f64_to_8i16_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbx
; AVX512-NEXT:    subq $64, %rsp
; AVX512-NEXT:    movq %rdi, %rbx
; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
; AVX512-NEXT:    vmovaps %xmm0, (%rbx)
; AVX512-NEXT:    addq $64, %rsp
; AVX512-NEXT:    popq %rbx
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, ptr %a1
  ret void
}

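; The eight-element store again makes eight libcalls and writes the fully
; assembled register with a single 16-byte vmovdqa.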
define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_8f64_to_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $96, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rbx)
; AVX-NEXT:    addq $96, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: store_cvt_8f64_to_8i16:
; F16C:       # %bb.0:
; F16C-NEXT:    pushq %rbx
; F16C-NEXT:    subq $96, %rsp
; F16C-NEXT:    movq %rdi, %rbx
; F16C-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT:    vextractf128 $1, %ymm1, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; F16C-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0]
; F16C-NEXT:    vmovdqa %xmm0, (%rbx)
; F16C-NEXT:    addq $96, %rsp
; F16C-NEXT:    popq %rbx
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_8f64_to_8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbx
; AVX512-NEXT:    subq $112, %rsp
; AVX512-NEXT:    movq %rdi, %rbx
; AVX512-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
4410; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4411; AVX512-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4412; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0]
4413; AVX512-NEXT:    vmovdqa %xmm0, (%rbx)
4414; AVX512-NEXT:    addq $112, %rsp
4415; AVX512-NEXT:    popq %rbx
4416; AVX512-NEXT:    retq
4417  %1 = fptrunc <8 x double> %a0 to <8 x half>
4418  %2 = bitcast <8 x half> %1 to <8 x i16>
4419  store <8 x i16> %2, ptr %a1
4420  ret void
4421}
4422
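;
; F16C only adds half<->float conversions (vcvtph2ps/vcvtps2ph), so the
; <8 x double> truncation above is scalarized into eight __truncdfhf2
; libcalls on every configuration tested here. A minimal scalar sketch of
; the same lowering (hypothetical function, no autogenerated checks):
;
;   define void @store_cvt_f64_to_i16(double %a0, ptr %a1) nounwind {
;     %1 = fptrunc double %a0 to half
;     %2 = bitcast half %1 to i16
;     store i16 %2, ptr %a1
;     ret void
;   }
;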
define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
; AVX1-LABEL: store_cvt_32f32_to_32f16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $176, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovaps %ymm0, 32(%rbx)
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vmovaps %ymm0, (%rbx)
; AVX1-NEXT:    addq $176, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: store_cvt_32f32_to_32f16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $176, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovdqa %ymm0, 32(%rbx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, (%rbx)
; AVX2-NEXT:    addq $176, %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; F16C-LABEL: store_cvt_32f32_to_32f16:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %ymm3, 48(%rdi)
; F16C-NEXT:    vcvtps2ph $4, %ymm2, 32(%rdi)
; F16C-NEXT:    vcvtps2ph $4, %ymm1, 16(%rdi)
; F16C-NEXT:    vcvtps2ph $4, %ymm0, (%rdi)
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_32f32_to_32f16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %zmm1, 32(%rdi)
; AVX512-NEXT:    vcvtps2ph $4, %zmm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = fptrunc <32 x float> %a0 to <32 x half>
  store <32 x half> %1, ptr %a1
  ret void
}

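;
; With F16C or AVX512F the f32->f16 fptrunc above maps directly onto
; vcvtps2ph; the $4 immediate selects the dynamic rounding mode from MXCSR.
; A single-vector sketch of the same pattern (hypothetical function, no
; autogenerated checks):
;
;   define void @store_cvt_8f32_to_8f16(<8 x float> %a0, ptr %a1) nounwind {
;     %1 = fptrunc <8 x float> %a0 to <8 x half>
;     store <8 x half> %1, ptr %a1
;     ret void
;   }
;
; which an F16C target can lower to a single "vcvtps2ph $4, %ymm0, (%rdi)"
; plus vzeroupper and retq.
;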
define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
; AVX-LABEL: fptosi_2f16_to_4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $40, %rsp
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    addq $40, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: fptosi_2f16_to_4i32:
; F16C:       # %bb.0:
; F16C-NEXT:    vpextrw $0, %xmm0, %eax
; F16C-NEXT:    movzwl %ax, %eax
; F16C-NEXT:    vmovd %eax, %xmm1
; F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
; F16C-NEXT:    vpsrld $16, %xmm0, %xmm0
; F16C-NEXT:    vpextrw $0, %xmm0, %eax
; F16C-NEXT:    movzwl %ax, %eax
; F16C-NEXT:    vmovd %eax, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; F16C-NEXT:    vcvttps2dq %xmm0, %xmm0
; F16C-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; F16C-NEXT:    retq
;
; AVX512-LABEL: fptosi_2f16_to_4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpextrw $0, %xmm0, %eax
; AVX512-NEXT:    movzwl %ax, %eax
; AVX512-NEXT:    vmovd %eax, %xmm1
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX512-NEXT:    vpextrw $0, %xmm0, %eax
; AVX512-NEXT:    movzwl %ax, %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-NEXT:    retq
  %cvt = fptosi <2 x half> %a to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %ext
}

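;
; In fptosi_2f16_to_4i32 above, each half lane is first extended to float
; (__extendhfsf2 libcalls on plain AVX, vcvtph2ps with F16C/AVX512), the
; pair is converted with vcvttps2dq (truncation toward zero), and the final
; vmovq zeroes the upper two lanes, realizing the shufflevector with
; zeroinitializer.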