1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
6
7;
8; Half to Float
9;
10
11define float @cvt_i16_to_f32(i16 %a0) nounwind {
12; AVX1-LABEL: cvt_i16_to_f32:
13; AVX1:       # BB#0:
14; AVX1-NEXT:    movswl %di, %eax
15; AVX1-NEXT:    vmovd %eax, %xmm0
16; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
17; AVX1-NEXT:    retq
18;
19; AVX2-LABEL: cvt_i16_to_f32:
20; AVX2:       # BB#0:
21; AVX2-NEXT:    movswl %di, %eax
22; AVX2-NEXT:    vmovd %eax, %xmm0
23; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
24; AVX2-NEXT:    retq
25;
26; AVX512F-LABEL: cvt_i16_to_f32:
27; AVX512F:       # BB#0:
28; AVX512F-NEXT:    movswl %di, %eax
29; AVX512F-NEXT:    vmovd %eax, %xmm0
30; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
31; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
32; AVX512F-NEXT:    retq
33;
34; AVX512VL-LABEL: cvt_i16_to_f32:
35; AVX512VL:       # BB#0:
36; AVX512VL-NEXT:    movswl %di, %eax
37; AVX512VL-NEXT:    vmovd %eax, %xmm0
38; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
39; AVX512VL-NEXT:    retq
40  %1 = bitcast i16 %a0 to half
41  %2 = fpext half %1 to float
42  ret float %2
43}
44
45define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
46; AVX1-LABEL: cvt_4i16_to_4f32:
47; AVX1:       # BB#0:
48; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
49; AVX1-NEXT:    vmovq %xmm0, %rax
50; AVX1-NEXT:    movq %rax, %rcx
51; AVX1-NEXT:    movq %rax, %rdx
52; AVX1-NEXT:    movswl %ax, %esi
53; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
54; AVX1-NEXT:    shrl $16, %eax
55; AVX1-NEXT:    shrq $32, %rcx
56; AVX1-NEXT:    shrq $48, %rdx
57; AVX1-NEXT:    movswl %dx, %edx
58; AVX1-NEXT:    vmovd %edx, %xmm0
59; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
60; AVX1-NEXT:    movswl %cx, %ecx
61; AVX1-NEXT:    vmovd %ecx, %xmm1
62; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
63; AVX1-NEXT:    cwtl
64; AVX1-NEXT:    vmovd %eax, %xmm2
65; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
66; AVX1-NEXT:    vmovd %esi, %xmm3
67; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
68; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
69; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
70; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
71; AVX1-NEXT:    retq
72;
73; AVX2-LABEL: cvt_4i16_to_4f32:
74; AVX2:       # BB#0:
75; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
76; AVX2-NEXT:    vmovq %xmm0, %rax
77; AVX2-NEXT:    movq %rax, %rcx
78; AVX2-NEXT:    movq %rax, %rdx
79; AVX2-NEXT:    movswl %ax, %esi
80; AVX2-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
81; AVX2-NEXT:    shrl $16, %eax
82; AVX2-NEXT:    shrq $32, %rcx
83; AVX2-NEXT:    shrq $48, %rdx
84; AVX2-NEXT:    movswl %dx, %edx
85; AVX2-NEXT:    vmovd %edx, %xmm0
86; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
87; AVX2-NEXT:    movswl %cx, %ecx
88; AVX2-NEXT:    vmovd %ecx, %xmm1
89; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
90; AVX2-NEXT:    cwtl
91; AVX2-NEXT:    vmovd %eax, %xmm2
92; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
93; AVX2-NEXT:    vmovd %esi, %xmm3
94; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
95; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
96; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
97; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
98; AVX2-NEXT:    retq
99;
100; AVX512F-LABEL: cvt_4i16_to_4f32:
101; AVX512F:       # BB#0:
102; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
103; AVX512F-NEXT:    vmovq %xmm0, %rax
104; AVX512F-NEXT:    movq %rax, %rcx
105; AVX512F-NEXT:    movq %rax, %rdx
106; AVX512F-NEXT:    movswl %ax, %esi
107; AVX512F-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
108; AVX512F-NEXT:    shrl $16, %eax
109; AVX512F-NEXT:    shrq $32, %rcx
110; AVX512F-NEXT:    shrq $48, %rdx
111; AVX512F-NEXT:    movswl %dx, %edx
112; AVX512F-NEXT:    vmovd %edx, %xmm0
113; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
114; AVX512F-NEXT:    movswl %cx, %ecx
115; AVX512F-NEXT:    vmovd %ecx, %xmm1
116; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
117; AVX512F-NEXT:    cwtl
118; AVX512F-NEXT:    vmovd %eax, %xmm2
119; AVX512F-NEXT:    vcvtph2ps %ymm2, %zmm2
120; AVX512F-NEXT:    vmovd %esi, %xmm3
121; AVX512F-NEXT:    vcvtph2ps %ymm3, %zmm3
122; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
123; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
124; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
125; AVX512F-NEXT:    retq
126;
127; AVX512VL-LABEL: cvt_4i16_to_4f32:
128; AVX512VL:       # BB#0:
129; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
130; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
131; AVX512VL-NEXT:    movq %rax, %rcx
132; AVX512VL-NEXT:    movq %rax, %rdx
133; AVX512VL-NEXT:    movswl %ax, %esi
134; AVX512VL-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
135; AVX512VL-NEXT:    shrl $16, %eax
136; AVX512VL-NEXT:    shrq $32, %rcx
137; AVX512VL-NEXT:    shrq $48, %rdx
138; AVX512VL-NEXT:    movswl %dx, %edx
139; AVX512VL-NEXT:    vmovd %edx, %xmm0
140; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
141; AVX512VL-NEXT:    movswl %cx, %ecx
142; AVX512VL-NEXT:    vmovd %ecx, %xmm1
143; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
144; AVX512VL-NEXT:    cwtl
145; AVX512VL-NEXT:    vmovd %eax, %xmm2
146; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
147; AVX512VL-NEXT:    vmovd %esi, %xmm3
148; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
149; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
150; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
151; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
152; AVX512VL-NEXT:    retq
153  %1 = bitcast <4 x i16> %a0 to <4 x half>
154  %2 = fpext <4 x half> %1 to <4 x float>
155  ret <4 x float> %2
156}
157
158define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
159; AVX1-LABEL: cvt_8i16_to_4f32:
160; AVX1:       # BB#0:
161; AVX1-NEXT:    vmovq %xmm0, %rax
162; AVX1-NEXT:    movq %rax, %rcx
163; AVX1-NEXT:    movq %rax, %rdx
164; AVX1-NEXT:    movswl %ax, %esi
165; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
166; AVX1-NEXT:    shrl $16, %eax
167; AVX1-NEXT:    shrq $32, %rcx
168; AVX1-NEXT:    shrq $48, %rdx
169; AVX1-NEXT:    movswl %dx, %edx
170; AVX1-NEXT:    vmovd %edx, %xmm0
171; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
172; AVX1-NEXT:    movswl %cx, %ecx
173; AVX1-NEXT:    vmovd %ecx, %xmm1
174; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
175; AVX1-NEXT:    cwtl
176; AVX1-NEXT:    vmovd %eax, %xmm2
177; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
178; AVX1-NEXT:    vmovd %esi, %xmm3
179; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
180; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
181; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
182; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
183; AVX1-NEXT:    retq
184;
185; AVX2-LABEL: cvt_8i16_to_4f32:
186; AVX2:       # BB#0:
187; AVX2-NEXT:    vmovq %xmm0, %rax
188; AVX2-NEXT:    movq %rax, %rcx
189; AVX2-NEXT:    movq %rax, %rdx
190; AVX2-NEXT:    movswl %ax, %esi
191; AVX2-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
192; AVX2-NEXT:    shrl $16, %eax
193; AVX2-NEXT:    shrq $32, %rcx
194; AVX2-NEXT:    shrq $48, %rdx
195; AVX2-NEXT:    movswl %dx, %edx
196; AVX2-NEXT:    vmovd %edx, %xmm0
197; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
198; AVX2-NEXT:    movswl %cx, %ecx
199; AVX2-NEXT:    vmovd %ecx, %xmm1
200; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
201; AVX2-NEXT:    cwtl
202; AVX2-NEXT:    vmovd %eax, %xmm2
203; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
204; AVX2-NEXT:    vmovd %esi, %xmm3
205; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
206; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
207; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
208; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
209; AVX2-NEXT:    retq
210;
211; AVX512F-LABEL: cvt_8i16_to_4f32:
212; AVX512F:       # BB#0:
213; AVX512F-NEXT:    vmovq %xmm0, %rax
214; AVX512F-NEXT:    movq %rax, %rcx
215; AVX512F-NEXT:    movq %rax, %rdx
216; AVX512F-NEXT:    movswl %ax, %esi
217; AVX512F-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
218; AVX512F-NEXT:    shrl $16, %eax
219; AVX512F-NEXT:    shrq $32, %rcx
220; AVX512F-NEXT:    shrq $48, %rdx
221; AVX512F-NEXT:    movswl %dx, %edx
222; AVX512F-NEXT:    vmovd %edx, %xmm0
223; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
224; AVX512F-NEXT:    movswl %cx, %ecx
225; AVX512F-NEXT:    vmovd %ecx, %xmm1
226; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
227; AVX512F-NEXT:    cwtl
228; AVX512F-NEXT:    vmovd %eax, %xmm2
229; AVX512F-NEXT:    vcvtph2ps %ymm2, %zmm2
230; AVX512F-NEXT:    vmovd %esi, %xmm3
231; AVX512F-NEXT:    vcvtph2ps %ymm3, %zmm3
232; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
233; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
234; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
235; AVX512F-NEXT:    retq
236;
237; AVX512VL-LABEL: cvt_8i16_to_4f32:
238; AVX512VL:       # BB#0:
239; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
240; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
241; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
242; AVX512VL-NEXT:    movq %rax, %rcx
243; AVX512VL-NEXT:    movq %rax, %rdx
244; AVX512VL-NEXT:    movswl %ax, %esi
245; AVX512VL-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
246; AVX512VL-NEXT:    shrl $16, %eax
247; AVX512VL-NEXT:    shrq $32, %rcx
248; AVX512VL-NEXT:    shrq $48, %rdx
249; AVX512VL-NEXT:    movswl %dx, %edx
250; AVX512VL-NEXT:    vmovd %edx, %xmm0
251; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
252; AVX512VL-NEXT:    movswl %cx, %ecx
253; AVX512VL-NEXT:    vmovd %ecx, %xmm1
254; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
255; AVX512VL-NEXT:    cwtl
256; AVX512VL-NEXT:    vmovd %eax, %xmm2
257; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
258; AVX512VL-NEXT:    vmovd %esi, %xmm3
259; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
260; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
261; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
262; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
263; AVX512VL-NEXT:    retq
264  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
265  %2 = bitcast <4 x i16> %1 to <4 x half>
266  %3 = fpext <4 x half> %2 to <4 x float>
267  ret <4 x float> %3
268}
269
270define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
271; AVX1-LABEL: cvt_8i16_to_8f32:
272; AVX1:       # BB#0:
273; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
274; AVX1-NEXT:    movq %rdx, %r8
275; AVX1-NEXT:    movq %rdx, %r10
276; AVX1-NEXT:    movswl %dx, %r9d
277; AVX1-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill>
278; AVX1-NEXT:    shrl $16, %edx
279; AVX1-NEXT:    shrq $32, %r8
280; AVX1-NEXT:    shrq $48, %r10
281; AVX1-NEXT:    vmovq %xmm0, %rdi
282; AVX1-NEXT:    movq %rdi, %rax
283; AVX1-NEXT:    movq %rdi, %rsi
284; AVX1-NEXT:    movswl %di, %ecx
285; AVX1-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
286; AVX1-NEXT:    shrl $16, %edi
287; AVX1-NEXT:    shrq $32, %rax
288; AVX1-NEXT:    shrq $48, %rsi
289; AVX1-NEXT:    movswl %si, %esi
290; AVX1-NEXT:    vmovd %esi, %xmm0
291; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
292; AVX1-NEXT:    cwtl
293; AVX1-NEXT:    vmovd %eax, %xmm1
294; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
295; AVX1-NEXT:    movswl %di, %eax
296; AVX1-NEXT:    vmovd %eax, %xmm2
297; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
298; AVX1-NEXT:    vmovd %ecx, %xmm3
299; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
300; AVX1-NEXT:    movswl %r10w, %eax
301; AVX1-NEXT:    vmovd %eax, %xmm4
302; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
303; AVX1-NEXT:    movswl %r8w, %eax
304; AVX1-NEXT:    vmovd %eax, %xmm5
305; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
306; AVX1-NEXT:    movswl %dx, %eax
307; AVX1-NEXT:    vmovd %eax, %xmm6
308; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
309; AVX1-NEXT:    vmovd %r9d, %xmm7
310; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
311; AVX1-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
312; AVX1-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
313; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
314; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
315; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
316; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
317; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
318; AVX1-NEXT:    retq
319;
320; AVX2-LABEL: cvt_8i16_to_8f32:
321; AVX2:       # BB#0:
322; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
323; AVX2-NEXT:    movq %rdx, %r8
324; AVX2-NEXT:    movq %rdx, %r10
325; AVX2-NEXT:    movswl %dx, %r9d
326; AVX2-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill>
327; AVX2-NEXT:    shrl $16, %edx
328; AVX2-NEXT:    shrq $32, %r8
329; AVX2-NEXT:    shrq $48, %r10
330; AVX2-NEXT:    vmovq %xmm0, %rdi
331; AVX2-NEXT:    movq %rdi, %rax
332; AVX2-NEXT:    movq %rdi, %rsi
333; AVX2-NEXT:    movswl %di, %ecx
334; AVX2-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
335; AVX2-NEXT:    shrl $16, %edi
336; AVX2-NEXT:    shrq $32, %rax
337; AVX2-NEXT:    shrq $48, %rsi
338; AVX2-NEXT:    movswl %si, %esi
339; AVX2-NEXT:    vmovd %esi, %xmm0
340; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
341; AVX2-NEXT:    cwtl
342; AVX2-NEXT:    vmovd %eax, %xmm1
343; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
344; AVX2-NEXT:    movswl %di, %eax
345; AVX2-NEXT:    vmovd %eax, %xmm2
346; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
347; AVX2-NEXT:    vmovd %ecx, %xmm3
348; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
349; AVX2-NEXT:    movswl %r10w, %eax
350; AVX2-NEXT:    vmovd %eax, %xmm4
351; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
352; AVX2-NEXT:    movswl %r8w, %eax
353; AVX2-NEXT:    vmovd %eax, %xmm5
354; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
355; AVX2-NEXT:    movswl %dx, %eax
356; AVX2-NEXT:    vmovd %eax, %xmm6
357; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
358; AVX2-NEXT:    vmovd %r9d, %xmm7
359; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
360; AVX2-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
361; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
362; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
363; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
364; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
365; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
366; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
367; AVX2-NEXT:    retq
368;
369; AVX512F-LABEL: cvt_8i16_to_8f32:
370; AVX512F:       # BB#0:
371; AVX512F-NEXT:    vpextrq $1, %xmm0, %rdx
372; AVX512F-NEXT:    movq %rdx, %r8
373; AVX512F-NEXT:    movq %rdx, %r9
374; AVX512F-NEXT:    movswl %dx, %r10d
375; AVX512F-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill>
376; AVX512F-NEXT:    shrl $16, %edx
377; AVX512F-NEXT:    shrq $32, %r8
378; AVX512F-NEXT:    shrq $48, %r9
379; AVX512F-NEXT:    vmovq %xmm0, %rdi
380; AVX512F-NEXT:    movq %rdi, %rax
381; AVX512F-NEXT:    movq %rdi, %rcx
382; AVX512F-NEXT:    movswl %di, %esi
383; AVX512F-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
384; AVX512F-NEXT:    shrl $16, %edi
385; AVX512F-NEXT:    shrq $32, %rax
386; AVX512F-NEXT:    shrq $48, %rcx
387; AVX512F-NEXT:    movswl %cx, %ecx
388; AVX512F-NEXT:    vmovd %ecx, %xmm0
389; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
390; AVX512F-NEXT:    cwtl
391; AVX512F-NEXT:    vmovd %eax, %xmm1
392; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
393; AVX512F-NEXT:    movswl %di, %eax
394; AVX512F-NEXT:    vmovd %eax, %xmm2
395; AVX512F-NEXT:    vcvtph2ps %ymm2, %zmm2
396; AVX512F-NEXT:    vmovd %esi, %xmm3
397; AVX512F-NEXT:    vcvtph2ps %ymm3, %zmm3
398; AVX512F-NEXT:    movswl %r9w, %eax
399; AVX512F-NEXT:    vmovd %eax, %xmm4
400; AVX512F-NEXT:    vcvtph2ps %ymm4, %zmm4
401; AVX512F-NEXT:    movswl %r8w, %eax
402; AVX512F-NEXT:    vmovd %eax, %xmm5
403; AVX512F-NEXT:    vcvtph2ps %ymm5, %zmm5
404; AVX512F-NEXT:    movswl %dx, %eax
405; AVX512F-NEXT:    vmovd %eax, %xmm6
406; AVX512F-NEXT:    vcvtph2ps %ymm6, %zmm6
407; AVX512F-NEXT:    vmovd %r10d, %xmm7
408; AVX512F-NEXT:    vcvtph2ps %ymm7, %zmm7
409; AVX512F-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
410; AVX512F-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
411; AVX512F-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
412; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
413; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
414; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
415; AVX512F-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
416; AVX512F-NEXT:    retq
417;
418; AVX512VL-LABEL: cvt_8i16_to_8f32:
419; AVX512VL:       # BB#0:
420; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rdx
421; AVX512VL-NEXT:    movq %rdx, %r8
422; AVX512VL-NEXT:    movq %rdx, %r10
423; AVX512VL-NEXT:    movswl %dx, %r9d
424; AVX512VL-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill>
425; AVX512VL-NEXT:    shrl $16, %edx
426; AVX512VL-NEXT:    shrq $32, %r8
427; AVX512VL-NEXT:    shrq $48, %r10
428; AVX512VL-NEXT:    vmovq %xmm0, %rdi
429; AVX512VL-NEXT:    movq %rdi, %rax
430; AVX512VL-NEXT:    movq %rdi, %rsi
431; AVX512VL-NEXT:    movswl %di, %ecx
432; AVX512VL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
433; AVX512VL-NEXT:    shrl $16, %edi
434; AVX512VL-NEXT:    shrq $32, %rax
435; AVX512VL-NEXT:    shrq $48, %rsi
436; AVX512VL-NEXT:    movswl %si, %esi
437; AVX512VL-NEXT:    vmovd %esi, %xmm0
438; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
439; AVX512VL-NEXT:    cwtl
440; AVX512VL-NEXT:    vmovd %eax, %xmm1
441; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
442; AVX512VL-NEXT:    movswl %di, %eax
443; AVX512VL-NEXT:    vmovd %eax, %xmm2
444; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
445; AVX512VL-NEXT:    vmovd %ecx, %xmm3
446; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
447; AVX512VL-NEXT:    movswl %r10w, %eax
448; AVX512VL-NEXT:    vmovd %eax, %xmm4
449; AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
450; AVX512VL-NEXT:    movswl %r8w, %eax
451; AVX512VL-NEXT:    vmovd %eax, %xmm5
452; AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
453; AVX512VL-NEXT:    movswl %dx, %eax
454; AVX512VL-NEXT:    vmovd %eax, %xmm6
455; AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
456; AVX512VL-NEXT:    vmovd %r9d, %xmm7
457; AVX512VL-NEXT:    vcvtph2ps %xmm7, %xmm7
458; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
459; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
460; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
461; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
462; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
463; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
464; AVX512VL-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
465; AVX512VL-NEXT:    retq
466  %1 = bitcast <8 x i16> %a0 to <8 x half>
467  %2 = fpext <8 x half> %1 to <8 x float>
468  ret <8 x float> %2
469}
470
471define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
472; AVX1-LABEL: cvt_16i16_to_16f32:
473; AVX1:       # BB#0:
474; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
475; AVX1-NEXT:    vmovq %xmm4, %rax
476; AVX1-NEXT:    movq %rax, %rcx
477; AVX1-NEXT:    shrq $48, %rcx
478; AVX1-NEXT:    movswl %cx, %ecx
479; AVX1-NEXT:    vmovd %ecx, %xmm8
480; AVX1-NEXT:    movq %rax, %rcx
481; AVX1-NEXT:    shrq $32, %rcx
482; AVX1-NEXT:    movswl %cx, %ecx
483; AVX1-NEXT:    vmovd %ecx, %xmm9
484; AVX1-NEXT:    movswl %ax, %ecx
485; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
486; AVX1-NEXT:    shrl $16, %eax
487; AVX1-NEXT:    cwtl
488; AVX1-NEXT:    vmovd %eax, %xmm10
489; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
490; AVX1-NEXT:    vmovd %ecx, %xmm11
491; AVX1-NEXT:    movq %rax, %rcx
492; AVX1-NEXT:    shrq $48, %rcx
493; AVX1-NEXT:    movswl %cx, %ecx
494; AVX1-NEXT:    vmovd %ecx, %xmm12
495; AVX1-NEXT:    movq %rax, %rcx
496; AVX1-NEXT:    shrq $32, %rcx
497; AVX1-NEXT:    movswl %cx, %ecx
498; AVX1-NEXT:    vmovd %ecx, %xmm13
499; AVX1-NEXT:    movswl %ax, %ecx
500; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
501; AVX1-NEXT:    shrl $16, %eax
502; AVX1-NEXT:    cwtl
503; AVX1-NEXT:    vmovd %eax, %xmm14
504; AVX1-NEXT:    vmovq %xmm0, %rax
505; AVX1-NEXT:    vmovd %ecx, %xmm15
506; AVX1-NEXT:    movq %rax, %rcx
507; AVX1-NEXT:    shrq $48, %rcx
508; AVX1-NEXT:    movswl %cx, %ecx
509; AVX1-NEXT:    vmovd %ecx, %xmm2
510; AVX1-NEXT:    movq %rax, %rcx
511; AVX1-NEXT:    shrq $32, %rcx
512; AVX1-NEXT:    movswl %cx, %ecx
513; AVX1-NEXT:    vmovd %ecx, %xmm3
514; AVX1-NEXT:    movswl %ax, %ecx
515; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
516; AVX1-NEXT:    shrl $16, %eax
517; AVX1-NEXT:    cwtl
518; AVX1-NEXT:    vmovd %eax, %xmm4
519; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
520; AVX1-NEXT:    vmovd %ecx, %xmm0
521; AVX1-NEXT:    movq %rax, %rcx
522; AVX1-NEXT:    shrq $48, %rcx
523; AVX1-NEXT:    movswl %cx, %ecx
524; AVX1-NEXT:    vmovd %ecx, %xmm5
525; AVX1-NEXT:    movq %rax, %rcx
526; AVX1-NEXT:    shrq $32, %rcx
527; AVX1-NEXT:    movswl %cx, %ecx
528; AVX1-NEXT:    vmovd %ecx, %xmm6
529; AVX1-NEXT:    movl %eax, %ecx
530; AVX1-NEXT:    shrl $16, %ecx
531; AVX1-NEXT:    movswl %cx, %ecx
532; AVX1-NEXT:    vmovd %ecx, %xmm7
533; AVX1-NEXT:    cwtl
534; AVX1-NEXT:    vmovd %eax, %xmm1
535; AVX1-NEXT:    vcvtph2ps %xmm8, %xmm8
536; AVX1-NEXT:    vcvtph2ps %xmm9, %xmm9
537; AVX1-NEXT:    vcvtph2ps %xmm10, %xmm10
538; AVX1-NEXT:    vcvtph2ps %xmm11, %xmm11
539; AVX1-NEXT:    vcvtph2ps %xmm12, %xmm12
540; AVX1-NEXT:    vcvtph2ps %xmm13, %xmm13
541; AVX1-NEXT:    vcvtph2ps %xmm14, %xmm14
542; AVX1-NEXT:    vcvtph2ps %xmm15, %xmm15
543; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
544; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
545; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
546; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
547; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
548; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
549; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
550; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
551; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
552; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
553; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
554; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
555; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
556; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
557; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
558; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
559; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
560; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
561; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
562; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
563; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
564; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
565; AVX1-NEXT:    retq
566;
567; AVX2-LABEL: cvt_16i16_to_16f32:
568; AVX2:       # BB#0:
569; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
570; AVX2-NEXT:    vmovq %xmm4, %rax
571; AVX2-NEXT:    movq %rax, %rcx
572; AVX2-NEXT:    shrq $48, %rcx
573; AVX2-NEXT:    movswl %cx, %ecx
574; AVX2-NEXT:    vmovd %ecx, %xmm8
575; AVX2-NEXT:    movq %rax, %rcx
576; AVX2-NEXT:    shrq $32, %rcx
577; AVX2-NEXT:    movswl %cx, %ecx
578; AVX2-NEXT:    vmovd %ecx, %xmm9
579; AVX2-NEXT:    movswl %ax, %ecx
580; AVX2-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
581; AVX2-NEXT:    shrl $16, %eax
582; AVX2-NEXT:    cwtl
583; AVX2-NEXT:    vmovd %eax, %xmm10
584; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
585; AVX2-NEXT:    vmovd %ecx, %xmm11
586; AVX2-NEXT:    movq %rax, %rcx
587; AVX2-NEXT:    shrq $48, %rcx
588; AVX2-NEXT:    movswl %cx, %ecx
589; AVX2-NEXT:    vmovd %ecx, %xmm12
590; AVX2-NEXT:    movq %rax, %rcx
591; AVX2-NEXT:    shrq $32, %rcx
592; AVX2-NEXT:    movswl %cx, %ecx
593; AVX2-NEXT:    vmovd %ecx, %xmm13
594; AVX2-NEXT:    movswl %ax, %ecx
595; AVX2-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
596; AVX2-NEXT:    shrl $16, %eax
597; AVX2-NEXT:    cwtl
598; AVX2-NEXT:    vmovd %eax, %xmm14
599; AVX2-NEXT:    vmovq %xmm0, %rax
600; AVX2-NEXT:    vmovd %ecx, %xmm15
601; AVX2-NEXT:    movq %rax, %rcx
602; AVX2-NEXT:    shrq $48, %rcx
603; AVX2-NEXT:    movswl %cx, %ecx
604; AVX2-NEXT:    vmovd %ecx, %xmm2
605; AVX2-NEXT:    movq %rax, %rcx
606; AVX2-NEXT:    shrq $32, %rcx
607; AVX2-NEXT:    movswl %cx, %ecx
608; AVX2-NEXT:    vmovd %ecx, %xmm3
609; AVX2-NEXT:    movswl %ax, %ecx
610; AVX2-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
611; AVX2-NEXT:    shrl $16, %eax
612; AVX2-NEXT:    cwtl
613; AVX2-NEXT:    vmovd %eax, %xmm4
614; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
615; AVX2-NEXT:    vmovd %ecx, %xmm0
616; AVX2-NEXT:    movq %rax, %rcx
617; AVX2-NEXT:    shrq $48, %rcx
618; AVX2-NEXT:    movswl %cx, %ecx
619; AVX2-NEXT:    vmovd %ecx, %xmm5
620; AVX2-NEXT:    movq %rax, %rcx
621; AVX2-NEXT:    shrq $32, %rcx
622; AVX2-NEXT:    movswl %cx, %ecx
623; AVX2-NEXT:    vmovd %ecx, %xmm6
624; AVX2-NEXT:    movl %eax, %ecx
625; AVX2-NEXT:    shrl $16, %ecx
626; AVX2-NEXT:    movswl %cx, %ecx
627; AVX2-NEXT:    vmovd %ecx, %xmm7
628; AVX2-NEXT:    cwtl
629; AVX2-NEXT:    vmovd %eax, %xmm1
630; AVX2-NEXT:    vcvtph2ps %xmm8, %xmm8
631; AVX2-NEXT:    vcvtph2ps %xmm9, %xmm9
632; AVX2-NEXT:    vcvtph2ps %xmm10, %xmm10
633; AVX2-NEXT:    vcvtph2ps %xmm11, %xmm11
634; AVX2-NEXT:    vcvtph2ps %xmm12, %xmm12
635; AVX2-NEXT:    vcvtph2ps %xmm13, %xmm13
636; AVX2-NEXT:    vcvtph2ps %xmm14, %xmm14
637; AVX2-NEXT:    vcvtph2ps %xmm15, %xmm15
638; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
639; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
640; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
641; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
642; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
643; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
644; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
645; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
646; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
647; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
648; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
649; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
650; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
651; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
652; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
653; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
654; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
655; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
656; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
657; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
658; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
659; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
660; AVX2-NEXT:    retq
661;
662; AVX512F-LABEL: cvt_16i16_to_16f32:
663; AVX512F:       # BB#0:
664; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
665; AVX512F-NEXT:    vmovq %xmm0, %rax
666; AVX512F-NEXT:    movq %rax, %rcx
667; AVX512F-NEXT:    shrq $48, %rcx
668; AVX512F-NEXT:    movswl %cx, %ecx
669; AVX512F-NEXT:    vmovd %ecx, %xmm2
670; AVX512F-NEXT:    movq %rax, %rcx
671; AVX512F-NEXT:    shrq $32, %rcx
672; AVX512F-NEXT:    movswl %cx, %ecx
673; AVX512F-NEXT:    vmovd %ecx, %xmm3
674; AVX512F-NEXT:    movswl %ax, %ecx
675; AVX512F-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
676; AVX512F-NEXT:    shrl $16, %eax
677; AVX512F-NEXT:    cwtl
678; AVX512F-NEXT:    vmovd %eax, %xmm4
679; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
680; AVX512F-NEXT:    vmovd %ecx, %xmm0
681; AVX512F-NEXT:    movq %rax, %rcx
682; AVX512F-NEXT:    shrq $48, %rcx
683; AVX512F-NEXT:    movswl %cx, %ecx
684; AVX512F-NEXT:    vmovd %ecx, %xmm5
685; AVX512F-NEXT:    movq %rax, %rcx
686; AVX512F-NEXT:    shrq $32, %rcx
687; AVX512F-NEXT:    movswl %cx, %ecx
688; AVX512F-NEXT:    vmovd %ecx, %xmm6
689; AVX512F-NEXT:    movswl %ax, %ecx
690; AVX512F-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
691; AVX512F-NEXT:    shrl $16, %eax
692; AVX512F-NEXT:    cwtl
693; AVX512F-NEXT:    vmovd %eax, %xmm7
694; AVX512F-NEXT:    vmovq %xmm1, %rax
695; AVX512F-NEXT:    vmovd %ecx, %xmm8
696; AVX512F-NEXT:    movq %rax, %rcx
697; AVX512F-NEXT:    shrq $48, %rcx
698; AVX512F-NEXT:    movswl %cx, %ecx
699; AVX512F-NEXT:    vmovd %ecx, %xmm9
700; AVX512F-NEXT:    movq %rax, %rcx
701; AVX512F-NEXT:    shrq $32, %rcx
702; AVX512F-NEXT:    movswl %cx, %ecx
703; AVX512F-NEXT:    vmovd %ecx, %xmm10
704; AVX512F-NEXT:    movswl %ax, %ecx
705; AVX512F-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
706; AVX512F-NEXT:    shrl $16, %eax
707; AVX512F-NEXT:    cwtl
708; AVX512F-NEXT:    vmovd %eax, %xmm11
709; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
710; AVX512F-NEXT:    vmovd %ecx, %xmm1
711; AVX512F-NEXT:    movq %rax, %rcx
712; AVX512F-NEXT:    shrq $48, %rcx
713; AVX512F-NEXT:    movswl %cx, %ecx
714; AVX512F-NEXT:    vmovd %ecx, %xmm12
715; AVX512F-NEXT:    movq %rax, %rcx
716; AVX512F-NEXT:    shrq $32, %rcx
717; AVX512F-NEXT:    movswl %cx, %ecx
718; AVX512F-NEXT:    vmovd %ecx, %xmm13
719; AVX512F-NEXT:    movl %eax, %ecx
720; AVX512F-NEXT:    shrl $16, %ecx
721; AVX512F-NEXT:    movswl %cx, %ecx
722; AVX512F-NEXT:    vmovd %ecx, %xmm14
723; AVX512F-NEXT:    cwtl
724; AVX512F-NEXT:    vmovd %eax, %xmm15
725; AVX512F-NEXT:    vcvtph2ps %ymm2, %zmm16
726; AVX512F-NEXT:    vcvtph2ps %ymm3, %zmm3
727; AVX512F-NEXT:    vcvtph2ps %ymm4, %zmm4
728; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
729; AVX512F-NEXT:    vcvtph2ps %ymm5, %zmm5
730; AVX512F-NEXT:    vcvtph2ps %ymm6, %zmm6
731; AVX512F-NEXT:    vcvtph2ps %ymm7, %zmm7
732; AVX512F-NEXT:    vcvtph2ps %ymm8, %zmm8
733; AVX512F-NEXT:    vcvtph2ps %ymm9, %zmm9
734; AVX512F-NEXT:    vcvtph2ps %ymm10, %zmm10
735; AVX512F-NEXT:    vcvtph2ps %ymm11, %zmm11
736; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
737; AVX512F-NEXT:    vcvtph2ps %ymm12, %zmm12
738; AVX512F-NEXT:    vcvtph2ps %ymm13, %zmm13
739; AVX512F-NEXT:    vcvtph2ps %ymm14, %zmm14
740; AVX512F-NEXT:    vcvtph2ps %ymm15, %zmm15
741; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[2,3]
742; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm13[0],xmm2[3]
743; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[0]
744; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[2,3]
745; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm10[0],xmm1[3]
746; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm9[0]
747; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
748; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[2,3]
749; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0],xmm2[3]
750; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[0]
751; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
752; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
753; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm16[0]
754; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
755; AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
756; AVX512F-NEXT:    retq
757;
758; AVX512VL-LABEL: cvt_16i16_to_16f32:
759; AVX512VL:       # BB#0:
760; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm10
761; AVX512VL-NEXT:    vmovq %xmm0, %rax
762; AVX512VL-NEXT:    movq %rax, %rcx
763; AVX512VL-NEXT:    shrq $48, %rcx
764; AVX512VL-NEXT:    movswl %cx, %ecx
765; AVX512VL-NEXT:    vmovd %ecx, %xmm8
766; AVX512VL-NEXT:    movq %rax, %rcx
767; AVX512VL-NEXT:    shrq $32, %rcx
768; AVX512VL-NEXT:    movswl %cx, %ecx
769; AVX512VL-NEXT:    vmovd %ecx, %xmm9
770; AVX512VL-NEXT:    movswl %ax, %ecx
771; AVX512VL-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
772; AVX512VL-NEXT:    shrl $16, %eax
773; AVX512VL-NEXT:    cwtl
774; AVX512VL-NEXT:    vmovd %eax, %xmm11
775; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
776; AVX512VL-NEXT:    vmovd %ecx, %xmm12
777; AVX512VL-NEXT:    movq %rax, %rcx
778; AVX512VL-NEXT:    shrq $48, %rcx
779; AVX512VL-NEXT:    movswl %cx, %ecx
780; AVX512VL-NEXT:    vmovd %ecx, %xmm13
781; AVX512VL-NEXT:    movq %rax, %rcx
782; AVX512VL-NEXT:    shrq $32, %rcx
783; AVX512VL-NEXT:    movswl %cx, %ecx
784; AVX512VL-NEXT:    vmovd %ecx, %xmm14
785; AVX512VL-NEXT:    movswl %ax, %ecx
786; AVX512VL-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
787; AVX512VL-NEXT:    shrl $16, %eax
788; AVX512VL-NEXT:    cwtl
789; AVX512VL-NEXT:    vmovd %eax, %xmm15
790; AVX512VL-NEXT:    vmovq %xmm10, %rax
791; AVX512VL-NEXT:    vmovd %ecx, %xmm16
792; AVX512VL-NEXT:    movq %rax, %rcx
793; AVX512VL-NEXT:    shrq $48, %rcx
794; AVX512VL-NEXT:    movswl %cx, %ecx
795; AVX512VL-NEXT:    vmovd %ecx, %xmm17
796; AVX512VL-NEXT:    movq %rax, %rcx
797; AVX512VL-NEXT:    shrq $32, %rcx
798; AVX512VL-NEXT:    movswl %cx, %ecx
799; AVX512VL-NEXT:    vmovd %ecx, %xmm18
800; AVX512VL-NEXT:    movswl %ax, %ecx
801; AVX512VL-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
802; AVX512VL-NEXT:    shrl $16, %eax
803; AVX512VL-NEXT:    cwtl
804; AVX512VL-NEXT:    vmovd %eax, %xmm19
805; AVX512VL-NEXT:    vpextrq $1, %xmm10, %rax
806; AVX512VL-NEXT:    vmovd %ecx, %xmm10
807; AVX512VL-NEXT:    movq %rax, %rcx
808; AVX512VL-NEXT:    shrq $48, %rcx
809; AVX512VL-NEXT:    movswl %cx, %ecx
810; AVX512VL-NEXT:    vmovd %ecx, %xmm20
811; AVX512VL-NEXT:    movq %rax, %rcx
812; AVX512VL-NEXT:    shrq $32, %rcx
813; AVX512VL-NEXT:    movswl %cx, %ecx
814; AVX512VL-NEXT:    vmovd %ecx, %xmm21
815; AVX512VL-NEXT:    movl %eax, %ecx
816; AVX512VL-NEXT:    shrl $16, %ecx
817; AVX512VL-NEXT:    movswl %cx, %ecx
818; AVX512VL-NEXT:    vmovd %ecx, %xmm22
819; AVX512VL-NEXT:    cwtl
820; AVX512VL-NEXT:    vmovd %eax, %xmm2
821; AVX512VL-NEXT:    vcvtph2ps %xmm8, %xmm8
822; AVX512VL-NEXT:    vcvtph2ps %xmm9, %xmm9
823; AVX512VL-NEXT:    vcvtph2ps %xmm11, %xmm11
824; AVX512VL-NEXT:    vcvtph2ps %xmm12, %xmm12
825; AVX512VL-NEXT:    vcvtph2ps %xmm13, %xmm13
826; AVX512VL-NEXT:    vcvtph2ps %xmm14, %xmm14
827; AVX512VL-NEXT:    vcvtph2ps %xmm15, %xmm15
828; AVX512VL-NEXT:    vcvtph2ps %xmm16, %xmm16
829; AVX512VL-NEXT:    vcvtph2ps %xmm17, %xmm4
830; AVX512VL-NEXT:    vcvtph2ps %xmm18, %xmm0
831; AVX512VL-NEXT:    vcvtph2ps %xmm19, %xmm5
832; AVX512VL-NEXT:    vcvtph2ps %xmm10, %xmm7
833; AVX512VL-NEXT:    vcvtph2ps %xmm20, %xmm3
834; AVX512VL-NEXT:    vcvtph2ps %xmm21, %xmm6
835; AVX512VL-NEXT:    vcvtph2ps %xmm22, %xmm1
836; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
837; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
838; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
839; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
840; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[2,3]
841; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
842; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
843; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
844; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[2,3]
845; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
846; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
847; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
848; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
849; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
850; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
851; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
852; AVX512VL-NEXT:    retq
853  %1 = bitcast <16 x i16> %a0 to <16 x half>
854  %2 = fpext <16 x half> %1 to <16 x float>
855  ret <16 x float> %2
856}
857
858;
859; Half to Float (Load)
860;
861
862define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
863; AVX1-LABEL: load_cvt_i16_to_f32:
864; AVX1:       # BB#0:
865; AVX1-NEXT:    movswl (%rdi), %eax
866; AVX1-NEXT:    vmovd %eax, %xmm0
867; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
868; AVX1-NEXT:    retq
869;
870; AVX2-LABEL: load_cvt_i16_to_f32:
871; AVX2:       # BB#0:
872; AVX2-NEXT:    movswl (%rdi), %eax
873; AVX2-NEXT:    vmovd %eax, %xmm0
874; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
875; AVX2-NEXT:    retq
876;
877; AVX512F-LABEL: load_cvt_i16_to_f32:
878; AVX512F:       # BB#0:
879; AVX512F-NEXT:    movswl (%rdi), %eax
880; AVX512F-NEXT:    vmovd %eax, %xmm0
881; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
882; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
883; AVX512F-NEXT:    retq
884;
885; AVX512VL-LABEL: load_cvt_i16_to_f32:
886; AVX512VL:       # BB#0:
887; AVX512VL-NEXT:    movswl (%rdi), %eax
888; AVX512VL-NEXT:    vmovd %eax, %xmm0
889; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
890; AVX512VL-NEXT:    retq
891  %1 = load i16, i16* %a0
892  %2 = bitcast i16 %1 to half
893  %3 = fpext half %2 to float
894  ret float %3
895}
896
897define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
898; AVX1-LABEL: load_cvt_4i16_to_4f32:
899; AVX1:       # BB#0:
900; AVX1-NEXT:    movswl 6(%rdi), %eax
901; AVX1-NEXT:    vmovd %eax, %xmm0
902; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
903; AVX1-NEXT:    movswl 4(%rdi), %eax
904; AVX1-NEXT:    vmovd %eax, %xmm1
905; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
906; AVX1-NEXT:    movswl (%rdi), %eax
907; AVX1-NEXT:    vmovd %eax, %xmm2
908; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
909; AVX1-NEXT:    movswl 2(%rdi), %eax
910; AVX1-NEXT:    vmovd %eax, %xmm3
911; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
912; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
913; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
914; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
915; AVX1-NEXT:    retq
916;
917; AVX2-LABEL: load_cvt_4i16_to_4f32:
918; AVX2:       # BB#0:
919; AVX2-NEXT:    movswl 6(%rdi), %eax
920; AVX2-NEXT:    vmovd %eax, %xmm0
921; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
922; AVX2-NEXT:    movswl 4(%rdi), %eax
923; AVX2-NEXT:    vmovd %eax, %xmm1
924; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
925; AVX2-NEXT:    movswl (%rdi), %eax
926; AVX2-NEXT:    vmovd %eax, %xmm2
927; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
928; AVX2-NEXT:    movswl 2(%rdi), %eax
929; AVX2-NEXT:    vmovd %eax, %xmm3
930; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
931; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
932; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
933; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
934; AVX2-NEXT:    retq
935;
936; AVX512F-LABEL: load_cvt_4i16_to_4f32:
937; AVX512F:       # BB#0:
938; AVX512F-NEXT:    movswl 6(%rdi), %eax
939; AVX512F-NEXT:    vmovd %eax, %xmm0
940; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
941; AVX512F-NEXT:    movswl 4(%rdi), %eax
942; AVX512F-NEXT:    vmovd %eax, %xmm1
943; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
944; AVX512F-NEXT:    movswl (%rdi), %eax
945; AVX512F-NEXT:    vmovd %eax, %xmm2
946; AVX512F-NEXT:    vcvtph2ps %ymm2, %zmm2
947; AVX512F-NEXT:    movswl 2(%rdi), %eax
948; AVX512F-NEXT:    vmovd %eax, %xmm3
949; AVX512F-NEXT:    vcvtph2ps %ymm3, %zmm3
950; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
951; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
952; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
953; AVX512F-NEXT:    retq
954;
955; AVX512VL-LABEL: load_cvt_4i16_to_4f32:
956; AVX512VL:       # BB#0:
957; AVX512VL-NEXT:    movswl 6(%rdi), %eax
958; AVX512VL-NEXT:    vmovd %eax, %xmm0
959; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
960; AVX512VL-NEXT:    movswl 4(%rdi), %eax
961; AVX512VL-NEXT:    vmovd %eax, %xmm1
962; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
963; AVX512VL-NEXT:    movswl (%rdi), %eax
964; AVX512VL-NEXT:    vmovd %eax, %xmm2
965; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
966; AVX512VL-NEXT:    movswl 2(%rdi), %eax
967; AVX512VL-NEXT:    vmovd %eax, %xmm3
968; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
969; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
970; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
971; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
972; AVX512VL-NEXT:    retq
973  %1 = load <4 x i16>, <4 x i16>* %a0
974  %2 = bitcast <4 x i16> %1 to <4 x half>
975  %3 = fpext <4 x half> %2 to <4 x float>
976  ret <4 x float> %3
977}
978
979define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
980; AVX1-LABEL: load_cvt_8i16_to_4f32:
981; AVX1:       # BB#0:
982; AVX1-NEXT:    movq (%rdi), %rax
983; AVX1-NEXT:    movq %rax, %rcx
984; AVX1-NEXT:    movq %rax, %rdx
985; AVX1-NEXT:    movswl %ax, %esi
986; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
987; AVX1-NEXT:    shrl $16, %eax
988; AVX1-NEXT:    shrq $32, %rcx
989; AVX1-NEXT:    shrq $48, %rdx
990; AVX1-NEXT:    movswl %dx, %edx
991; AVX1-NEXT:    vmovd %edx, %xmm0
992; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
993; AVX1-NEXT:    movswl %cx, %ecx
994; AVX1-NEXT:    vmovd %ecx, %xmm1
995; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
996; AVX1-NEXT:    cwtl
997; AVX1-NEXT:    vmovd %eax, %xmm2
998; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
999; AVX1-NEXT:    vmovd %esi, %xmm3
1000; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
1001; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
1002; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1003; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1004; AVX1-NEXT:    retq
1005;
1006; AVX2-LABEL: load_cvt_8i16_to_4f32:
1007; AVX2:       # BB#0:
1008; AVX2-NEXT:    movq (%rdi), %rax
1009; AVX2-NEXT:    movq %rax, %rcx
1010; AVX2-NEXT:    movq %rax, %rdx
1011; AVX2-NEXT:    movswl %ax, %esi
1012; AVX2-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
1013; AVX2-NEXT:    shrl $16, %eax
1014; AVX2-NEXT:    shrq $32, %rcx
1015; AVX2-NEXT:    shrq $48, %rdx
1016; AVX2-NEXT:    movswl %dx, %edx
1017; AVX2-NEXT:    vmovd %edx, %xmm0
1018; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1019; AVX2-NEXT:    movswl %cx, %ecx
1020; AVX2-NEXT:    vmovd %ecx, %xmm1
1021; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
1022; AVX2-NEXT:    cwtl
1023; AVX2-NEXT:    vmovd %eax, %xmm2
1024; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
1025; AVX2-NEXT:    vmovd %esi, %xmm3
1026; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
1027; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
1028; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1029; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1030; AVX2-NEXT:    retq
1031;
1032; AVX512F-LABEL: load_cvt_8i16_to_4f32:
1033; AVX512F:       # BB#0:
1034; AVX512F-NEXT:    movq (%rdi), %rax
1035; AVX512F-NEXT:    movq %rax, %rcx
1036; AVX512F-NEXT:    movq %rax, %rdx
1037; AVX512F-NEXT:    movswl %ax, %esi
1038; AVX512F-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
1039; AVX512F-NEXT:    shrl $16, %eax
1040; AVX512F-NEXT:    shrq $32, %rcx
1041; AVX512F-NEXT:    shrq $48, %rdx
1042; AVX512F-NEXT:    movswl %dx, %edx
1043; AVX512F-NEXT:    vmovd %edx, %xmm0
1044; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
1045; AVX512F-NEXT:    movswl %cx, %ecx
1046; AVX512F-NEXT:    vmovd %ecx, %xmm1
1047; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
1048; AVX512F-NEXT:    cwtl
1049; AVX512F-NEXT:    vmovd %eax, %xmm2
1050; AVX512F-NEXT:    vcvtph2ps %ymm2, %zmm2
1051; AVX512F-NEXT:    vmovd %esi, %xmm3
1052; AVX512F-NEXT:    vcvtph2ps %ymm3, %zmm3
1053; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
1054; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1055; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1056; AVX512F-NEXT:    retq
1057;
1058; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
1059; AVX512VL:       # BB#0:
1060; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1061; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
1062; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
1063; AVX512VL-NEXT:    movq %rax, %rcx
1064; AVX512VL-NEXT:    movq %rax, %rdx
1065; AVX512VL-NEXT:    movswl %ax, %esi
1066; AVX512VL-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
1067; AVX512VL-NEXT:    shrl $16, %eax
1068; AVX512VL-NEXT:    shrq $32, %rcx
1069; AVX512VL-NEXT:    shrq $48, %rdx
1070; AVX512VL-NEXT:    movswl %dx, %edx
1071; AVX512VL-NEXT:    vmovd %edx, %xmm0
1072; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
1073; AVX512VL-NEXT:    movswl %cx, %ecx
1074; AVX512VL-NEXT:    vmovd %ecx, %xmm1
1075; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
1076; AVX512VL-NEXT:    cwtl
1077; AVX512VL-NEXT:    vmovd %eax, %xmm2
1078; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
1079; AVX512VL-NEXT:    vmovd %esi, %xmm3
1080; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
1081; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
1082; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1083; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1084; AVX512VL-NEXT:    retq
1085  %1 = load <8 x i16>, <8 x i16>* %a0
1086  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1087  %3 = bitcast <4 x i16> %2 to <4 x half>
1088  %4 = fpext <4 x half> %3 to <4 x float>
1089  ret <4 x float> %4
1090}
1091
1092define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
1093; AVX1-LABEL: load_cvt_8i16_to_8f32:
1094; AVX1:       # BB#0:
1095; AVX1-NEXT:    movswl 6(%rdi), %eax
1096; AVX1-NEXT:    vmovd %eax, %xmm0
1097; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1098; AVX1-NEXT:    movswl 4(%rdi), %eax
1099; AVX1-NEXT:    vmovd %eax, %xmm1
1100; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
1101; AVX1-NEXT:    movswl (%rdi), %eax
1102; AVX1-NEXT:    vmovd %eax, %xmm2
1103; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
1104; AVX1-NEXT:    movswl 2(%rdi), %eax
1105; AVX1-NEXT:    vmovd %eax, %xmm3
1106; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
1107; AVX1-NEXT:    movswl 14(%rdi), %eax
1108; AVX1-NEXT:    vmovd %eax, %xmm4
1109; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
1110; AVX1-NEXT:    movswl 12(%rdi), %eax
1111; AVX1-NEXT:    vmovd %eax, %xmm5
1112; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
1113; AVX1-NEXT:    movswl 8(%rdi), %eax
1114; AVX1-NEXT:    vmovd %eax, %xmm6
1115; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
1116; AVX1-NEXT:    movswl 10(%rdi), %eax
1117; AVX1-NEXT:    vmovd %eax, %xmm7
1118; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
1119; AVX1-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
1120; AVX1-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
1121; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
1122; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
1123; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1124; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1125; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
1126; AVX1-NEXT:    retq
1127;
1128; AVX2-LABEL: load_cvt_8i16_to_8f32:
1129; AVX2:       # BB#0:
1130; AVX2-NEXT:    movswl 6(%rdi), %eax
1131; AVX2-NEXT:    vmovd %eax, %xmm0
1132; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1133; AVX2-NEXT:    movswl 4(%rdi), %eax
1134; AVX2-NEXT:    vmovd %eax, %xmm1
1135; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
1136; AVX2-NEXT:    movswl (%rdi), %eax
1137; AVX2-NEXT:    vmovd %eax, %xmm2
1138; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
1139; AVX2-NEXT:    movswl 2(%rdi), %eax
1140; AVX2-NEXT:    vmovd %eax, %xmm3
1141; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
1142; AVX2-NEXT:    movswl 14(%rdi), %eax
1143; AVX2-NEXT:    vmovd %eax, %xmm4
1144; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
1145; AVX2-NEXT:    movswl 12(%rdi), %eax
1146; AVX2-NEXT:    vmovd %eax, %xmm5
1147; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
1148; AVX2-NEXT:    movswl 8(%rdi), %eax
1149; AVX2-NEXT:    vmovd %eax, %xmm6
1150; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
1151; AVX2-NEXT:    movswl 10(%rdi), %eax
1152; AVX2-NEXT:    vmovd %eax, %xmm7
1153; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
1154; AVX2-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
1155; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
1156; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
1157; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
1158; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1159; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1160; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
1161; AVX2-NEXT:    retq
1162;
1163; AVX512F-LABEL: load_cvt_8i16_to_8f32:
1164; AVX512F:       # BB#0:
1165; AVX512F-NEXT:    movswl 6(%rdi), %eax
1166; AVX512F-NEXT:    vmovd %eax, %xmm0
1167; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
1168; AVX512F-NEXT:    movswl 4(%rdi), %eax
1169; AVX512F-NEXT:    vmovd %eax, %xmm1
1170; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
1171; AVX512F-NEXT:    movswl (%rdi), %eax
1172; AVX512F-NEXT:    vmovd %eax, %xmm2
1173; AVX512F-NEXT:    vcvtph2ps %ymm2, %zmm2
1174; AVX512F-NEXT:    movswl 2(%rdi), %eax
1175; AVX512F-NEXT:    vmovd %eax, %xmm3
1176; AVX512F-NEXT:    vcvtph2ps %ymm3, %zmm3
1177; AVX512F-NEXT:    movswl 14(%rdi), %eax
1178; AVX512F-NEXT:    vmovd %eax, %xmm4
1179; AVX512F-NEXT:    vcvtph2ps %ymm4, %zmm4
1180; AVX512F-NEXT:    movswl 12(%rdi), %eax
1181; AVX512F-NEXT:    vmovd %eax, %xmm5
1182; AVX512F-NEXT:    vcvtph2ps %ymm5, %zmm5
1183; AVX512F-NEXT:    movswl 8(%rdi), %eax
1184; AVX512F-NEXT:    vmovd %eax, %xmm6
1185; AVX512F-NEXT:    vcvtph2ps %ymm6, %zmm6
1186; AVX512F-NEXT:    movswl 10(%rdi), %eax
1187; AVX512F-NEXT:    vmovd %eax, %xmm7
1188; AVX512F-NEXT:    vcvtph2ps %ymm7, %zmm7
1189; AVX512F-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
1190; AVX512F-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
1191; AVX512F-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
1192; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
1193; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1194; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1195; AVX512F-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
1196; AVX512F-NEXT:    retq
1197;
1198; AVX512VL-LABEL: load_cvt_8i16_to_8f32:
1199; AVX512VL:       # BB#0:
1200; AVX512VL-NEXT:    movswl 6(%rdi), %eax
1201; AVX512VL-NEXT:    vmovd %eax, %xmm0
1202; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
1203; AVX512VL-NEXT:    movswl 4(%rdi), %eax
1204; AVX512VL-NEXT:    vmovd %eax, %xmm1
1205; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
1206; AVX512VL-NEXT:    movswl (%rdi), %eax
1207; AVX512VL-NEXT:    vmovd %eax, %xmm2
1208; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
1209; AVX512VL-NEXT:    movswl 2(%rdi), %eax
1210; AVX512VL-NEXT:    vmovd %eax, %xmm3
1211; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
1212; AVX512VL-NEXT:    movswl 14(%rdi), %eax
1213; AVX512VL-NEXT:    vmovd %eax, %xmm4
1214; AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
1215; AVX512VL-NEXT:    movswl 12(%rdi), %eax
1216; AVX512VL-NEXT:    vmovd %eax, %xmm5
1217; AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
1218; AVX512VL-NEXT:    movswl 8(%rdi), %eax
1219; AVX512VL-NEXT:    vmovd %eax, %xmm6
1220; AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
1221; AVX512VL-NEXT:    movswl 10(%rdi), %eax
1222; AVX512VL-NEXT:    vmovd %eax, %xmm7
1223; AVX512VL-NEXT:    vcvtph2ps %xmm7, %xmm7
1224; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
1225; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
1226; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
1227; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
1228; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1229; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1230; AVX512VL-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
1231; AVX512VL-NEXT:    retq
1232  %1 = load <8 x i16>, <8 x i16>* %a0
1233  %2 = bitcast <8 x i16> %1 to <8 x half>
1234  %3 = fpext <8 x half> %2 to <8 x float>
1235  ret <8 x float> %3
1236}
1237
1238define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
1239; AVX1-LABEL: load_cvt_16i16_to_16f32:
1240; AVX1:       # BB#0:
1241; AVX1-NEXT:    movswl 22(%rdi), %eax
1242; AVX1-NEXT:    vmovd %eax, %xmm0
1243; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm8
1244; AVX1-NEXT:    movswl 20(%rdi), %eax
1245; AVX1-NEXT:    vmovd %eax, %xmm0
1246; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm9
1247; AVX1-NEXT:    movswl 16(%rdi), %eax
1248; AVX1-NEXT:    vmovd %eax, %xmm0
1249; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm10
1250; AVX1-NEXT:    movswl 18(%rdi), %eax
1251; AVX1-NEXT:    vmovd %eax, %xmm0
1252; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm11
1253; AVX1-NEXT:    movswl 30(%rdi), %eax
1254; AVX1-NEXT:    vmovd %eax, %xmm0
1255; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm12
1256; AVX1-NEXT:    movswl 28(%rdi), %eax
1257; AVX1-NEXT:    vmovd %eax, %xmm0
1258; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm13
1259; AVX1-NEXT:    movswl 24(%rdi), %eax
1260; AVX1-NEXT:    vmovd %eax, %xmm0
1261; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm14
1262; AVX1-NEXT:    movswl 26(%rdi), %eax
1263; AVX1-NEXT:    vmovd %eax, %xmm0
1264; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm15
1265; AVX1-NEXT:    movswl 6(%rdi), %eax
1266; AVX1-NEXT:    vmovd %eax, %xmm0
1267; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1268; AVX1-NEXT:    movswl 4(%rdi), %eax
1269; AVX1-NEXT:    vmovd %eax, %xmm2
1270; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
1271; AVX1-NEXT:    movswl (%rdi), %eax
1272; AVX1-NEXT:    vmovd %eax, %xmm3
1273; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
1274; AVX1-NEXT:    movswl 2(%rdi), %eax
1275; AVX1-NEXT:    vmovd %eax, %xmm4
1276; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
1277; AVX1-NEXT:    movswl 14(%rdi), %eax
1278; AVX1-NEXT:    vmovd %eax, %xmm5
1279; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
1280; AVX1-NEXT:    movswl 12(%rdi), %eax
1281; AVX1-NEXT:    vmovd %eax, %xmm6
1282; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
1283; AVX1-NEXT:    movswl 8(%rdi), %eax
1284; AVX1-NEXT:    vmovd %eax, %xmm7
1285; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
1286; AVX1-NEXT:    movswl 10(%rdi), %eax
1287; AVX1-NEXT:    vmovd %eax, %xmm1
1288; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
1289; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
1290; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
1291; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
1292; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
1293; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
1294; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
1295; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1296; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
1297; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
1298; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
1299; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
1300; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
1301; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
1302; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1303; AVX1-NEXT:    retq
1304;
1305; AVX2-LABEL: load_cvt_16i16_to_16f32:
1306; AVX2:       # BB#0:
1307; AVX2-NEXT:    movswl 22(%rdi), %eax
1308; AVX2-NEXT:    vmovd %eax, %xmm0
1309; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm8
1310; AVX2-NEXT:    movswl 20(%rdi), %eax
1311; AVX2-NEXT:    vmovd %eax, %xmm0
1312; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm9
1313; AVX2-NEXT:    movswl 16(%rdi), %eax
1314; AVX2-NEXT:    vmovd %eax, %xmm0
1315; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm10
1316; AVX2-NEXT:    movswl 18(%rdi), %eax
1317; AVX2-NEXT:    vmovd %eax, %xmm0
1318; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm11
1319; AVX2-NEXT:    movswl 30(%rdi), %eax
1320; AVX2-NEXT:    vmovd %eax, %xmm0
1321; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm12
1322; AVX2-NEXT:    movswl 28(%rdi), %eax
1323; AVX2-NEXT:    vmovd %eax, %xmm0
1324; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm13
1325; AVX2-NEXT:    movswl 24(%rdi), %eax
1326; AVX2-NEXT:    vmovd %eax, %xmm0
1327; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm14
1328; AVX2-NEXT:    movswl 26(%rdi), %eax
1329; AVX2-NEXT:    vmovd %eax, %xmm0
1330; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm15
1331; AVX2-NEXT:    movswl 6(%rdi), %eax
1332; AVX2-NEXT:    vmovd %eax, %xmm0
1333; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1334; AVX2-NEXT:    movswl 4(%rdi), %eax
1335; AVX2-NEXT:    vmovd %eax, %xmm2
1336; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
1337; AVX2-NEXT:    movswl (%rdi), %eax
1338; AVX2-NEXT:    vmovd %eax, %xmm3
1339; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
1340; AVX2-NEXT:    movswl 2(%rdi), %eax
1341; AVX2-NEXT:    vmovd %eax, %xmm4
1342; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
1343; AVX2-NEXT:    movswl 14(%rdi), %eax
1344; AVX2-NEXT:    vmovd %eax, %xmm5
1345; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
1346; AVX2-NEXT:    movswl 12(%rdi), %eax
1347; AVX2-NEXT:    vmovd %eax, %xmm6
1348; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
1349; AVX2-NEXT:    movswl 8(%rdi), %eax
1350; AVX2-NEXT:    vmovd %eax, %xmm7
1351; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
1352; AVX2-NEXT:    movswl 10(%rdi), %eax
1353; AVX2-NEXT:    vmovd %eax, %xmm1
1354; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
1355; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
1356; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
1357; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
1358; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
1359; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
1360; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
1361; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1362; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
1363; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
1364; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
1365; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
1366; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
1367; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
1368; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1369; AVX2-NEXT:    retq
1370;
1371; AVX512F-LABEL: load_cvt_16i16_to_16f32:
1372; AVX512F:       # BB#0:
1373; AVX512F-NEXT:    movswl 6(%rdi), %eax
1374; AVX512F-NEXT:    vmovd %eax, %xmm0
1375; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm16
1376; AVX512F-NEXT:    movswl 4(%rdi), %eax
1377; AVX512F-NEXT:    vmovd %eax, %xmm1
1378; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm17
1379; AVX512F-NEXT:    movswl (%rdi), %eax
1380; AVX512F-NEXT:    vmovd %eax, %xmm2
1381; AVX512F-NEXT:    vcvtph2ps %ymm2, %zmm2
1382; AVX512F-NEXT:    movswl 2(%rdi), %eax
1383; AVX512F-NEXT:    vmovd %eax, %xmm3
1384; AVX512F-NEXT:    vcvtph2ps %ymm3, %zmm3
1385; AVX512F-NEXT:    movswl 14(%rdi), %eax
1386; AVX512F-NEXT:    vmovd %eax, %xmm4
1387; AVX512F-NEXT:    vcvtph2ps %ymm4, %zmm4
1388; AVX512F-NEXT:    movswl 12(%rdi), %eax
1389; AVX512F-NEXT:    vmovd %eax, %xmm5
1390; AVX512F-NEXT:    vcvtph2ps %ymm5, %zmm5
1391; AVX512F-NEXT:    movswl 8(%rdi), %eax
1392; AVX512F-NEXT:    vmovd %eax, %xmm6
1393; AVX512F-NEXT:    vcvtph2ps %ymm6, %zmm6
1394; AVX512F-NEXT:    movswl 10(%rdi), %eax
1395; AVX512F-NEXT:    vmovd %eax, %xmm7
1396; AVX512F-NEXT:    vcvtph2ps %ymm7, %zmm7
1397; AVX512F-NEXT:    movswl 22(%rdi), %eax
1398; AVX512F-NEXT:    vmovd %eax, %xmm8
1399; AVX512F-NEXT:    vcvtph2ps %ymm8, %zmm8
1400; AVX512F-NEXT:    movswl 20(%rdi), %eax
1401; AVX512F-NEXT:    vmovd %eax, %xmm9
1402; AVX512F-NEXT:    vcvtph2ps %ymm9, %zmm9
1403; AVX512F-NEXT:    movswl 16(%rdi), %eax
1404; AVX512F-NEXT:    vmovd %eax, %xmm10
1405; AVX512F-NEXT:    vcvtph2ps %ymm10, %zmm10
1406; AVX512F-NEXT:    movswl 18(%rdi), %eax
1407; AVX512F-NEXT:    vmovd %eax, %xmm11
1408; AVX512F-NEXT:    vcvtph2ps %ymm11, %zmm11
1409; AVX512F-NEXT:    movswl 30(%rdi), %eax
1410; AVX512F-NEXT:    vmovd %eax, %xmm12
1411; AVX512F-NEXT:    vcvtph2ps %ymm12, %zmm12
1412; AVX512F-NEXT:    movswl 28(%rdi), %eax
1413; AVX512F-NEXT:    vmovd %eax, %xmm13
1414; AVX512F-NEXT:    vcvtph2ps %ymm13, %zmm13
1415; AVX512F-NEXT:    movswl 24(%rdi), %eax
1416; AVX512F-NEXT:    vmovd %eax, %xmm14
1417; AVX512F-NEXT:    vcvtph2ps %ymm14, %zmm14
1418; AVX512F-NEXT:    movswl 26(%rdi), %eax
1419; AVX512F-NEXT:    vmovd %eax, %xmm15
1420; AVX512F-NEXT:    vcvtph2ps %ymm15, %zmm15
1421; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[2,3]
1422; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm13[0],xmm0[3]
1423; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[0]
1424; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[2,3]
1425; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0],xmm1[3]
1426; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
1427; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1428; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[2,3]
1429; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
1430; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
1431; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
1432; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm17[0],xmm2[3]
1433; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm16[0]
1434; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1435; AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
1436; AVX512F-NEXT:    retq
1437;
1438; AVX512VL-LABEL: load_cvt_16i16_to_16f32:
1439; AVX512VL:       # BB#0:
1440; AVX512VL-NEXT:    movswl 6(%rdi), %eax
1441; AVX512VL-NEXT:    vmovd %eax, %xmm0
1442; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm8
1443; AVX512VL-NEXT:    movswl 4(%rdi), %eax
1444; AVX512VL-NEXT:    vmovd %eax, %xmm1
1445; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm9
1446; AVX512VL-NEXT:    movswl (%rdi), %eax
1447; AVX512VL-NEXT:    vmovd %eax, %xmm2
1448; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm10
1449; AVX512VL-NEXT:    movswl 2(%rdi), %eax
1450; AVX512VL-NEXT:    vmovd %eax, %xmm3
1451; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm11
1452; AVX512VL-NEXT:    movswl 14(%rdi), %eax
1453; AVX512VL-NEXT:    vmovd %eax, %xmm4
1454; AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm12
1455; AVX512VL-NEXT:    movswl 12(%rdi), %eax
1456; AVX512VL-NEXT:    vmovd %eax, %xmm5
1457; AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm13
1458; AVX512VL-NEXT:    movswl 8(%rdi), %eax
1459; AVX512VL-NEXT:    vmovd %eax, %xmm6
1460; AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm14
1461; AVX512VL-NEXT:    movswl 10(%rdi), %eax
1462; AVX512VL-NEXT:    vmovd %eax, %xmm7
1463; AVX512VL-NEXT:    vcvtph2ps %xmm7, %xmm15
1464; AVX512VL-NEXT:    movswl 22(%rdi), %eax
1465; AVX512VL-NEXT:    vmovd %eax, %xmm0
1466; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
1467; AVX512VL-NEXT:    movswl 20(%rdi), %eax
1468; AVX512VL-NEXT:    vmovd %eax, %xmm1
1469; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
1470; AVX512VL-NEXT:    movswl 16(%rdi), %eax
1471; AVX512VL-NEXT:    vmovd %eax, %xmm2
1472; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
1473; AVX512VL-NEXT:    movswl 18(%rdi), %eax
1474; AVX512VL-NEXT:    vmovd %eax, %xmm3
1475; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
1476; AVX512VL-NEXT:    movswl 30(%rdi), %eax
1477; AVX512VL-NEXT:    vmovd %eax, %xmm4
1478; AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
1479; AVX512VL-NEXT:    movswl 28(%rdi), %eax
1480; AVX512VL-NEXT:    vmovd %eax, %xmm5
1481; AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
1482; AVX512VL-NEXT:    movswl 24(%rdi), %eax
1483; AVX512VL-NEXT:    vmovd %eax, %xmm6
1484; AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
1485; AVX512VL-NEXT:    movswl 26(%rdi), %eax
1486; AVX512VL-NEXT:    vmovd %eax, %xmm7
1487; AVX512VL-NEXT:    vcvtph2ps %xmm7, %xmm7
1488; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
1489; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
1490; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
1491; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
1492; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1493; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1494; AVX512VL-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
1495; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
1496; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
1497; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
1498; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
1499; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
1500; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
1501; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1502; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
1503; AVX512VL-NEXT:    retq
1504  %1 = load <16 x i16>, <16 x i16>* %a0
1505  %2 = bitcast <16 x i16> %1 to <16 x half>
1506  %3 = fpext <16 x half> %2 to <16 x float>
1507  ret <16 x float> %3
1508}
1509
1510;
1511; Half to Double
1512;
1513
1514define double @cvt_i16_to_f64(i16 %a0) nounwind {
1515; AVX1-LABEL: cvt_i16_to_f64:
1516; AVX1:       # BB#0:
1517; AVX1-NEXT:    movswl %di, %eax
1518; AVX1-NEXT:    vmovd %eax, %xmm0
1519; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1520; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1521; AVX1-NEXT:    retq
1522;
1523; AVX2-LABEL: cvt_i16_to_f64:
1524; AVX2:       # BB#0:
1525; AVX2-NEXT:    movswl %di, %eax
1526; AVX2-NEXT:    vmovd %eax, %xmm0
1527; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1528; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1529; AVX2-NEXT:    retq
1530;
1531; AVX512F-LABEL: cvt_i16_to_f64:
1532; AVX512F:       # BB#0:
1533; AVX512F-NEXT:    movswl %di, %eax
1534; AVX512F-NEXT:    vmovd %eax, %xmm0
1535; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
1536; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1537; AVX512F-NEXT:    retq
1538;
1539; AVX512VL-LABEL: cvt_i16_to_f64:
1540; AVX512VL:       # BB#0:
1541; AVX512VL-NEXT:    movswl %di, %eax
1542; AVX512VL-NEXT:    vmovd %eax, %xmm0
1543; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
1544; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1545; AVX512VL-NEXT:    retq
1546  %1 = bitcast i16 %a0 to half
1547  %2 = fpext half %1 to double
1548  ret double %2
1549}
1550
1551define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
1552; AVX1-LABEL: cvt_2i16_to_2f64:
1553; AVX1:       # BB#0:
1554; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1555; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1556; AVX1-NEXT:    vmovd %xmm0, %eax
1557; AVX1-NEXT:    movswl %ax, %ecx
1558; AVX1-NEXT:    shrl $16, %eax
1559; AVX1-NEXT:    cwtl
1560; AVX1-NEXT:    vmovd %eax, %xmm0
1561; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1562; AVX1-NEXT:    vmovd %ecx, %xmm1
1563; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
1564; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1565; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1566; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1567; AVX1-NEXT:    retq
1568;
1569; AVX2-LABEL: cvt_2i16_to_2f64:
1570; AVX2:       # BB#0:
1571; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1572; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1573; AVX2-NEXT:    vmovd %xmm0, %eax
1574; AVX2-NEXT:    movswl %ax, %ecx
1575; AVX2-NEXT:    shrl $16, %eax
1576; AVX2-NEXT:    cwtl
1577; AVX2-NEXT:    vmovd %eax, %xmm0
1578; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1579; AVX2-NEXT:    vmovd %ecx, %xmm1
1580; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
1581; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1582; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1583; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1584; AVX2-NEXT:    retq
1585;
1586; AVX512F-LABEL: cvt_2i16_to_2f64:
1587; AVX512F:       # BB#0:
1588; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1589; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1590; AVX512F-NEXT:    vmovd %xmm0, %eax
1591; AVX512F-NEXT:    movswl %ax, %ecx
1592; AVX512F-NEXT:    shrl $16, %eax
1593; AVX512F-NEXT:    cwtl
1594; AVX512F-NEXT:    vmovd %eax, %xmm0
1595; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
1596; AVX512F-NEXT:    vmovd %ecx, %xmm1
1597; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
1598; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1599; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1600; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1601; AVX512F-NEXT:    retq
1602;
1603; AVX512VL-LABEL: cvt_2i16_to_2f64:
1604; AVX512VL:       # BB#0:
1605; AVX512VL-NEXT:    vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
1606; AVX512VL-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
1607; AVX512VL-NEXT:    movswl %ax, %ecx
1608; AVX512VL-NEXT:    shrl $16, %eax
1609; AVX512VL-NEXT:    cwtl
1610; AVX512VL-NEXT:    vmovd %eax, %xmm0
1611; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
1612; AVX512VL-NEXT:    vmovd %ecx, %xmm1
1613; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
1614; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1615; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1616; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1617; AVX512VL-NEXT:    retq
1618  %1 = bitcast <2 x i16> %a0 to <2 x half>
1619  %2 = fpext <2 x half> %1 to <2 x double>
1620  ret <2 x double> %2
1621}
1622
1623define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
1624; AVX1-LABEL: cvt_4i16_to_4f64:
1625; AVX1:       # BB#0:
1626; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1627; AVX1-NEXT:    vmovq %xmm0, %rax
1628; AVX1-NEXT:    movq %rax, %rcx
1629; AVX1-NEXT:    movl %eax, %edx
1630; AVX1-NEXT:    movswl %ax, %esi
1631; AVX1-NEXT:    shrq $48, %rax
1632; AVX1-NEXT:    shrq $32, %rcx
1633; AVX1-NEXT:    shrl $16, %edx
1634; AVX1-NEXT:    movswl %dx, %edx
1635; AVX1-NEXT:    vmovd %edx, %xmm0
1636; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1637; AVX1-NEXT:    vmovd %esi, %xmm1
1638; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
1639; AVX1-NEXT:    movswl %cx, %ecx
1640; AVX1-NEXT:    vmovd %ecx, %xmm2
1641; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
1642; AVX1-NEXT:    cwtl
1643; AVX1-NEXT:    vmovd %eax, %xmm3
1644; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
1645; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1646; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1647; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1648; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1649; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1650; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1651; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1652; AVX1-NEXT:    retq
1653;
1654; AVX2-LABEL: cvt_4i16_to_4f64:
1655; AVX2:       # BB#0:
1656; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1657; AVX2-NEXT:    vmovq %xmm0, %rax
1658; AVX2-NEXT:    movq %rax, %rcx
1659; AVX2-NEXT:    movl %eax, %edx
1660; AVX2-NEXT:    movswl %ax, %esi
1661; AVX2-NEXT:    shrq $48, %rax
1662; AVX2-NEXT:    shrq $32, %rcx
1663; AVX2-NEXT:    shrl $16, %edx
1664; AVX2-NEXT:    movswl %dx, %edx
1665; AVX2-NEXT:    vmovd %edx, %xmm0
1666; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1667; AVX2-NEXT:    vmovd %esi, %xmm1
1668; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
1669; AVX2-NEXT:    movswl %cx, %ecx
1670; AVX2-NEXT:    vmovd %ecx, %xmm2
1671; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
1672; AVX2-NEXT:    cwtl
1673; AVX2-NEXT:    vmovd %eax, %xmm3
1674; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
1675; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1676; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1677; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1678; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1679; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1680; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1681; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1682; AVX2-NEXT:    retq
1683;
1684; AVX512F-LABEL: cvt_4i16_to_4f64:
1685; AVX512F:       # BB#0:
1686; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1687; AVX512F-NEXT:    vmovq %xmm0, %rax
1688; AVX512F-NEXT:    movq %rax, %rcx
1689; AVX512F-NEXT:    movl %eax, %edx
1690; AVX512F-NEXT:    movswl %ax, %esi
1691; AVX512F-NEXT:    shrq $48, %rax
1692; AVX512F-NEXT:    shrq $32, %rcx
1693; AVX512F-NEXT:    shrl $16, %edx
1694; AVX512F-NEXT:    movswl %dx, %edx
1695; AVX512F-NEXT:    vmovd %edx, %xmm0
1696; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
1697; AVX512F-NEXT:    vmovd %esi, %xmm1
1698; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
1699; AVX512F-NEXT:    movswl %cx, %ecx
1700; AVX512F-NEXT:    vmovd %ecx, %xmm2
1701; AVX512F-NEXT:    vcvtph2ps %ymm2, %zmm2
1702; AVX512F-NEXT:    cwtl
1703; AVX512F-NEXT:    vmovd %eax, %xmm3
1704; AVX512F-NEXT:    vcvtph2ps %ymm3, %zmm3
1705; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1706; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1707; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1708; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1709; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1710; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1711; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1712; AVX512F-NEXT:    retq
1713;
1714; AVX512VL-LABEL: cvt_4i16_to_4f64:
1715; AVX512VL:       # BB#0:
1716; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
1717; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
1718; AVX512VL-NEXT:    movq %rax, %rcx
1719; AVX512VL-NEXT:    movl %eax, %edx
1720; AVX512VL-NEXT:    movswl %ax, %esi
1721; AVX512VL-NEXT:    shrq $48, %rax
1722; AVX512VL-NEXT:    shrq $32, %rcx
1723; AVX512VL-NEXT:    shrl $16, %edx
1724; AVX512VL-NEXT:    movswl %dx, %edx
1725; AVX512VL-NEXT:    vmovd %edx, %xmm0
1726; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
1727; AVX512VL-NEXT:    vmovd %esi, %xmm1
1728; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
1729; AVX512VL-NEXT:    movswl %cx, %ecx
1730; AVX512VL-NEXT:    vmovd %ecx, %xmm2
1731; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
1732; AVX512VL-NEXT:    cwtl
1733; AVX512VL-NEXT:    vmovd %eax, %xmm3
1734; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
1735; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1736; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1737; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1738; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1739; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1740; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1741; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1742; AVX512VL-NEXT:    retq
1743  %1 = bitcast <4 x i16> %a0 to <4 x half>
1744  %2 = fpext <4 x half> %1 to <4 x double>
1745  ret <4 x double> %2
1746}
1747
1748define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
1749; AVX1-LABEL: cvt_8i16_to_2f64:
1750; AVX1:       # BB#0:
1751; AVX1-NEXT:    vmovd %xmm0, %eax
1752; AVX1-NEXT:    movswl %ax, %ecx
1753; AVX1-NEXT:    shrl $16, %eax
1754; AVX1-NEXT:    cwtl
1755; AVX1-NEXT:    vmovd %eax, %xmm0
1756; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1757; AVX1-NEXT:    vmovd %ecx, %xmm1
1758; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
1759; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1760; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1761; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1762; AVX1-NEXT:    retq
1763;
1764; AVX2-LABEL: cvt_8i16_to_2f64:
1765; AVX2:       # BB#0:
1766; AVX2-NEXT:    vmovd %xmm0, %eax
1767; AVX2-NEXT:    movswl %ax, %ecx
1768; AVX2-NEXT:    shrl $16, %eax
1769; AVX2-NEXT:    cwtl
1770; AVX2-NEXT:    vmovd %eax, %xmm0
1771; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1772; AVX2-NEXT:    vmovd %ecx, %xmm1
1773; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
1774; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1775; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1776; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1777; AVX2-NEXT:    retq
1778;
1779; AVX512F-LABEL: cvt_8i16_to_2f64:
1780; AVX512F:       # BB#0:
1781; AVX512F-NEXT:    vmovd %xmm0, %eax
1782; AVX512F-NEXT:    movswl %ax, %ecx
1783; AVX512F-NEXT:    shrl $16, %eax
1784; AVX512F-NEXT:    cwtl
1785; AVX512F-NEXT:    vmovd %eax, %xmm0
1786; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
1787; AVX512F-NEXT:    vmovd %ecx, %xmm1
1788; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
1789; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1790; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1791; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1792; AVX512F-NEXT:    retq
1793;
1794; AVX512VL-LABEL: cvt_8i16_to_2f64:
1795; AVX512VL:       # BB#0:
1796; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1797; AVX512VL-NEXT:    vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
1798; AVX512VL-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
1799; AVX512VL-NEXT:    movswl %ax, %ecx
1800; AVX512VL-NEXT:    shrl $16, %eax
1801; AVX512VL-NEXT:    cwtl
1802; AVX512VL-NEXT:    vmovd %eax, %xmm0
1803; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
1804; AVX512VL-NEXT:    vmovd %ecx, %xmm1
1805; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
1806; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1807; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1808; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1809; AVX512VL-NEXT:    retq
1810  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
1811  %2 = bitcast <2 x i16> %1 to <2 x half>
1812  %3 = fpext <2 x half> %2 to <2 x double>
1813  ret <2 x double> %3
1814}
1815
1816define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
1817; AVX1-LABEL: cvt_8i16_to_4f64:
1818; AVX1:       # BB#0:
1819; AVX1-NEXT:    vmovq %xmm0, %rax
1820; AVX1-NEXT:    movq %rax, %rcx
1821; AVX1-NEXT:    movl %eax, %edx
1822; AVX1-NEXT:    movswl %ax, %esi
1823; AVX1-NEXT:    shrq $48, %rax
1824; AVX1-NEXT:    shrq $32, %rcx
1825; AVX1-NEXT:    shrl $16, %edx
1826; AVX1-NEXT:    movswl %dx, %edx
1827; AVX1-NEXT:    vmovd %edx, %xmm0
1828; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1829; AVX1-NEXT:    vmovd %esi, %xmm1
1830; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
1831; AVX1-NEXT:    movswl %cx, %ecx
1832; AVX1-NEXT:    vmovd %ecx, %xmm2
1833; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
1834; AVX1-NEXT:    cwtl
1835; AVX1-NEXT:    vmovd %eax, %xmm3
1836; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
1837; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1838; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1839; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1840; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1841; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1842; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1843; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1844; AVX1-NEXT:    retq
1845;
1846; AVX2-LABEL: cvt_8i16_to_4f64:
1847; AVX2:       # BB#0:
1848; AVX2-NEXT:    vmovq %xmm0, %rax
1849; AVX2-NEXT:    movq %rax, %rcx
1850; AVX2-NEXT:    movl %eax, %edx
1851; AVX2-NEXT:    movswl %ax, %esi
1852; AVX2-NEXT:    shrq $48, %rax
1853; AVX2-NEXT:    shrq $32, %rcx
1854; AVX2-NEXT:    shrl $16, %edx
1855; AVX2-NEXT:    movswl %dx, %edx
1856; AVX2-NEXT:    vmovd %edx, %xmm0
1857; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1858; AVX2-NEXT:    vmovd %esi, %xmm1
1859; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
1860; AVX2-NEXT:    movswl %cx, %ecx
1861; AVX2-NEXT:    vmovd %ecx, %xmm2
1862; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
1863; AVX2-NEXT:    cwtl
1864; AVX2-NEXT:    vmovd %eax, %xmm3
1865; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
1866; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1867; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1868; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1869; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1870; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1871; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1872; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1873; AVX2-NEXT:    retq
1874;
1875; AVX512F-LABEL: cvt_8i16_to_4f64:
1876; AVX512F:       # BB#0:
1877; AVX512F-NEXT:    vmovq %xmm0, %rax
1878; AVX512F-NEXT:    movq %rax, %rcx
1879; AVX512F-NEXT:    movl %eax, %edx
1880; AVX512F-NEXT:    movswl %ax, %esi
1881; AVX512F-NEXT:    shrq $48, %rax
1882; AVX512F-NEXT:    shrq $32, %rcx
1883; AVX512F-NEXT:    shrl $16, %edx
1884; AVX512F-NEXT:    movswl %dx, %edx
1885; AVX512F-NEXT:    vmovd %edx, %xmm0
1886; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
1887; AVX512F-NEXT:    vmovd %esi, %xmm1
1888; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
1889; AVX512F-NEXT:    movswl %cx, %ecx
1890; AVX512F-NEXT:    vmovd %ecx, %xmm2
1891; AVX512F-NEXT:    vcvtph2ps %ymm2, %zmm2
1892; AVX512F-NEXT:    cwtl
1893; AVX512F-NEXT:    vmovd %eax, %xmm3
1894; AVX512F-NEXT:    vcvtph2ps %ymm3, %zmm3
1895; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1896; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1897; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1898; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1899; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1900; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1901; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1902; AVX512F-NEXT:    retq
1903;
1904; AVX512VL-LABEL: cvt_8i16_to_4f64:
1905; AVX512VL:       # BB#0:
1906; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1907; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
1908; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
1909; AVX512VL-NEXT:    movq %rax, %rcx
1910; AVX512VL-NEXT:    movl %eax, %edx
1911; AVX512VL-NEXT:    movswl %ax, %esi
1912; AVX512VL-NEXT:    shrq $48, %rax
1913; AVX512VL-NEXT:    shrq $32, %rcx
1914; AVX512VL-NEXT:    shrl $16, %edx
1915; AVX512VL-NEXT:    movswl %dx, %edx
1916; AVX512VL-NEXT:    vmovd %edx, %xmm0
1917; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
1918; AVX512VL-NEXT:    vmovd %esi, %xmm1
1919; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
1920; AVX512VL-NEXT:    movswl %cx, %ecx
1921; AVX512VL-NEXT:    vmovd %ecx, %xmm2
1922; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
1923; AVX512VL-NEXT:    cwtl
1924; AVX512VL-NEXT:    vmovd %eax, %xmm3
1925; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
1926; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1927; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1928; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1929; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1930; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1931; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1932; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1933; AVX512VL-NEXT:    retq
1934  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1935  %2 = bitcast <4 x i16> %1 to <4 x half>
1936  %3 = fpext <4 x half> %2 to <4 x double>
1937  ret <4 x double> %3
1938}
1939
1940define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
1941; AVX1-LABEL: cvt_8i16_to_8f64:
1942; AVX1:       # BB#0:
1943; AVX1-NEXT:    vmovq %xmm0, %rdx
1944; AVX1-NEXT:    movq %rdx, %r9
1945; AVX1-NEXT:    movl %edx, %r10d
1946; AVX1-NEXT:    movswl %dx, %r8d
1947; AVX1-NEXT:    shrq $48, %rdx
1948; AVX1-NEXT:    shrq $32, %r9
1949; AVX1-NEXT:    shrl $16, %r10d
1950; AVX1-NEXT:    vpextrq $1, %xmm0, %rdi
1951; AVX1-NEXT:    movq %rdi, %rsi
1952; AVX1-NEXT:    movl %edi, %eax
1953; AVX1-NEXT:    movswl %di, %ecx
1954; AVX1-NEXT:    shrq $48, %rdi
1955; AVX1-NEXT:    shrq $32, %rsi
1956; AVX1-NEXT:    shrl $16, %eax
1957; AVX1-NEXT:    cwtl
1958; AVX1-NEXT:    vmovd %eax, %xmm0
1959; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm1
1960; AVX1-NEXT:    vmovd %ecx, %xmm0
1961; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm2
1962; AVX1-NEXT:    movswl %si, %eax
1963; AVX1-NEXT:    vmovd %eax, %xmm0
1964; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm3
1965; AVX1-NEXT:    movswl %di, %eax
1966; AVX1-NEXT:    vmovd %eax, %xmm0
1967; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm4
1968; AVX1-NEXT:    movswl %r10w, %eax
1969; AVX1-NEXT:    vmovd %eax, %xmm0
1970; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1971; AVX1-NEXT:    vmovd %r8d, %xmm5
1972; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
1973; AVX1-NEXT:    movswl %r9w, %eax
1974; AVX1-NEXT:    vmovd %eax, %xmm6
1975; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
1976; AVX1-NEXT:    movswl %dx, %eax
1977; AVX1-NEXT:    vmovd %eax, %xmm7
1978; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
1979; AVX1-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
1980; AVX1-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
1981; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1982; AVX1-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
1983; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1984; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
1985; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
1986; AVX1-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
1987; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1988; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1989; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1990; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1991; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1992; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
1993; AVX1-NEXT:    retq
1994;
1995; AVX2-LABEL: cvt_8i16_to_8f64:
1996; AVX2:       # BB#0:
1997; AVX2-NEXT:    vmovq %xmm0, %rdx
1998; AVX2-NEXT:    movq %rdx, %r9
1999; AVX2-NEXT:    movl %edx, %r10d
2000; AVX2-NEXT:    movswl %dx, %r8d
2001; AVX2-NEXT:    shrq $48, %rdx
2002; AVX2-NEXT:    shrq $32, %r9
2003; AVX2-NEXT:    shrl $16, %r10d
2004; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
2005; AVX2-NEXT:    movq %rdi, %rsi
2006; AVX2-NEXT:    movl %edi, %eax
2007; AVX2-NEXT:    movswl %di, %ecx
2008; AVX2-NEXT:    shrq $48, %rdi
2009; AVX2-NEXT:    shrq $32, %rsi
2010; AVX2-NEXT:    shrl $16, %eax
2011; AVX2-NEXT:    cwtl
2012; AVX2-NEXT:    vmovd %eax, %xmm0
2013; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm1
2014; AVX2-NEXT:    vmovd %ecx, %xmm0
2015; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm2
2016; AVX2-NEXT:    movswl %si, %eax
2017; AVX2-NEXT:    vmovd %eax, %xmm0
2018; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm3
2019; AVX2-NEXT:    movswl %di, %eax
2020; AVX2-NEXT:    vmovd %eax, %xmm0
2021; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm4
2022; AVX2-NEXT:    movswl %r10w, %eax
2023; AVX2-NEXT:    vmovd %eax, %xmm0
2024; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
2025; AVX2-NEXT:    vmovd %r8d, %xmm5
2026; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
2027; AVX2-NEXT:    movswl %r9w, %eax
2028; AVX2-NEXT:    vmovd %eax, %xmm6
2029; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
2030; AVX2-NEXT:    movswl %dx, %eax
2031; AVX2-NEXT:    vmovd %eax, %xmm7
2032; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
2033; AVX2-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
2034; AVX2-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
2035; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2036; AVX2-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
2037; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2038; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
2039; AVX2-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
2040; AVX2-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
2041; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2042; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
2043; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2044; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2045; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
2046; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
2047; AVX2-NEXT:    retq
2048;
2049; AVX512F-LABEL: cvt_8i16_to_8f64:
2050; AVX512F:       # BB#0:
2051; AVX512F-NEXT:    vpextrq $1, %xmm0, %rdx
2052; AVX512F-NEXT:    movq %rdx, %r8
2053; AVX512F-NEXT:    movl %edx, %r9d
2054; AVX512F-NEXT:    movswl %dx, %r10d
2055; AVX512F-NEXT:    shrq $48, %rdx
2056; AVX512F-NEXT:    shrq $32, %r8
2057; AVX512F-NEXT:    shrl $16, %r9d
2058; AVX512F-NEXT:    vmovq %xmm0, %rdi
2059; AVX512F-NEXT:    movq %rdi, %rax
2060; AVX512F-NEXT:    movl %edi, %ecx
2061; AVX512F-NEXT:    movswl %di, %esi
2062; AVX512F-NEXT:    shrq $48, %rdi
2063; AVX512F-NEXT:    shrq $32, %rax
2064; AVX512F-NEXT:    shrl $16, %ecx
2065; AVX512F-NEXT:    movswl %cx, %ecx
2066; AVX512F-NEXT:    vmovd %ecx, %xmm0
2067; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
2068; AVX512F-NEXT:    vmovd %esi, %xmm1
2069; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
2070; AVX512F-NEXT:    cwtl
2071; AVX512F-NEXT:    vmovd %eax, %xmm2
2072; AVX512F-NEXT:    vcvtph2ps %ymm2, %zmm2
2073; AVX512F-NEXT:    movswl %di, %eax
2074; AVX512F-NEXT:    vmovd %eax, %xmm3
2075; AVX512F-NEXT:    vcvtph2ps %ymm3, %zmm3
2076; AVX512F-NEXT:    movswl %r9w, %eax
2077; AVX512F-NEXT:    vmovd %eax, %xmm4
2078; AVX512F-NEXT:    vcvtph2ps %ymm4, %zmm4
2079; AVX512F-NEXT:    vmovd %r10d, %xmm5
2080; AVX512F-NEXT:    vcvtph2ps %ymm5, %zmm5
2081; AVX512F-NEXT:    movswl %r8w, %eax
2082; AVX512F-NEXT:    vmovd %eax, %xmm6
2083; AVX512F-NEXT:    vcvtph2ps %ymm6, %zmm6
2084; AVX512F-NEXT:    movswl %dx, %eax
2085; AVX512F-NEXT:    vmovd %eax, %xmm7
2086; AVX512F-NEXT:    vcvtph2ps %ymm7, %zmm7
2087; AVX512F-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
2088; AVX512F-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
2089; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2090; AVX512F-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
2091; AVX512F-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
2092; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0]
2093; AVX512F-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
2094; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2095; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2096; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2097; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2098; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2099; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2100; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2101; AVX512F-NEXT:    vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
2102; AVX512F-NEXT:    retq
2103;
2104; AVX512VL-LABEL: cvt_8i16_to_8f64:
2105; AVX512VL:       # BB#0:
2106; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rdx
2107; AVX512VL-NEXT:    movq %rdx, %r8
2108; AVX512VL-NEXT:    movl %edx, %r10d
2109; AVX512VL-NEXT:    movswl %dx, %r9d
2110; AVX512VL-NEXT:    shrq $48, %rdx
2111; AVX512VL-NEXT:    shrq $32, %r8
2112; AVX512VL-NEXT:    shrl $16, %r10d
2113; AVX512VL-NEXT:    vmovq %xmm0, %rdi
2114; AVX512VL-NEXT:    movq %rdi, %rax
2115; AVX512VL-NEXT:    movl %edi, %esi
2116; AVX512VL-NEXT:    movswl %di, %ecx
2117; AVX512VL-NEXT:    shrq $48, %rdi
2118; AVX512VL-NEXT:    shrq $32, %rax
2119; AVX512VL-NEXT:    shrl $16, %esi
2120; AVX512VL-NEXT:    movswl %si, %esi
2121; AVX512VL-NEXT:    vmovd %esi, %xmm0
2122; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
2123; AVX512VL-NEXT:    vmovd %ecx, %xmm1
2124; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
2125; AVX512VL-NEXT:    cwtl
2126; AVX512VL-NEXT:    vmovd %eax, %xmm2
2127; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
2128; AVX512VL-NEXT:    movswl %di, %eax
2129; AVX512VL-NEXT:    vmovd %eax, %xmm3
2130; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
2131; AVX512VL-NEXT:    movswl %r10w, %eax
2132; AVX512VL-NEXT:    vmovd %eax, %xmm4
2133; AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
2134; AVX512VL-NEXT:    vmovd %r9d, %xmm5
2135; AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
2136; AVX512VL-NEXT:    movswl %r8w, %eax
2137; AVX512VL-NEXT:    vmovd %eax, %xmm6
2138; AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
2139; AVX512VL-NEXT:    movswl %dx, %eax
2140; AVX512VL-NEXT:    vmovd %eax, %xmm7
2141; AVX512VL-NEXT:    vcvtph2ps %xmm7, %xmm7
2142; AVX512VL-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
2143; AVX512VL-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
2144; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2145; AVX512VL-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
2146; AVX512VL-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
2147; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0]
2148; AVX512VL-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
2149; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2150; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2151; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2152; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2153; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2154; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2155; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2156; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
2157; AVX512VL-NEXT:    retq
2158  %1 = bitcast <8 x i16> %a0 to <8 x half>
2159  %2 = fpext <8 x half> %1 to <8 x double>
2160  ret <8 x double> %2
2161}
2162
2163;
2164; Half to Double (Load)
2165;
2166
2167define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
2168; AVX1-LABEL: load_cvt_i16_to_f64:
2169; AVX1:       # BB#0:
2170; AVX1-NEXT:    movswl (%rdi), %eax
2171; AVX1-NEXT:    vmovd %eax, %xmm0
2172; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
2173; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2174; AVX1-NEXT:    retq
2175;
2176; AVX2-LABEL: load_cvt_i16_to_f64:
2177; AVX2:       # BB#0:
2178; AVX2-NEXT:    movswl (%rdi), %eax
2179; AVX2-NEXT:    vmovd %eax, %xmm0
2180; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
2181; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2182; AVX2-NEXT:    retq
2183;
2184; AVX512F-LABEL: load_cvt_i16_to_f64:
2185; AVX512F:       # BB#0:
2186; AVX512F-NEXT:    movswl (%rdi), %eax
2187; AVX512F-NEXT:    vmovd %eax, %xmm0
2188; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
2189; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2190; AVX512F-NEXT:    retq
2191;
2192; AVX512VL-LABEL: load_cvt_i16_to_f64:
2193; AVX512VL:       # BB#0:
2194; AVX512VL-NEXT:    movswl (%rdi), %eax
2195; AVX512VL-NEXT:    vmovd %eax, %xmm0
2196; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
2197; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2198; AVX512VL-NEXT:    retq
2199  %1 = load i16, i16* %a0
2200  %2 = bitcast i16 %1 to half
2201  %3 = fpext half %2 to double
2202  ret double %3
2203}
2204
2205define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
2206; AVX1-LABEL: load_cvt_2i16_to_2f64:
2207; AVX1:       # BB#0:
2208; AVX1-NEXT:    movswl (%rdi), %eax
2209; AVX1-NEXT:    vmovd %eax, %xmm0
2210; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
2211; AVX1-NEXT:    movswl 2(%rdi), %eax
2212; AVX1-NEXT:    vmovd %eax, %xmm1
2213; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
2214; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2215; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2216; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2217; AVX1-NEXT:    retq
2218;
2219; AVX2-LABEL: load_cvt_2i16_to_2f64:
2220; AVX2:       # BB#0:
2221; AVX2-NEXT:    movswl (%rdi), %eax
2222; AVX2-NEXT:    vmovd %eax, %xmm0
2223; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
2224; AVX2-NEXT:    movswl 2(%rdi), %eax
2225; AVX2-NEXT:    vmovd %eax, %xmm1
2226; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
2227; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2228; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2229; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2230; AVX2-NEXT:    retq
2231;
2232; AVX512F-LABEL: load_cvt_2i16_to_2f64:
2233; AVX512F:       # BB#0:
2234; AVX512F-NEXT:    movswl (%rdi), %eax
2235; AVX512F-NEXT:    vmovd %eax, %xmm0
2236; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
2237; AVX512F-NEXT:    movswl 2(%rdi), %eax
2238; AVX512F-NEXT:    vmovd %eax, %xmm1
2239; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
2240; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2241; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2242; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2243; AVX512F-NEXT:    retq
2244;
2245; AVX512VL-LABEL: load_cvt_2i16_to_2f64:
2246; AVX512VL:       # BB#0:
2247; AVX512VL-NEXT:    movswl (%rdi), %eax
2248; AVX512VL-NEXT:    vmovd %eax, %xmm0
2249; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
2250; AVX512VL-NEXT:    movswl 2(%rdi), %eax
2251; AVX512VL-NEXT:    vmovd %eax, %xmm1
2252; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
2253; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2254; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2255; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2256; AVX512VL-NEXT:    retq
2257  %1 = load <2 x i16>, <2 x i16>* %a0
2258  %2 = bitcast <2 x i16> %1 to <2 x half>
2259  %3 = fpext <2 x half> %2 to <2 x double>
2260  ret <2 x double> %3
2261}
2262
2263define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
2264; AVX1-LABEL: load_cvt_4i16_to_4f64:
2265; AVX1:       # BB#0:
2266; AVX1-NEXT:    movswl (%rdi), %eax
2267; AVX1-NEXT:    vmovd %eax, %xmm0
2268; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
2269; AVX1-NEXT:    movswl 2(%rdi), %eax
2270; AVX1-NEXT:    vmovd %eax, %xmm1
2271; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
2272; AVX1-NEXT:    movswl 4(%rdi), %eax
2273; AVX1-NEXT:    vmovd %eax, %xmm2
2274; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
2275; AVX1-NEXT:    movswl 6(%rdi), %eax
2276; AVX1-NEXT:    vmovd %eax, %xmm3
2277; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
2278; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2279; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2280; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2281; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2282; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2283; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2284; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2285; AVX1-NEXT:    retq
2286;
2287; AVX2-LABEL: load_cvt_4i16_to_4f64:
2288; AVX2:       # BB#0:
2289; AVX2-NEXT:    movswl (%rdi), %eax
2290; AVX2-NEXT:    vmovd %eax, %xmm0
2291; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
2292; AVX2-NEXT:    movswl 2(%rdi), %eax
2293; AVX2-NEXT:    vmovd %eax, %xmm1
2294; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
2295; AVX2-NEXT:    movswl 4(%rdi), %eax
2296; AVX2-NEXT:    vmovd %eax, %xmm2
2297; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
2298; AVX2-NEXT:    movswl 6(%rdi), %eax
2299; AVX2-NEXT:    vmovd %eax, %xmm3
2300; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
2301; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2302; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2303; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2304; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2305; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2306; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2307; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2308; AVX2-NEXT:    retq
2309;
2310; AVX512F-LABEL: load_cvt_4i16_to_4f64:
2311; AVX512F:       # BB#0:
2312; AVX512F-NEXT:    movswl (%rdi), %eax
2313; AVX512F-NEXT:    vmovd %eax, %xmm0
2314; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
2315; AVX512F-NEXT:    movswl 2(%rdi), %eax
2316; AVX512F-NEXT:    vmovd %eax, %xmm1
2317; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
2318; AVX512F-NEXT:    movswl 4(%rdi), %eax
2319; AVX512F-NEXT:    vmovd %eax, %xmm2
2320; AVX512F-NEXT:    vcvtph2ps %ymm2, %zmm2
2321; AVX512F-NEXT:    movswl 6(%rdi), %eax
2322; AVX512F-NEXT:    vmovd %eax, %xmm3
2323; AVX512F-NEXT:    vcvtph2ps %ymm3, %zmm3
2324; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2325; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2326; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2327; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2328; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2329; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2330; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2331; AVX512F-NEXT:    retq
2332;
2333; AVX512VL-LABEL: load_cvt_4i16_to_4f64:
2334; AVX512VL:       # BB#0:
2335; AVX512VL-NEXT:    movswl (%rdi), %eax
2336; AVX512VL-NEXT:    vmovd %eax, %xmm0
2337; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
2338; AVX512VL-NEXT:    movswl 2(%rdi), %eax
2339; AVX512VL-NEXT:    vmovd %eax, %xmm1
2340; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
2341; AVX512VL-NEXT:    movswl 4(%rdi), %eax
2342; AVX512VL-NEXT:    vmovd %eax, %xmm2
2343; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
2344; AVX512VL-NEXT:    movswl 6(%rdi), %eax
2345; AVX512VL-NEXT:    vmovd %eax, %xmm3
2346; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
2347; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2348; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2349; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2350; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2351; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2352; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2353; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2354; AVX512VL-NEXT:    retq
2355  %1 = load <4 x i16>, <4 x i16>* %a0
2356  %2 = bitcast <4 x i16> %1 to <4 x half>
2357  %3 = fpext <4 x half> %2 to <4 x double>
2358  ret <4 x double> %3
2359}
2360
2361define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
2362; AVX1-LABEL: load_cvt_8i16_to_4f64:
2363; AVX1:       # BB#0:
2364; AVX1-NEXT:    movq (%rdi), %rax
2365; AVX1-NEXT:    movq %rax, %rcx
2366; AVX1-NEXT:    movl %eax, %edx
2367; AVX1-NEXT:    movswl %ax, %esi
2368; AVX1-NEXT:    shrq $48, %rax
2369; AVX1-NEXT:    shrq $32, %rcx
2370; AVX1-NEXT:    shrl $16, %edx
2371; AVX1-NEXT:    movswl %dx, %edx
2372; AVX1-NEXT:    vmovd %edx, %xmm0
2373; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
2374; AVX1-NEXT:    vmovd %esi, %xmm1
2375; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
2376; AVX1-NEXT:    movswl %cx, %ecx
2377; AVX1-NEXT:    vmovd %ecx, %xmm2
2378; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
2379; AVX1-NEXT:    cwtl
2380; AVX1-NEXT:    vmovd %eax, %xmm3
2381; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
2382; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2383; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2384; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2385; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2386; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2387; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2388; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2389; AVX1-NEXT:    retq
2390;
2391; AVX2-LABEL: load_cvt_8i16_to_4f64:
2392; AVX2:       # BB#0:
2393; AVX2-NEXT:    movq (%rdi), %rax
2394; AVX2-NEXT:    movq %rax, %rcx
2395; AVX2-NEXT:    movl %eax, %edx
2396; AVX2-NEXT:    movswl %ax, %esi
2397; AVX2-NEXT:    shrq $48, %rax
2398; AVX2-NEXT:    shrq $32, %rcx
2399; AVX2-NEXT:    shrl $16, %edx
2400; AVX2-NEXT:    movswl %dx, %edx
2401; AVX2-NEXT:    vmovd %edx, %xmm0
2402; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
2403; AVX2-NEXT:    vmovd %esi, %xmm1
2404; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
2405; AVX2-NEXT:    movswl %cx, %ecx
2406; AVX2-NEXT:    vmovd %ecx, %xmm2
2407; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
2408; AVX2-NEXT:    cwtl
2409; AVX2-NEXT:    vmovd %eax, %xmm3
2410; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
2411; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2412; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2413; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2414; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2415; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2416; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2417; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2418; AVX2-NEXT:    retq
2419;
2420; AVX512F-LABEL: load_cvt_8i16_to_4f64:
2421; AVX512F:       # BB#0:
2422; AVX512F-NEXT:    movq (%rdi), %rax
2423; AVX512F-NEXT:    movq %rax, %rcx
2424; AVX512F-NEXT:    movl %eax, %edx
2425; AVX512F-NEXT:    movswl %ax, %esi
2426; AVX512F-NEXT:    shrq $48, %rax
2427; AVX512F-NEXT:    shrq $32, %rcx
2428; AVX512F-NEXT:    shrl $16, %edx
2429; AVX512F-NEXT:    movswl %dx, %edx
2430; AVX512F-NEXT:    vmovd %edx, %xmm0
2431; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
2432; AVX512F-NEXT:    vmovd %esi, %xmm1
2433; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
2434; AVX512F-NEXT:    movswl %cx, %ecx
2435; AVX512F-NEXT:    vmovd %ecx, %xmm2
2436; AVX512F-NEXT:    vcvtph2ps %ymm2, %zmm2
2437; AVX512F-NEXT:    cwtl
2438; AVX512F-NEXT:    vmovd %eax, %xmm3
2439; AVX512F-NEXT:    vcvtph2ps %ymm3, %zmm3
2440; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2441; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2442; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2443; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2444; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2445; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2446; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2447; AVX512F-NEXT:    retq
2448;
2449; AVX512VL-LABEL: load_cvt_8i16_to_4f64:
2450; AVX512VL:       # BB#0:
2451; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2452; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
2453; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
2454; AVX512VL-NEXT:    movq %rax, %rcx
2455; AVX512VL-NEXT:    movl %eax, %edx
2456; AVX512VL-NEXT:    movswl %ax, %esi
2457; AVX512VL-NEXT:    shrq $48, %rax
2458; AVX512VL-NEXT:    shrq $32, %rcx
2459; AVX512VL-NEXT:    shrl $16, %edx
2460; AVX512VL-NEXT:    movswl %dx, %edx
2461; AVX512VL-NEXT:    vmovd %edx, %xmm0
2462; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
2463; AVX512VL-NEXT:    vmovd %esi, %xmm1
2464; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
2465; AVX512VL-NEXT:    movswl %cx, %ecx
2466; AVX512VL-NEXT:    vmovd %ecx, %xmm2
2467; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
2468; AVX512VL-NEXT:    cwtl
2469; AVX512VL-NEXT:    vmovd %eax, %xmm3
2470; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
2471; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2472; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2473; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2474; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2475; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2476; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2477; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2478; AVX512VL-NEXT:    retq
2479  %1 = load <8 x i16>, <8 x i16>* %a0
2480  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2481  %3 = bitcast <4 x i16> %2 to <4 x half>
2482  %4 = fpext <4 x half> %3 to <4 x double>
2483  ret <4 x double> %4
2484}
2485
2486define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
2487; AVX1-LABEL: load_cvt_8i16_to_8f64:
2488; AVX1:       # BB#0:
2489; AVX1-NEXT:    movswl 8(%rdi), %eax
2490; AVX1-NEXT:    vmovd %eax, %xmm0
2491; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm1
2492; AVX1-NEXT:    movswl 10(%rdi), %eax
2493; AVX1-NEXT:    vmovd %eax, %xmm0
2494; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm2
2495; AVX1-NEXT:    movswl 12(%rdi), %eax
2496; AVX1-NEXT:    vmovd %eax, %xmm0
2497; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm3
2498; AVX1-NEXT:    movswl 14(%rdi), %eax
2499; AVX1-NEXT:    vmovd %eax, %xmm0
2500; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm4
2501; AVX1-NEXT:    movswl (%rdi), %eax
2502; AVX1-NEXT:    vmovd %eax, %xmm0
2503; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
2504; AVX1-NEXT:    movswl 2(%rdi), %eax
2505; AVX1-NEXT:    vmovd %eax, %xmm5
2506; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
2507; AVX1-NEXT:    movswl 4(%rdi), %eax
2508; AVX1-NEXT:    vmovd %eax, %xmm6
2509; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
2510; AVX1-NEXT:    movswl 6(%rdi), %eax
2511; AVX1-NEXT:    vmovd %eax, %xmm7
2512; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
2513; AVX1-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
2514; AVX1-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
2515; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2516; AVX1-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
2517; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2518; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
2519; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
2520; AVX1-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
2521; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2522; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
2523; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2524; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2525; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2526; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
2527; AVX1-NEXT:    retq
2528;
2529; AVX2-LABEL: load_cvt_8i16_to_8f64:
2530; AVX2:       # BB#0:
2531; AVX2-NEXT:    movswl 8(%rdi), %eax
2532; AVX2-NEXT:    vmovd %eax, %xmm0
2533; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm1
2534; AVX2-NEXT:    movswl 10(%rdi), %eax
2535; AVX2-NEXT:    vmovd %eax, %xmm0
2536; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm2
2537; AVX2-NEXT:    movswl 12(%rdi), %eax
2538; AVX2-NEXT:    vmovd %eax, %xmm0
2539; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm3
2540; AVX2-NEXT:    movswl 14(%rdi), %eax
2541; AVX2-NEXT:    vmovd %eax, %xmm0
2542; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm4
2543; AVX2-NEXT:    movswl (%rdi), %eax
2544; AVX2-NEXT:    vmovd %eax, %xmm0
2545; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
2546; AVX2-NEXT:    movswl 2(%rdi), %eax
2547; AVX2-NEXT:    vmovd %eax, %xmm5
2548; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
2549; AVX2-NEXT:    movswl 4(%rdi), %eax
2550; AVX2-NEXT:    vmovd %eax, %xmm6
2551; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
2552; AVX2-NEXT:    movswl 6(%rdi), %eax
2553; AVX2-NEXT:    vmovd %eax, %xmm7
2554; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
2555; AVX2-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
2556; AVX2-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
2557; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2558; AVX2-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
2559; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2560; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
2561; AVX2-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
2562; AVX2-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
2563; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2564; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
2565; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2566; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2567; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2568; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
2569; AVX2-NEXT:    retq
2570;
2571; AVX512F-LABEL: load_cvt_8i16_to_8f64:
2572; AVX512F:       # BB#0:
2573; AVX512F-NEXT:    movswl (%rdi), %eax
2574; AVX512F-NEXT:    vmovd %eax, %xmm0
2575; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
2576; AVX512F-NEXT:    movswl 2(%rdi), %eax
2577; AVX512F-NEXT:    vmovd %eax, %xmm1
2578; AVX512F-NEXT:    vcvtph2ps %ymm1, %zmm1
2579; AVX512F-NEXT:    movswl 4(%rdi), %eax
2580; AVX512F-NEXT:    vmovd %eax, %xmm2
2581; AVX512F-NEXT:    vcvtph2ps %ymm2, %zmm2
2582; AVX512F-NEXT:    movswl 6(%rdi), %eax
2583; AVX512F-NEXT:    vmovd %eax, %xmm3
2584; AVX512F-NEXT:    vcvtph2ps %ymm3, %zmm3
2585; AVX512F-NEXT:    movswl 8(%rdi), %eax
2586; AVX512F-NEXT:    vmovd %eax, %xmm4
2587; AVX512F-NEXT:    vcvtph2ps %ymm4, %zmm4
2588; AVX512F-NEXT:    movswl 10(%rdi), %eax
2589; AVX512F-NEXT:    vmovd %eax, %xmm5
2590; AVX512F-NEXT:    vcvtph2ps %ymm5, %zmm5
2591; AVX512F-NEXT:    movswl 12(%rdi), %eax
2592; AVX512F-NEXT:    vmovd %eax, %xmm6
2593; AVX512F-NEXT:    vcvtph2ps %ymm6, %zmm6
2594; AVX512F-NEXT:    movswl 14(%rdi), %eax
2595; AVX512F-NEXT:    vmovd %eax, %xmm7
2596; AVX512F-NEXT:    vcvtph2ps %ymm7, %zmm7
2597; AVX512F-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
2598; AVX512F-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
2599; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2600; AVX512F-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
2601; AVX512F-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
2602; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0]
2603; AVX512F-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
2604; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2605; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2606; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2607; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2608; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2609; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2610; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2611; AVX512F-NEXT:    vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
2612; AVX512F-NEXT:    retq
2613;
2614; AVX512VL-LABEL: load_cvt_8i16_to_8f64:
2615; AVX512VL:       # BB#0:
2616; AVX512VL-NEXT:    movswl (%rdi), %eax
2617; AVX512VL-NEXT:    vmovd %eax, %xmm0
2618; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
2619; AVX512VL-NEXT:    movswl 2(%rdi), %eax
2620; AVX512VL-NEXT:    vmovd %eax, %xmm1
2621; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
2622; AVX512VL-NEXT:    movswl 4(%rdi), %eax
2623; AVX512VL-NEXT:    vmovd %eax, %xmm2
2624; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
2625; AVX512VL-NEXT:    movswl 6(%rdi), %eax
2626; AVX512VL-NEXT:    vmovd %eax, %xmm3
2627; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
2628; AVX512VL-NEXT:    movswl 8(%rdi), %eax
2629; AVX512VL-NEXT:    vmovd %eax, %xmm4
2630; AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
2631; AVX512VL-NEXT:    movswl 10(%rdi), %eax
2632; AVX512VL-NEXT:    vmovd %eax, %xmm5
2633; AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
2634; AVX512VL-NEXT:    movswl 12(%rdi), %eax
2635; AVX512VL-NEXT:    vmovd %eax, %xmm6
2636; AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
2637; AVX512VL-NEXT:    movswl 14(%rdi), %eax
2638; AVX512VL-NEXT:    vmovd %eax, %xmm7
2639; AVX512VL-NEXT:    vcvtph2ps %xmm7, %xmm7
2640; AVX512VL-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
2641; AVX512VL-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
2642; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2643; AVX512VL-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
2644; AVX512VL-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
2645; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0]
2646; AVX512VL-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
2647; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2648; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2649; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2650; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2651; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2652; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2653; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2654; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
2655; AVX512VL-NEXT:    retq
2656  %1 = load <8 x i16>, <8 x i16>* %a0
2657  %2 = bitcast <8 x i16> %1 to <8 x half>
2658  %3 = fpext <8 x half> %2 to <8 x double>
2659  ret <8 x double> %3
2660}
2661
2662;
2663; Float to Half
2664;
2665
2666define i16 @cvt_f32_to_i16(float %a0) nounwind {
2667; AVX1-LABEL: cvt_f32_to_i16:
2668; AVX1:       # BB#0:
2669; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2670; AVX1-NEXT:    vmovd %xmm0, %eax
2671; AVX1-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
2672; AVX1-NEXT:    retq
2673;
2674; AVX2-LABEL: cvt_f32_to_i16:
2675; AVX2:       # BB#0:
2676; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2677; AVX2-NEXT:    vmovd %xmm0, %eax
2678; AVX2-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
2679; AVX2-NEXT:    retq
2680;
2681; AVX512F-LABEL: cvt_f32_to_i16:
2682; AVX512F:       # BB#0:
2683; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
2684; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
2685; AVX512F-NEXT:    vmovd %xmm0, %eax
2686; AVX512F-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
2687; AVX512F-NEXT:    retq
2688;
2689; AVX512VL-LABEL: cvt_f32_to_i16:
2690; AVX512VL:       # BB#0:
2691; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2692; AVX512VL-NEXT:    vmovd %xmm0, %eax
2693; AVX512VL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
2694; AVX512VL-NEXT:    retq
2695  %1 = fptrunc float %a0 to half
2696  %2 = bitcast half %1 to i16
2697  ret i16 %2
2698}
2699
2700define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
2701; AVX1-LABEL: cvt_4f32_to_4i16:
2702; AVX1:       # BB#0:
2703; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2704; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2705; AVX1-NEXT:    vmovd %xmm1, %eax
2706; AVX1-NEXT:    shll $16, %eax
2707; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2708; AVX1-NEXT:    vmovd %xmm1, %ecx
2709; AVX1-NEXT:    movzwl %cx, %ecx
2710; AVX1-NEXT:    orl %eax, %ecx
2711; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2712; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2713; AVX1-NEXT:    vmovd %xmm1, %eax
2714; AVX1-NEXT:    shll $16, %eax
2715; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2716; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2717; AVX1-NEXT:    vmovd %xmm0, %edx
2718; AVX1-NEXT:    movzwl %dx, %edx
2719; AVX1-NEXT:    orl %eax, %edx
2720; AVX1-NEXT:    shlq $32, %rdx
2721; AVX1-NEXT:    orq %rcx, %rdx
2722; AVX1-NEXT:    vmovq %rdx, %xmm0
2723; AVX1-NEXT:    retq
2724;
2725; AVX2-LABEL: cvt_4f32_to_4i16:
2726; AVX2:       # BB#0:
2727; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2728; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2729; AVX2-NEXT:    vmovd %xmm1, %eax
2730; AVX2-NEXT:    shll $16, %eax
2731; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2732; AVX2-NEXT:    vmovd %xmm1, %ecx
2733; AVX2-NEXT:    movzwl %cx, %ecx
2734; AVX2-NEXT:    orl %eax, %ecx
2735; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2736; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2737; AVX2-NEXT:    vmovd %xmm1, %eax
2738; AVX2-NEXT:    shll $16, %eax
2739; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2740; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2741; AVX2-NEXT:    vmovd %xmm0, %edx
2742; AVX2-NEXT:    movzwl %dx, %edx
2743; AVX2-NEXT:    orl %eax, %edx
2744; AVX2-NEXT:    shlq $32, %rdx
2745; AVX2-NEXT:    orq %rcx, %rdx
2746; AVX2-NEXT:    vmovq %rdx, %xmm0
2747; AVX2-NEXT:    retq
2748;
2749; AVX512F-LABEL: cvt_4f32_to_4i16:
2750; AVX512F:       # BB#0:
2751; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
2752; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm1
2753; AVX512F-NEXT:    vmovd %xmm1, %eax
2754; AVX512F-NEXT:    movzwl %ax, %eax
2755; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2756; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
2757; AVX512F-NEXT:    vmovd %xmm1, %ecx
2758; AVX512F-NEXT:    shll $16, %ecx
2759; AVX512F-NEXT:    orl %eax, %ecx
2760; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2761; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
2762; AVX512F-NEXT:    vmovd %xmm1, %eax
2763; AVX512F-NEXT:    movzwl %ax, %eax
2764; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2765; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
2766; AVX512F-NEXT:    vmovd %xmm0, %edx
2767; AVX512F-NEXT:    shll $16, %edx
2768; AVX512F-NEXT:    orl %eax, %edx
2769; AVX512F-NEXT:    shlq $32, %rdx
2770; AVX512F-NEXT:    orq %rcx, %rdx
2771; AVX512F-NEXT:    vmovq %rdx, %xmm0
2772; AVX512F-NEXT:    retq
2773;
2774; AVX512VL-LABEL: cvt_4f32_to_4i16:
2775; AVX512VL:       # BB#0:
2776; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2777; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2778; AVX512VL-NEXT:    vmovd %xmm1, %eax
2779; AVX512VL-NEXT:    shll $16, %eax
2780; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2781; AVX512VL-NEXT:    vmovd %xmm1, %ecx
2782; AVX512VL-NEXT:    movzwl %cx, %ecx
2783; AVX512VL-NEXT:    orl %eax, %ecx
2784; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2785; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2786; AVX512VL-NEXT:    vmovd %xmm1, %eax
2787; AVX512VL-NEXT:    shll $16, %eax
2788; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2789; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2790; AVX512VL-NEXT:    vmovd %xmm0, %edx
2791; AVX512VL-NEXT:    movzwl %dx, %edx
2792; AVX512VL-NEXT:    orl %eax, %edx
2793; AVX512VL-NEXT:    shlq $32, %rdx
2794; AVX512VL-NEXT:    orq %rcx, %rdx
2795; AVX512VL-NEXT:    vmovq %rdx, %xmm0
2796; AVX512VL-NEXT:    retq
2797  %1 = fptrunc <4 x float> %a0 to <4 x half>
2798  %2 = bitcast <4 x half> %1 to <4 x i16>
2799  ret <4 x i16> %2
2800}
2801
2802define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
2803; AVX1-LABEL: cvt_4f32_to_8i16_undef:
2804; AVX1:       # BB#0:
2805; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2806; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2807; AVX1-NEXT:    vmovd %xmm1, %eax
2808; AVX1-NEXT:    shll $16, %eax
2809; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2810; AVX1-NEXT:    vmovd %xmm1, %ecx
2811; AVX1-NEXT:    movzwl %cx, %ecx
2812; AVX1-NEXT:    orl %eax, %ecx
2813; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2814; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2815; AVX1-NEXT:    vmovd %xmm1, %eax
2816; AVX1-NEXT:    shll $16, %eax
2817; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2818; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2819; AVX1-NEXT:    vmovd %xmm0, %edx
2820; AVX1-NEXT:    movzwl %dx, %edx
2821; AVX1-NEXT:    orl %eax, %edx
2822; AVX1-NEXT:    shlq $32, %rdx
2823; AVX1-NEXT:    orq %rcx, %rdx
2824; AVX1-NEXT:    vmovq %rdx, %xmm0
2825; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2826; AVX1-NEXT:    retq
2827;
2828; AVX2-LABEL: cvt_4f32_to_8i16_undef:
2829; AVX2:       # BB#0:
2830; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2831; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2832; AVX2-NEXT:    vmovd %xmm1, %eax
2833; AVX2-NEXT:    shll $16, %eax
2834; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2835; AVX2-NEXT:    vmovd %xmm1, %ecx
2836; AVX2-NEXT:    movzwl %cx, %ecx
2837; AVX2-NEXT:    orl %eax, %ecx
2838; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2839; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2840; AVX2-NEXT:    vmovd %xmm1, %eax
2841; AVX2-NEXT:    shll $16, %eax
2842; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2843; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2844; AVX2-NEXT:    vmovd %xmm0, %edx
2845; AVX2-NEXT:    movzwl %dx, %edx
2846; AVX2-NEXT:    orl %eax, %edx
2847; AVX2-NEXT:    shlq $32, %rdx
2848; AVX2-NEXT:    orq %rcx, %rdx
2849; AVX2-NEXT:    vmovq %rdx, %xmm0
2850; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2851; AVX2-NEXT:    retq
2852;
2853; AVX512F-LABEL: cvt_4f32_to_8i16_undef:
2854; AVX512F:       # BB#0:
2855; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
2856; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm1
2857; AVX512F-NEXT:    vmovd %xmm1, %eax
2858; AVX512F-NEXT:    movzwl %ax, %eax
2859; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2860; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
2861; AVX512F-NEXT:    vmovd %xmm1, %ecx
2862; AVX512F-NEXT:    shll $16, %ecx
2863; AVX512F-NEXT:    orl %eax, %ecx
2864; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2865; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
2866; AVX512F-NEXT:    vmovd %xmm1, %eax
2867; AVX512F-NEXT:    movzwl %ax, %eax
2868; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2869; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
2870; AVX512F-NEXT:    vmovd %xmm0, %edx
2871; AVX512F-NEXT:    shll $16, %edx
2872; AVX512F-NEXT:    orl %eax, %edx
2873; AVX512F-NEXT:    shlq $32, %rdx
2874; AVX512F-NEXT:    orq %rcx, %rdx
2875; AVX512F-NEXT:    vmovq %rdx, %xmm0
2876; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2877; AVX512F-NEXT:    retq
2878;
2879; AVX512VL-LABEL: cvt_4f32_to_8i16_undef:
2880; AVX512VL:       # BB#0:
2881; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2882; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2883; AVX512VL-NEXT:    vmovd %xmm1, %eax
2884; AVX512VL-NEXT:    shll $16, %eax
2885; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2886; AVX512VL-NEXT:    vmovd %xmm1, %ecx
2887; AVX512VL-NEXT:    movzwl %cx, %ecx
2888; AVX512VL-NEXT:    orl %eax, %ecx
2889; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2890; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2891; AVX512VL-NEXT:    vmovd %xmm1, %eax
2892; AVX512VL-NEXT:    shll $16, %eax
2893; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2894; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2895; AVX512VL-NEXT:    vmovd %xmm0, %edx
2896; AVX512VL-NEXT:    movzwl %dx, %edx
2897; AVX512VL-NEXT:    orl %eax, %edx
2898; AVX512VL-NEXT:    shlq $32, %rdx
2899; AVX512VL-NEXT:    orq %rcx, %rdx
2900; AVX512VL-NEXT:    vmovq %rdx, %xmm0
2901; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2902; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2903; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2904; AVX512VL-NEXT:    retq
2905  %1 = fptrunc <4 x float> %a0 to <4 x half>
2906  %2 = bitcast <4 x half> %1 to <4 x i16>
2907  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2908  ret <8 x i16> %3
2909}
2910
2911define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
2912; AVX1-LABEL: cvt_4f32_to_8i16_zero:
2913; AVX1:       # BB#0:
2914; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2915; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2916; AVX1-NEXT:    vmovd %xmm1, %eax
2917; AVX1-NEXT:    shll $16, %eax
2918; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2919; AVX1-NEXT:    vmovd %xmm1, %ecx
2920; AVX1-NEXT:    movzwl %cx, %ecx
2921; AVX1-NEXT:    orl %eax, %ecx
2922; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2923; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2924; AVX1-NEXT:    vmovd %xmm1, %eax
2925; AVX1-NEXT:    shll $16, %eax
2926; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2927; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2928; AVX1-NEXT:    vmovd %xmm0, %edx
2929; AVX1-NEXT:    movzwl %dx, %edx
2930; AVX1-NEXT:    orl %eax, %edx
2931; AVX1-NEXT:    shlq $32, %rdx
2932; AVX1-NEXT:    orq %rcx, %rdx
2933; AVX1-NEXT:    vmovq %rdx, %xmm0
2934; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
2935; AVX1-NEXT:    retq
2936;
2937; AVX2-LABEL: cvt_4f32_to_8i16_zero:
2938; AVX2:       # BB#0:
2939; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2940; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2941; AVX2-NEXT:    vmovd %xmm1, %eax
2942; AVX2-NEXT:    shll $16, %eax
2943; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2944; AVX2-NEXT:    vmovd %xmm1, %ecx
2945; AVX2-NEXT:    movzwl %cx, %ecx
2946; AVX2-NEXT:    orl %eax, %ecx
2947; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2948; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2949; AVX2-NEXT:    vmovd %xmm1, %eax
2950; AVX2-NEXT:    shll $16, %eax
2951; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2952; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2953; AVX2-NEXT:    vmovd %xmm0, %edx
2954; AVX2-NEXT:    movzwl %dx, %edx
2955; AVX2-NEXT:    orl %eax, %edx
2956; AVX2-NEXT:    shlq $32, %rdx
2957; AVX2-NEXT:    orq %rcx, %rdx
2958; AVX2-NEXT:    vmovq %rdx, %xmm0
2959; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
2960; AVX2-NEXT:    retq
2961;
2962; AVX512F-LABEL: cvt_4f32_to_8i16_zero:
2963; AVX512F:       # BB#0:
2964; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
2965; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm1
2966; AVX512F-NEXT:    vmovd %xmm1, %eax
2967; AVX512F-NEXT:    movzwl %ax, %eax
2968; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2969; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
2970; AVX512F-NEXT:    vmovd %xmm1, %ecx
2971; AVX512F-NEXT:    shll $16, %ecx
2972; AVX512F-NEXT:    orl %eax, %ecx
2973; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2974; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
2975; AVX512F-NEXT:    vmovd %xmm1, %eax
2976; AVX512F-NEXT:    movzwl %ax, %eax
2977; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2978; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
2979; AVX512F-NEXT:    vmovd %xmm0, %edx
2980; AVX512F-NEXT:    shll $16, %edx
2981; AVX512F-NEXT:    orl %eax, %edx
2982; AVX512F-NEXT:    shlq $32, %rdx
2983; AVX512F-NEXT:    orq %rcx, %rdx
2984; AVX512F-NEXT:    vmovq %rdx, %xmm0
2985; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
2986; AVX512F-NEXT:    retq
2987;
2988; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
2989; AVX512VL:       # BB#0:
2990; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2991; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2992; AVX512VL-NEXT:    vmovd %xmm1, %eax
2993; AVX512VL-NEXT:    shll $16, %eax
2994; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2995; AVX512VL-NEXT:    vmovd %xmm1, %ecx
2996; AVX512VL-NEXT:    movzwl %cx, %ecx
2997; AVX512VL-NEXT:    orl %eax, %ecx
2998; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2999; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3000; AVX512VL-NEXT:    vmovd %xmm1, %eax
3001; AVX512VL-NEXT:    shll $16, %eax
3002; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3003; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3004; AVX512VL-NEXT:    vmovd %xmm0, %edx
3005; AVX512VL-NEXT:    movzwl %dx, %edx
3006; AVX512VL-NEXT:    orl %eax, %edx
3007; AVX512VL-NEXT:    shlq $32, %rdx
3008; AVX512VL-NEXT:    orq %rcx, %rdx
3009; AVX512VL-NEXT:    vmovq %rdx, %xmm0
3010; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3011; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
3012; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
3013; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3014; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
3015; AVX512VL-NEXT:    retq
3016  %1 = fptrunc <4 x float> %a0 to <4 x half>
3017  %2 = bitcast <4 x half> %1 to <4 x i16>
3018  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3019  ret <8 x i16> %3
3020}
3021
3022define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
3023; AVX1-LABEL: cvt_8f32_to_8i16:
3024; AVX1:       # BB#0:
3025; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3026; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3027; AVX1-NEXT:    vmovd %xmm1, %eax
3028; AVX1-NEXT:    shll $16, %eax
3029; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3030; AVX1-NEXT:    vmovd %xmm1, %ecx
3031; AVX1-NEXT:    movzwl %cx, %ecx
3032; AVX1-NEXT:    orl %eax, %ecx
3033; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3034; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3035; AVX1-NEXT:    vmovd %xmm1, %edx
3036; AVX1-NEXT:    shll $16, %edx
3037; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3038; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3039; AVX1-NEXT:    vmovd %xmm1, %eax
3040; AVX1-NEXT:    movzwl %ax, %eax
3041; AVX1-NEXT:    orl %edx, %eax
3042; AVX1-NEXT:    shlq $32, %rax
3043; AVX1-NEXT:    orq %rcx, %rax
3044; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3045; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3046; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3047; AVX1-NEXT:    vmovd %xmm1, %ecx
3048; AVX1-NEXT:    shll $16, %ecx
3049; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3050; AVX1-NEXT:    vmovd %xmm1, %edx
3051; AVX1-NEXT:    movzwl %dx, %edx
3052; AVX1-NEXT:    orl %ecx, %edx
3053; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3054; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3055; AVX1-NEXT:    vmovd %xmm1, %ecx
3056; AVX1-NEXT:    shll $16, %ecx
3057; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3058; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3059; AVX1-NEXT:    vmovd %xmm0, %esi
3060; AVX1-NEXT:    movzwl %si, %esi
3061; AVX1-NEXT:    orl %ecx, %esi
3062; AVX1-NEXT:    shlq $32, %rsi
3063; AVX1-NEXT:    orq %rdx, %rsi
3064; AVX1-NEXT:    vmovq %rsi, %xmm0
3065; AVX1-NEXT:    vmovq %rax, %xmm1
3066; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
3067; AVX1-NEXT:    vzeroupper
3068; AVX1-NEXT:    retq
3069;
3070; AVX2-LABEL: cvt_8f32_to_8i16:
3071; AVX2:       # BB#0:
3072; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3073; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3074; AVX2-NEXT:    vmovd %xmm1, %eax
3075; AVX2-NEXT:    shll $16, %eax
3076; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3077; AVX2-NEXT:    vmovd %xmm1, %ecx
3078; AVX2-NEXT:    movzwl %cx, %ecx
3079; AVX2-NEXT:    orl %eax, %ecx
3080; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3081; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3082; AVX2-NEXT:    vmovd %xmm1, %edx
3083; AVX2-NEXT:    shll $16, %edx
3084; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3085; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3086; AVX2-NEXT:    vmovd %xmm1, %eax
3087; AVX2-NEXT:    movzwl %ax, %eax
3088; AVX2-NEXT:    orl %edx, %eax
3089; AVX2-NEXT:    shlq $32, %rax
3090; AVX2-NEXT:    orq %rcx, %rax
3091; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3092; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3093; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3094; AVX2-NEXT:    vmovd %xmm1, %ecx
3095; AVX2-NEXT:    shll $16, %ecx
3096; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3097; AVX2-NEXT:    vmovd %xmm1, %edx
3098; AVX2-NEXT:    movzwl %dx, %edx
3099; AVX2-NEXT:    orl %ecx, %edx
3100; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3101; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3102; AVX2-NEXT:    vmovd %xmm1, %ecx
3103; AVX2-NEXT:    shll $16, %ecx
3104; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3105; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3106; AVX2-NEXT:    vmovd %xmm0, %esi
3107; AVX2-NEXT:    movzwl %si, %esi
3108; AVX2-NEXT:    orl %ecx, %esi
3109; AVX2-NEXT:    shlq $32, %rsi
3110; AVX2-NEXT:    orq %rdx, %rsi
3111; AVX2-NEXT:    vmovq %rsi, %xmm0
3112; AVX2-NEXT:    vmovq %rax, %xmm1
3113; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
3114; AVX2-NEXT:    vzeroupper
3115; AVX2-NEXT:    retq
3116;
3117; AVX512F-LABEL: cvt_8f32_to_8i16:
3118; AVX512F:       # BB#0:
3119; AVX512F-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
3120; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm1
3121; AVX512F-NEXT:    vmovd %xmm1, %eax
3122; AVX512F-NEXT:    movzwl %ax, %eax
3123; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3124; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3125; AVX512F-NEXT:    vmovd %xmm1, %ecx
3126; AVX512F-NEXT:    shll $16, %ecx
3127; AVX512F-NEXT:    orl %eax, %ecx
3128; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3129; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3130; AVX512F-NEXT:    vmovd %xmm1, %eax
3131; AVX512F-NEXT:    movzwl %ax, %edx
3132; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3133; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3134; AVX512F-NEXT:    vmovd %xmm1, %eax
3135; AVX512F-NEXT:    shll $16, %eax
3136; AVX512F-NEXT:    orl %edx, %eax
3137; AVX512F-NEXT:    shlq $32, %rax
3138; AVX512F-NEXT:    orq %rcx, %rax
3139; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
3140; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm1
3141; AVX512F-NEXT:    vmovd %xmm1, %ecx
3142; AVX512F-NEXT:    movzwl %cx, %ecx
3143; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3144; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3145; AVX512F-NEXT:    vmovd %xmm1, %edx
3146; AVX512F-NEXT:    shll $16, %edx
3147; AVX512F-NEXT:    orl %ecx, %edx
3148; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3149; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3150; AVX512F-NEXT:    vmovd %xmm1, %ecx
3151; AVX512F-NEXT:    movzwl %cx, %ecx
3152; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
3153; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
3154; AVX512F-NEXT:    vmovd %xmm0, %esi
3155; AVX512F-NEXT:    shll $16, %esi
3156; AVX512F-NEXT:    orl %ecx, %esi
3157; AVX512F-NEXT:    shlq $32, %rsi
3158; AVX512F-NEXT:    orq %rdx, %rsi
3159; AVX512F-NEXT:    vmovq %rsi, %xmm0
3160; AVX512F-NEXT:    vmovq %rax, %xmm1
3161; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
3162; AVX512F-NEXT:    retq
3163;
3164; AVX512VL-LABEL: cvt_8f32_to_8i16:
3165; AVX512VL:       # BB#0:
3166; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3167; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3168; AVX512VL-NEXT:    vmovd %xmm1, %eax
3169; AVX512VL-NEXT:    shll $16, %eax
3170; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3171; AVX512VL-NEXT:    vmovd %xmm1, %ecx
3172; AVX512VL-NEXT:    movzwl %cx, %ecx
3173; AVX512VL-NEXT:    orl %eax, %ecx
3174; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3175; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3176; AVX512VL-NEXT:    vmovd %xmm1, %edx
3177; AVX512VL-NEXT:    shll $16, %edx
3178; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3179; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3180; AVX512VL-NEXT:    vmovd %xmm1, %eax
3181; AVX512VL-NEXT:    movzwl %ax, %eax
3182; AVX512VL-NEXT:    orl %edx, %eax
3183; AVX512VL-NEXT:    shlq $32, %rax
3184; AVX512VL-NEXT:    orq %rcx, %rax
3185; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
3186; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3187; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3188; AVX512VL-NEXT:    vmovd %xmm1, %ecx
3189; AVX512VL-NEXT:    shll $16, %ecx
3190; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3191; AVX512VL-NEXT:    vmovd %xmm1, %edx
3192; AVX512VL-NEXT:    movzwl %dx, %edx
3193; AVX512VL-NEXT:    orl %ecx, %edx
3194; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3195; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3196; AVX512VL-NEXT:    vmovd %xmm1, %ecx
3197; AVX512VL-NEXT:    shll $16, %ecx
3198; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3199; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3200; AVX512VL-NEXT:    vmovd %xmm0, %esi
3201; AVX512VL-NEXT:    movzwl %si, %esi
3202; AVX512VL-NEXT:    orl %ecx, %esi
3203; AVX512VL-NEXT:    shlq $32, %rsi
3204; AVX512VL-NEXT:    orq %rdx, %rsi
3205; AVX512VL-NEXT:    vmovq %rsi, %xmm0
3206; AVX512VL-NEXT:    vmovq %rax, %xmm1
3207; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
3208; AVX512VL-NEXT:    retq
3209  %1 = fptrunc <8 x float> %a0 to <8 x half>
3210  %2 = bitcast <8 x half> %1 to <8 x i16>
3211  ret <8 x i16> %2
3212}
3213
3214define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
3215; AVX1-LABEL: cvt_16f32_to_16i16:
3216; AVX1:       # BB#0:
3217; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
3218; AVX1-NEXT:    vmovd %xmm2, %eax
3219; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3220; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3221; AVX1-NEXT:    vmovd %eax, %xmm3
3222; AVX1-NEXT:    vmovd %xmm2, %eax
3223; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
3224; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3225; AVX1-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
3226; AVX1-NEXT:    vmovd %xmm2, %eax
3227; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3228; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
3229; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3230; AVX1-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
3231; AVX1-NEXT:    vmovd %xmm1, %eax
3232; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
3233; AVX1-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
3234; AVX1-NEXT:    vmovd %xmm1, %eax
3235; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
3236; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3237; AVX1-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
3238; AVX1-NEXT:    vmovd %xmm1, %eax
3239; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
3240; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3241; AVX1-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
3242; AVX1-NEXT:    vmovd %xmm1, %eax
3243; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3244; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
3245; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3246; AVX1-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
3247; AVX1-NEXT:    vmovd %xmm2, %eax
3248; AVX1-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
3249; AVX1-NEXT:    vmovd %xmm1, %eax
3250; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3251; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3252; AVX1-NEXT:    vmovd %eax, %xmm3
3253; AVX1-NEXT:    vmovd %xmm1, %eax
3254; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3255; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3256; AVX1-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
3257; AVX1-NEXT:    vmovd %xmm1, %eax
3258; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3259; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
3260; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3261; AVX1-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
3262; AVX1-NEXT:    vmovd %xmm0, %eax
3263; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
3264; AVX1-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
3265; AVX1-NEXT:    vmovd %xmm0, %eax
3266; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
3267; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3268; AVX1-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
3269; AVX1-NEXT:    vmovd %xmm0, %eax
3270; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
3271; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3272; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
3273; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3274; AVX1-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
3275; AVX1-NEXT:    vmovd %xmm1, %eax
3276; AVX1-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
3277; AVX1-NEXT:    vmovd %xmm0, %eax
3278; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
3279; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3280; AVX1-NEXT:    retq
3281;
3282; AVX2-LABEL: cvt_16f32_to_16i16:
3283; AVX2:       # BB#0:
3284; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
3285; AVX2-NEXT:    vmovd %xmm2, %eax
3286; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3287; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3288; AVX2-NEXT:    vmovd %eax, %xmm3
3289; AVX2-NEXT:    vmovd %xmm2, %eax
3290; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
3291; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3292; AVX2-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
3293; AVX2-NEXT:    vmovd %xmm2, %eax
3294; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
3295; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
3296; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3297; AVX2-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
3298; AVX2-NEXT:    vmovd %xmm1, %eax
3299; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
3300; AVX2-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
3301; AVX2-NEXT:    vmovd %xmm1, %eax
3302; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
3303; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3304; AVX2-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
3305; AVX2-NEXT:    vmovd %xmm1, %eax
3306; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
3307; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3308; AVX2-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
3309; AVX2-NEXT:    vmovd %xmm1, %eax
3310; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3311; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
3312; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3313; AVX2-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
3314; AVX2-NEXT:    vmovd %xmm2, %eax
3315; AVX2-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
3316; AVX2-NEXT:    vmovd %xmm1, %eax
3317; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3318; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3319; AVX2-NEXT:    vmovd %eax, %xmm3
3320; AVX2-NEXT:    vmovd %xmm1, %eax
3321; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3322; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3323; AVX2-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
3324; AVX2-NEXT:    vmovd %xmm1, %eax
3325; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
3326; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
3327; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3328; AVX2-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
3329; AVX2-NEXT:    vmovd %xmm0, %eax
3330; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
3331; AVX2-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
3332; AVX2-NEXT:    vmovd %xmm0, %eax
3333; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
3334; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3335; AVX2-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
3336; AVX2-NEXT:    vmovd %xmm0, %eax
3337; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
3338; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3339; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
3340; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3341; AVX2-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
3342; AVX2-NEXT:    vmovd %xmm1, %eax
3343; AVX2-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
3344; AVX2-NEXT:    vmovd %xmm0, %eax
3345; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
3346; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
3347; AVX2-NEXT:    retq
3348;
3349; AVX512F-LABEL: cvt_16f32_to_16i16:
3350; AVX512F:       # BB#0:
3351; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
3352; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm2
3353; AVX512F-NEXT:    vmovd %xmm2, %eax
3354; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3355; AVX512F-NEXT:    vcvtps2ph $4, %zmm2, %ymm2
3356; AVX512F-NEXT:    vmovd %eax, %xmm3
3357; AVX512F-NEXT:    vmovd %xmm2, %eax
3358; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
3359; AVX512F-NEXT:    vcvtps2ph $4, %zmm2, %ymm2
3360; AVX512F-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
3361; AVX512F-NEXT:    vmovd %xmm2, %eax
3362; AVX512F-NEXT:    vextractf128 $1, %ymm1, %xmm2
3363; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
3364; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3365; AVX512F-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
3366; AVX512F-NEXT:    vmovd %xmm1, %eax
3367; AVX512F-NEXT:    vcvtps2ph $4, %zmm2, %ymm1
3368; AVX512F-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
3369; AVX512F-NEXT:    vmovd %xmm1, %eax
3370; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
3371; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3372; AVX512F-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
3373; AVX512F-NEXT:    vmovd %xmm1, %eax
3374; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
3375; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3376; AVX512F-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
3377; AVX512F-NEXT:    vmovd %xmm1, %eax
3378; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm1
3379; AVX512F-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
3380; AVX512F-NEXT:    vcvtps2ph $4, %zmm2, %ymm2
3381; AVX512F-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
3382; AVX512F-NEXT:    vmovd %xmm2, %eax
3383; AVX512F-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
3384; AVX512F-NEXT:    vmovd %xmm1, %eax
3385; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3386; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3387; AVX512F-NEXT:    vmovd %eax, %xmm3
3388; AVX512F-NEXT:    vmovd %xmm1, %eax
3389; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3390; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3391; AVX512F-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
3392; AVX512F-NEXT:    vmovd %xmm1, %eax
3393; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm1
3394; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
3395; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
3396; AVX512F-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
3397; AVX512F-NEXT:    vmovd %xmm0, %eax
3398; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm0
3399; AVX512F-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
3400; AVX512F-NEXT:    vmovd %xmm0, %eax
3401; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
3402; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
3403; AVX512F-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
3404; AVX512F-NEXT:    vmovd %xmm0, %eax
3405; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
3406; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
3407; AVX512F-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
3408; AVX512F-NEXT:    vmovd %xmm0, %eax
3409; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
3410; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
3411; AVX512F-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
3412; AVX512F-NEXT:    vmovd %xmm0, %eax
3413; AVX512F-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
3414; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
3415; AVX512F-NEXT:    retq
3416;
3417; AVX512VL-LABEL: cvt_16f32_to_16i16:
3418; AVX512VL:       # BB#0:
3419; AVX512VL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
3420; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
3421; AVX512VL-NEXT:    vmovd %xmm2, %eax
3422; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3423; AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3424; AVX512VL-NEXT:    vmovd %eax, %xmm3
3425; AVX512VL-NEXT:    vmovd %xmm2, %eax
3426; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
3427; AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3428; AVX512VL-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
3429; AVX512VL-NEXT:    vmovd %xmm2, %eax
3430; AVX512VL-NEXT:    vextractf128 $1, %ymm1, %xmm2
3431; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
3432; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3433; AVX512VL-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
3434; AVX512VL-NEXT:    vmovd %xmm1, %eax
3435; AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
3436; AVX512VL-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
3437; AVX512VL-NEXT:    vmovd %xmm1, %eax
3438; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
3439; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3440; AVX512VL-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
3441; AVX512VL-NEXT:    vmovd %xmm1, %eax
3442; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
3443; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3444; AVX512VL-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
3445; AVX512VL-NEXT:    vmovd %xmm1, %eax
3446; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3447; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
3448; AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3449; AVX512VL-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
3450; AVX512VL-NEXT:    vmovd %xmm2, %eax
3451; AVX512VL-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
3452; AVX512VL-NEXT:    vmovd %xmm1, %eax
3453; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3454; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3455; AVX512VL-NEXT:    vmovd %eax, %xmm3
3456; AVX512VL-NEXT:    vmovd %xmm1, %eax
3457; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3458; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3459; AVX512VL-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
3460; AVX512VL-NEXT:    vmovd %xmm1, %eax
3461; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
3462; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
3463; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3464; AVX512VL-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
3465; AVX512VL-NEXT:    vmovd %xmm0, %eax
3466; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
3467; AVX512VL-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
3468; AVX512VL-NEXT:    vmovd %xmm0, %eax
3469; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
3470; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3471; AVX512VL-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
3472; AVX512VL-NEXT:    vmovd %xmm0, %eax
3473; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
3474; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3475; AVX512VL-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
3476; AVX512VL-NEXT:    vmovd %xmm0, %eax
3477; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
3478; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3479; AVX512VL-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
3480; AVX512VL-NEXT:    vmovd %xmm0, %eax
3481; AVX512VL-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
3482; AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
3483; AVX512VL-NEXT:    retq
3484  %1 = fptrunc <16 x float> %a0 to <16 x half>
3485  %2 = bitcast <16 x half> %1 to <16 x i16>
3486  ret <16 x i16> %2
3487}
3488
3489;
3490; Float to Half (Store)
3491;
3492
3493define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
3494; AVX1-LABEL: store_cvt_f32_to_i16:
3495; AVX1:       # BB#0:
3496; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3497; AVX1-NEXT:    vmovd %xmm0, %eax
3498; AVX1-NEXT:    movw %ax, (%rdi)
3499; AVX1-NEXT:    retq
3500;
3501; AVX2-LABEL: store_cvt_f32_to_i16:
3502; AVX2:       # BB#0:
3503; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3504; AVX2-NEXT:    vmovd %xmm0, %eax
3505; AVX2-NEXT:    movw %ax, (%rdi)
3506; AVX2-NEXT:    retq
3507;
3508; AVX512F-LABEL: store_cvt_f32_to_i16:
3509; AVX512F:       # BB#0:
3510; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
3511; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
3512; AVX512F-NEXT:    vmovd %xmm0, %eax
3513; AVX512F-NEXT:    movw %ax, (%rdi)
3514; AVX512F-NEXT:    retq
3515;
3516; AVX512VL-LABEL: store_cvt_f32_to_i16:
3517; AVX512VL:       # BB#0:
3518; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3519; AVX512VL-NEXT:    vmovd %xmm0, %eax
3520; AVX512VL-NEXT:    movw %ax, (%rdi)
3521; AVX512VL-NEXT:    retq
3522  %1 = fptrunc float %a0 to half
3523  %2 = bitcast half %1 to i16
3524  store i16 %2, i16* %a1
3525  ret void
3526}
3527
3528define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
3529; AVX1-LABEL: store_cvt_4f32_to_4i16:
3530; AVX1:       # BB#0:
3531; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3532; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3533; AVX1-NEXT:    vmovd %xmm1, %eax
3534; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3535; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3536; AVX1-NEXT:    vmovd %xmm1, %ecx
3537; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3538; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3539; AVX1-NEXT:    vmovd %xmm1, %edx
3540; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3541; AVX1-NEXT:    vmovd %xmm0, %esi
3542; AVX1-NEXT:    movw %si, (%rdi)
3543; AVX1-NEXT:    movw %dx, 6(%rdi)
3544; AVX1-NEXT:    movw %cx, 4(%rdi)
3545; AVX1-NEXT:    movw %ax, 2(%rdi)
3546; AVX1-NEXT:    retq
3547;
3548; AVX2-LABEL: store_cvt_4f32_to_4i16:
3549; AVX2:       # BB#0:
3550; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3551; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3552; AVX2-NEXT:    vmovd %xmm1, %eax
3553; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3554; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3555; AVX2-NEXT:    vmovd %xmm1, %ecx
3556; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3557; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3558; AVX2-NEXT:    vmovd %xmm1, %edx
3559; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3560; AVX2-NEXT:    vmovd %xmm0, %esi
3561; AVX2-NEXT:    movw %si, (%rdi)
3562; AVX2-NEXT:    movw %dx, 6(%rdi)
3563; AVX2-NEXT:    movw %cx, 4(%rdi)
3564; AVX2-NEXT:    movw %ax, 2(%rdi)
3565; AVX2-NEXT:    retq
3566;
3567; AVX512F-LABEL: store_cvt_4f32_to_4i16:
3568; AVX512F:       # BB#0:
3569; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
3570; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3571; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3572; AVX512F-NEXT:    vmovd %xmm1, %eax
3573; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3574; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3575; AVX512F-NEXT:    vmovd %xmm1, %ecx
3576; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3577; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3578; AVX512F-NEXT:    vmovd %xmm1, %edx
3579; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
3580; AVX512F-NEXT:    vmovd %xmm0, %esi
3581; AVX512F-NEXT:    movw %si, (%rdi)
3582; AVX512F-NEXT:    movw %dx, 6(%rdi)
3583; AVX512F-NEXT:    movw %cx, 4(%rdi)
3584; AVX512F-NEXT:    movw %ax, 2(%rdi)
3585; AVX512F-NEXT:    retq
3586;
3587; AVX512VL-LABEL: store_cvt_4f32_to_4i16:
3588; AVX512VL:       # BB#0:
3589; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3590; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3591; AVX512VL-NEXT:    vmovd %xmm1, %eax
3592; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3593; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3594; AVX512VL-NEXT:    vmovd %xmm1, %ecx
3595; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3596; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3597; AVX512VL-NEXT:    vmovd %xmm1, %edx
3598; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3599; AVX512VL-NEXT:    vmovd %xmm0, %esi
3600; AVX512VL-NEXT:    movw %si, (%rdi)
3601; AVX512VL-NEXT:    movw %dx, 6(%rdi)
3602; AVX512VL-NEXT:    movw %cx, 4(%rdi)
3603; AVX512VL-NEXT:    movw %ax, 2(%rdi)
3604; AVX512VL-NEXT:    retq
3605  %1 = fptrunc <4 x float> %a0 to <4 x half>
3606  %2 = bitcast <4 x half> %1 to <4 x i16>
3607  store <4 x i16> %2, <4 x i16>* %a1
3608  ret void
3609}
3610
3611define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounwind {
3612; AVX1-LABEL: store_cvt_4f32_to_8i16_undef:
3613; AVX1:       # BB#0:
3614; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3615; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3616; AVX1-NEXT:    vmovd %xmm1, %eax
3617; AVX1-NEXT:    shll $16, %eax
3618; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3619; AVX1-NEXT:    vmovd %xmm1, %ecx
3620; AVX1-NEXT:    movzwl %cx, %ecx
3621; AVX1-NEXT:    orl %eax, %ecx
3622; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3623; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3624; AVX1-NEXT:    vmovd %xmm1, %eax
3625; AVX1-NEXT:    shll $16, %eax
3626; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3627; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3628; AVX1-NEXT:    vmovd %xmm0, %edx
3629; AVX1-NEXT:    movzwl %dx, %edx
3630; AVX1-NEXT:    orl %eax, %edx
3631; AVX1-NEXT:    shlq $32, %rdx
3632; AVX1-NEXT:    orq %rcx, %rdx
3633; AVX1-NEXT:    vmovq %rdx, %xmm0
3634; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3635; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
3636; AVX1-NEXT:    retq
3637;
3638; AVX2-LABEL: store_cvt_4f32_to_8i16_undef:
3639; AVX2:       # BB#0:
3640; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3641; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3642; AVX2-NEXT:    vmovd %xmm1, %eax
3643; AVX2-NEXT:    shll $16, %eax
3644; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3645; AVX2-NEXT:    vmovd %xmm1, %ecx
3646; AVX2-NEXT:    movzwl %cx, %ecx
3647; AVX2-NEXT:    orl %eax, %ecx
3648; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3649; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3650; AVX2-NEXT:    vmovd %xmm1, %eax
3651; AVX2-NEXT:    shll $16, %eax
3652; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3653; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3654; AVX2-NEXT:    vmovd %xmm0, %edx
3655; AVX2-NEXT:    movzwl %dx, %edx
3656; AVX2-NEXT:    orl %eax, %edx
3657; AVX2-NEXT:    shlq $32, %rdx
3658; AVX2-NEXT:    orq %rcx, %rdx
3659; AVX2-NEXT:    vmovq %rdx, %xmm0
3660; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3661; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
3662; AVX2-NEXT:    retq
3663;
3664; AVX512F-LABEL: store_cvt_4f32_to_8i16_undef:
3665; AVX512F:       # BB#0:
3666; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
3667; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm1
3668; AVX512F-NEXT:    vmovd %xmm1, %eax
3669; AVX512F-NEXT:    movzwl %ax, %eax
3670; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3671; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3672; AVX512F-NEXT:    vmovd %xmm1, %ecx
3673; AVX512F-NEXT:    shll $16, %ecx
3674; AVX512F-NEXT:    orl %eax, %ecx
3675; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3676; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3677; AVX512F-NEXT:    vmovd %xmm1, %eax
3678; AVX512F-NEXT:    movzwl %ax, %eax
3679; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
3680; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
3681; AVX512F-NEXT:    vmovd %xmm0, %edx
3682; AVX512F-NEXT:    shll $16, %edx
3683; AVX512F-NEXT:    orl %eax, %edx
3684; AVX512F-NEXT:    shlq $32, %rdx
3685; AVX512F-NEXT:    orq %rcx, %rdx
3686; AVX512F-NEXT:    vmovq %rdx, %xmm0
3687; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3688; AVX512F-NEXT:    vmovdqa %xmm0, (%rdi)
3689; AVX512F-NEXT:    retq
3690;
3691; AVX512VL-LABEL: store_cvt_4f32_to_8i16_undef:
3692; AVX512VL:       # BB#0:
3693; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3694; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3695; AVX512VL-NEXT:    vmovd %xmm1, %eax
3696; AVX512VL-NEXT:    shll $16, %eax
3697; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3698; AVX512VL-NEXT:    vmovd %xmm1, %ecx
3699; AVX512VL-NEXT:    movzwl %cx, %ecx
3700; AVX512VL-NEXT:    orl %eax, %ecx
3701; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3702; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3703; AVX512VL-NEXT:    vmovd %xmm1, %eax
3704; AVX512VL-NEXT:    shll $16, %eax
3705; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3706; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3707; AVX512VL-NEXT:    vmovd %xmm0, %edx
3708; AVX512VL-NEXT:    movzwl %dx, %edx
3709; AVX512VL-NEXT:    orl %eax, %edx
3710; AVX512VL-NEXT:    shlq $32, %rdx
3711; AVX512VL-NEXT:    orq %rcx, %rdx
3712; AVX512VL-NEXT:    vmovq %rdx, %xmm0
3713; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3714; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
3715; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3716; AVX512VL-NEXT:    vmovdqa %xmm0, (%rdi)
3717; AVX512VL-NEXT:    retq
3718  %1 = fptrunc <4 x float> %a0 to <4 x half>
3719  %2 = bitcast <4 x half> %1 to <4 x i16>
3720  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3721  store <8 x i16> %3, <8 x i16>* %a1
3722  ret void
3723}
3724
3725define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind {
3726; AVX1-LABEL: store_cvt_4f32_to_8i16_zero:
3727; AVX1:       # BB#0:
3728; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3729; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3730; AVX1-NEXT:    vmovd %xmm1, %eax
3731; AVX1-NEXT:    shll $16, %eax
3732; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3733; AVX1-NEXT:    vmovd %xmm1, %ecx
3734; AVX1-NEXT:    movzwl %cx, %ecx
3735; AVX1-NEXT:    orl %eax, %ecx
3736; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3737; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3738; AVX1-NEXT:    vmovd %xmm1, %eax
3739; AVX1-NEXT:    shll $16, %eax
3740; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3741; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3742; AVX1-NEXT:    vmovd %xmm0, %edx
3743; AVX1-NEXT:    movzwl %dx, %edx
3744; AVX1-NEXT:    orl %eax, %edx
3745; AVX1-NEXT:    shlq $32, %rdx
3746; AVX1-NEXT:    orq %rcx, %rdx
3747; AVX1-NEXT:    vmovq %rdx, %xmm0
3748; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
3749; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
3750; AVX1-NEXT:    retq
3751;
3752; AVX2-LABEL: store_cvt_4f32_to_8i16_zero:
3753; AVX2:       # BB#0:
3754; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3755; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3756; AVX2-NEXT:    vmovd %xmm1, %eax
3757; AVX2-NEXT:    shll $16, %eax
3758; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3759; AVX2-NEXT:    vmovd %xmm1, %ecx
3760; AVX2-NEXT:    movzwl %cx, %ecx
3761; AVX2-NEXT:    orl %eax, %ecx
3762; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3763; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3764; AVX2-NEXT:    vmovd %xmm1, %eax
3765; AVX2-NEXT:    shll $16, %eax
3766; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3767; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3768; AVX2-NEXT:    vmovd %xmm0, %edx
3769; AVX2-NEXT:    movzwl %dx, %edx
3770; AVX2-NEXT:    orl %eax, %edx
3771; AVX2-NEXT:    shlq $32, %rdx
3772; AVX2-NEXT:    orq %rcx, %rdx
3773; AVX2-NEXT:    vmovq %rdx, %xmm0
3774; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
3775; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
3776; AVX2-NEXT:    retq
3777;
3778; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero:
3779; AVX512F:       # BB#0:
3780; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
3781; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm1
3782; AVX512F-NEXT:    vmovd %xmm1, %eax
3783; AVX512F-NEXT:    movzwl %ax, %eax
3784; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3785; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3786; AVX512F-NEXT:    vmovd %xmm1, %ecx
3787; AVX512F-NEXT:    shll $16, %ecx
3788; AVX512F-NEXT:    orl %eax, %ecx
3789; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3790; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3791; AVX512F-NEXT:    vmovd %xmm1, %eax
3792; AVX512F-NEXT:    movzwl %ax, %eax
3793; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
3794; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
3795; AVX512F-NEXT:    vmovd %xmm0, %edx
3796; AVX512F-NEXT:    shll $16, %edx
3797; AVX512F-NEXT:    orl %eax, %edx
3798; AVX512F-NEXT:    shlq $32, %rdx
3799; AVX512F-NEXT:    orq %rcx, %rdx
3800; AVX512F-NEXT:    vmovq %rdx, %xmm0
3801; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
3802; AVX512F-NEXT:    vmovdqa %xmm0, (%rdi)
3803; AVX512F-NEXT:    retq
3804;
3805; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero:
3806; AVX512VL:       # BB#0:
3807; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3808; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3809; AVX512VL-NEXT:    vmovd %xmm1, %eax
3810; AVX512VL-NEXT:    shll $16, %eax
3811; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
3812; AVX512VL-NEXT:    vmovd %xmm1, %ecx
3813; AVX512VL-NEXT:    movzwl %cx, %ecx
3814; AVX512VL-NEXT:    orl %eax, %ecx
3815; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3816; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3817; AVX512VL-NEXT:    vmovd %xmm1, %eax
3818; AVX512VL-NEXT:    shll $16, %eax
3819; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3820; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3821; AVX512VL-NEXT:    vmovd %xmm0, %edx
3822; AVX512VL-NEXT:    movzwl %dx, %edx
3823; AVX512VL-NEXT:    orl %eax, %edx
3824; AVX512VL-NEXT:    shlq $32, %rdx
3825; AVX512VL-NEXT:    orq %rcx, %rdx
3826; AVX512VL-NEXT:    vmovq %rdx, %xmm0
3827; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3828; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
3829; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
3830; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3831; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
3832; AVX512VL-NEXT:    vmovdqa %xmm0, (%rdi)
3833; AVX512VL-NEXT:    retq
3834  %1 = fptrunc <4 x float> %a0 to <4 x half>
3835  %2 = bitcast <4 x half> %1 to <4 x i16>
3836  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3837  store <8 x i16> %3, <8 x i16>* %a1
3838  ret void
3839}
3840
3841define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
3842; AVX1-LABEL: store_cvt_8f32_to_8i16:
3843; AVX1:       # BB#0:
3844; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3845; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3846; AVX1-NEXT:    vmovd %xmm1, %r8d
3847; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3848; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3849; AVX1-NEXT:    vmovd %xmm1, %r9d
3850; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3851; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3852; AVX1-NEXT:    vmovd %xmm1, %r10d
3853; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3854; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3855; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3856; AVX1-NEXT:    vmovd %xmm2, %r11d
3857; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
3858; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3859; AVX1-NEXT:    vmovd %xmm2, %eax
3860; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
3861; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3862; AVX1-NEXT:    vmovd %xmm2, %ecx
3863; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3864; AVX1-NEXT:    vmovd %xmm0, %edx
3865; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
3866; AVX1-NEXT:    vmovd %xmm0, %esi
3867; AVX1-NEXT:    movw %si, 8(%rdi)
3868; AVX1-NEXT:    movw %dx, (%rdi)
3869; AVX1-NEXT:    movw %cx, 14(%rdi)
3870; AVX1-NEXT:    movw %ax, 12(%rdi)
3871; AVX1-NEXT:    movw %r11w, 10(%rdi)
3872; AVX1-NEXT:    movw %r10w, 6(%rdi)
3873; AVX1-NEXT:    movw %r9w, 4(%rdi)
3874; AVX1-NEXT:    movw %r8w, 2(%rdi)
3875; AVX1-NEXT:    vzeroupper
3876; AVX1-NEXT:    retq
3877;
3878; AVX2-LABEL: store_cvt_8f32_to_8i16:
3879; AVX2:       # BB#0:
3880; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3881; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3882; AVX2-NEXT:    vmovd %xmm1, %r8d
3883; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3884; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3885; AVX2-NEXT:    vmovd %xmm1, %r9d
3886; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3887; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3888; AVX2-NEXT:    vmovd %xmm1, %r10d
3889; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
3890; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3891; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3892; AVX2-NEXT:    vmovd %xmm2, %r11d
3893; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
3894; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3895; AVX2-NEXT:    vmovd %xmm2, %eax
3896; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
3897; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3898; AVX2-NEXT:    vmovd %xmm2, %ecx
3899; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3900; AVX2-NEXT:    vmovd %xmm0, %edx
3901; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
3902; AVX2-NEXT:    vmovd %xmm0, %esi
3903; AVX2-NEXT:    movw %si, 8(%rdi)
3904; AVX2-NEXT:    movw %dx, (%rdi)
3905; AVX2-NEXT:    movw %cx, 14(%rdi)
3906; AVX2-NEXT:    movw %ax, 12(%rdi)
3907; AVX2-NEXT:    movw %r11w, 10(%rdi)
3908; AVX2-NEXT:    movw %r10w, 6(%rdi)
3909; AVX2-NEXT:    movw %r9w, 4(%rdi)
3910; AVX2-NEXT:    movw %r8w, 2(%rdi)
3911; AVX2-NEXT:    vzeroupper
3912; AVX2-NEXT:    retq
3913;
3914; AVX512F-LABEL: store_cvt_8f32_to_8i16:
3915; AVX512F:       # BB#0:
3916; AVX512F-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
3917; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3918; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3919; AVX512F-NEXT:    vmovd %xmm1, %r8d
3920; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3921; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3922; AVX512F-NEXT:    vmovd %xmm1, %r9d
3923; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3924; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
3925; AVX512F-NEXT:    vmovd %xmm1, %r10d
3926; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm1
3927; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3928; AVX512F-NEXT:    vcvtps2ph $4, %zmm2, %ymm2
3929; AVX512F-NEXT:    vmovd %xmm2, %r11d
3930; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
3931; AVX512F-NEXT:    vcvtps2ph $4, %zmm2, %ymm2
3932; AVX512F-NEXT:    vmovd %xmm2, %eax
3933; AVX512F-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
3934; AVX512F-NEXT:    vcvtps2ph $4, %zmm2, %ymm2
3935; AVX512F-NEXT:    vmovd %xmm2, %ecx
3936; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
3937; AVX512F-NEXT:    vmovd %xmm0, %edx
3938; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm0
3939; AVX512F-NEXT:    vmovd %xmm0, %esi
3940; AVX512F-NEXT:    movw %si, 8(%rdi)
3941; AVX512F-NEXT:    movw %dx, (%rdi)
3942; AVX512F-NEXT:    movw %cx, 14(%rdi)
3943; AVX512F-NEXT:    movw %ax, 12(%rdi)
3944; AVX512F-NEXT:    movw %r11w, 10(%rdi)
3945; AVX512F-NEXT:    movw %r10w, 6(%rdi)
3946; AVX512F-NEXT:    movw %r9w, 4(%rdi)
3947; AVX512F-NEXT:    movw %r8w, 2(%rdi)
3948; AVX512F-NEXT:    retq
3949;
3950; AVX512VL-LABEL: store_cvt_8f32_to_8i16:
3951; AVX512VL:       # BB#0:
3952; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3953; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3954; AVX512VL-NEXT:    vmovd %xmm1, %r8d
3955; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3956; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3957; AVX512VL-NEXT:    vmovd %xmm1, %r9d
3958; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3959; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
3960; AVX512VL-NEXT:    vmovd %xmm1, %r10d
3961; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
3962; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3963; AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3964; AVX512VL-NEXT:    vmovd %xmm2, %r11d
3965; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
3966; AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3967; AVX512VL-NEXT:    vmovd %xmm2, %eax
3968; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
3969; AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
3970; AVX512VL-NEXT:    vmovd %xmm2, %ecx
3971; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
3972; AVX512VL-NEXT:    vmovd %xmm0, %edx
3973; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
3974; AVX512VL-NEXT:    vmovd %xmm0, %esi
3975; AVX512VL-NEXT:    movw %si, 8(%rdi)
3976; AVX512VL-NEXT:    movw %dx, (%rdi)
3977; AVX512VL-NEXT:    movw %cx, 14(%rdi)
3978; AVX512VL-NEXT:    movw %ax, 12(%rdi)
3979; AVX512VL-NEXT:    movw %r11w, 10(%rdi)
3980; AVX512VL-NEXT:    movw %r10w, 6(%rdi)
3981; AVX512VL-NEXT:    movw %r9w, 4(%rdi)
3982; AVX512VL-NEXT:    movw %r8w, 2(%rdi)
3983; AVX512VL-NEXT:    retq
3984  %1 = fptrunc <8 x float> %a0 to <8 x half>
3985  %2 = bitcast <8 x half> %1 to <8 x i16>
3986  store <8 x i16> %2, <8 x i16>* %a1
3987  ret void
3988}
3989
3990define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwind {
3991; AVX1-LABEL: store_cvt_16f32_to_16i16:
3992; AVX1:       # BB#0:
3993; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3994; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3995; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
3996; AVX1-NEXT:    vmovd %xmm4, %eax
3997; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
3998; AVX1-NEXT:    movw %ax, 24(%rdi)
3999; AVX1-NEXT:    vmovd %xmm4, %eax
4000; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
4001; AVX1-NEXT:    movw %ax, 16(%rdi)
4002; AVX1-NEXT:    vmovd %xmm4, %eax
4003; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
4004; AVX1-NEXT:    movw %ax, 8(%rdi)
4005; AVX1-NEXT:    vmovd %xmm4, %eax
4006; AVX1-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
4007; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
4008; AVX1-NEXT:    movw %ax, (%rdi)
4009; AVX1-NEXT:    vmovd %xmm4, %eax
4010; AVX1-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
4011; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
4012; AVX1-NEXT:    movw %ax, 30(%rdi)
4013; AVX1-NEXT:    vmovd %xmm4, %eax
4014; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
4015; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
4016; AVX1-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
4017; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
4018; AVX1-NEXT:    movw %ax, 28(%rdi)
4019; AVX1-NEXT:    vmovd %xmm3, %eax
4020; AVX1-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
4021; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
4022; AVX1-NEXT:    movw %ax, 26(%rdi)
4023; AVX1-NEXT:    vmovd %xmm3, %eax
4024; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
4025; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
4026; AVX1-NEXT:    movw %ax, 22(%rdi)
4027; AVX1-NEXT:    vmovd %xmm3, %eax
4028; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
4029; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
4030; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
4031; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
4032; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
4033; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
4034; AVX1-NEXT:    movw %ax, 20(%rdi)
4035; AVX1-NEXT:    vmovd %xmm1, %eax
4036; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
4037; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
4038; AVX1-NEXT:    movw %ax, 18(%rdi)
4039; AVX1-NEXT:    vmovd %xmm1, %eax
4040; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
4041; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
4042; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
4043; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
4044; AVX1-NEXT:    movw %ax, 14(%rdi)
4045; AVX1-NEXT:    vmovd %xmm2, %eax
4046; AVX1-NEXT:    movw %ax, 12(%rdi)
4047; AVX1-NEXT:    vmovd %xmm1, %eax
4048; AVX1-NEXT:    movw %ax, 10(%rdi)
4049; AVX1-NEXT:    vmovd %xmm0, %eax
4050; AVX1-NEXT:    movw %ax, 6(%rdi)
4051; AVX1-NEXT:    vmovd %xmm3, %eax
4052; AVX1-NEXT:    movw %ax, 4(%rdi)
4053; AVX1-NEXT:    vmovd %xmm4, %eax
4054; AVX1-NEXT:    movw %ax, 2(%rdi)
4055; AVX1-NEXT:    vzeroupper
4056; AVX1-NEXT:    retq
4057;
4058; AVX2-LABEL: store_cvt_16f32_to_16i16:
4059; AVX2:       # BB#0:
4060; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
4061; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm3
4062; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
4063; AVX2-NEXT:    vmovd %xmm4, %eax
4064; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
4065; AVX2-NEXT:    movw %ax, 24(%rdi)
4066; AVX2-NEXT:    vmovd %xmm4, %eax
4067; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
4068; AVX2-NEXT:    movw %ax, 16(%rdi)
4069; AVX2-NEXT:    vmovd %xmm4, %eax
4070; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
4071; AVX2-NEXT:    movw %ax, 8(%rdi)
4072; AVX2-NEXT:    vmovd %xmm4, %eax
4073; AVX2-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
4074; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
4075; AVX2-NEXT:    movw %ax, (%rdi)
4076; AVX2-NEXT:    vmovd %xmm4, %eax
4077; AVX2-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
4078; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
4079; AVX2-NEXT:    movw %ax, 30(%rdi)
4080; AVX2-NEXT:    vmovd %xmm4, %eax
4081; AVX2-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
4082; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
4083; AVX2-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
4084; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
4085; AVX2-NEXT:    movw %ax, 28(%rdi)
4086; AVX2-NEXT:    vmovd %xmm3, %eax
4087; AVX2-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
4088; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
4089; AVX2-NEXT:    movw %ax, 26(%rdi)
4090; AVX2-NEXT:    vmovd %xmm3, %eax
4091; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
4092; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
4093; AVX2-NEXT:    movw %ax, 22(%rdi)
4094; AVX2-NEXT:    vmovd %xmm3, %eax
4095; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
4096; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
4097; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
4098; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
4099; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
4100; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
4101; AVX2-NEXT:    movw %ax, 20(%rdi)
4102; AVX2-NEXT:    vmovd %xmm1, %eax
4103; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
4104; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
4105; AVX2-NEXT:    movw %ax, 18(%rdi)
4106; AVX2-NEXT:    vmovd %xmm1, %eax
4107; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
4108; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
4109; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
4110; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
4111; AVX2-NEXT:    movw %ax, 14(%rdi)
4112; AVX2-NEXT:    vmovd %xmm2, %eax
4113; AVX2-NEXT:    movw %ax, 12(%rdi)
4114; AVX2-NEXT:    vmovd %xmm1, %eax
4115; AVX2-NEXT:    movw %ax, 10(%rdi)
4116; AVX2-NEXT:    vmovd %xmm0, %eax
4117; AVX2-NEXT:    movw %ax, 6(%rdi)
4118; AVX2-NEXT:    vmovd %xmm3, %eax
4119; AVX2-NEXT:    movw %ax, 4(%rdi)
4120; AVX2-NEXT:    vmovd %xmm4, %eax
4121; AVX2-NEXT:    movw %ax, 2(%rdi)
4122; AVX2-NEXT:    vzeroupper
4123; AVX2-NEXT:    retq
4124;
4125; AVX512F-LABEL: store_cvt_16f32_to_16i16:
4126; AVX512F:       # BB#0:
4127; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm1
4128; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
4129; AVX512F-NEXT:    vextractf128 $1, %ymm2, %xmm3
4130; AVX512F-NEXT:    vcvtps2ph $4, %zmm3, %ymm4
4131; AVX512F-NEXT:    vmovd %xmm4, %eax
4132; AVX512F-NEXT:    vcvtps2ph $4, %zmm2, %ymm4
4133; AVX512F-NEXT:    movw %ax, 24(%rdi)
4134; AVX512F-NEXT:    vmovd %xmm4, %eax
4135; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm4
4136; AVX512F-NEXT:    movw %ax, 16(%rdi)
4137; AVX512F-NEXT:    vmovd %xmm4, %eax
4138; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm4
4139; AVX512F-NEXT:    movw %ax, 8(%rdi)
4140; AVX512F-NEXT:    vmovd %xmm4, %eax
4141; AVX512F-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
4142; AVX512F-NEXT:    vcvtps2ph $4, %zmm4, %ymm4
4143; AVX512F-NEXT:    movw %ax, (%rdi)
4144; AVX512F-NEXT:    vmovd %xmm4, %eax
4145; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
4146; AVX512F-NEXT:    vcvtps2ph $4, %zmm4, %ymm4
4147; AVX512F-NEXT:    movw %ax, 30(%rdi)
4148; AVX512F-NEXT:    vmovd %xmm4, %eax
4149; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
4150; AVX512F-NEXT:    vcvtps2ph $4, %zmm4, %ymm4
4151; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
4152; AVX512F-NEXT:    vcvtps2ph $4, %zmm3, %ymm3
4153; AVX512F-NEXT:    movw %ax, 28(%rdi)
4154; AVX512F-NEXT:    vmovd %xmm3, %eax
4155; AVX512F-NEXT:    vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
4156; AVX512F-NEXT:    vcvtps2ph $4, %zmm3, %ymm3
4157; AVX512F-NEXT:    movw %ax, 26(%rdi)
4158; AVX512F-NEXT:    vmovd %xmm3, %eax
4159; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
4160; AVX512F-NEXT:    vcvtps2ph $4, %zmm3, %ymm3
4161; AVX512F-NEXT:    movw %ax, 22(%rdi)
4162; AVX512F-NEXT:    vmovd %xmm3, %eax
4163; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
4164; AVX512F-NEXT:    vcvtps2ph $4, %zmm3, %ymm3
4165; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
4166; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
4167; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
4168; AVX512F-NEXT:    vcvtps2ph $4, %zmm2, %ymm2
4169; AVX512F-NEXT:    movw %ax, 20(%rdi)
4170; AVX512F-NEXT:    vmovd %xmm2, %eax
4171; AVX512F-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
4172; AVX512F-NEXT:    vcvtps2ph $4, %zmm2, %ymm2
4173; AVX512F-NEXT:    movw %ax, 18(%rdi)
4174; AVX512F-NEXT:    vmovd %xmm2, %eax
4175; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
4176; AVX512F-NEXT:    vcvtps2ph $4, %zmm2, %ymm2
4177; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
4178; AVX512F-NEXT:    vcvtps2ph $4, %zmm1, %ymm1
4179; AVX512F-NEXT:    movw %ax, 14(%rdi)
4180; AVX512F-NEXT:    vmovd %xmm1, %eax
4181; AVX512F-NEXT:    movw %ax, 12(%rdi)
4182; AVX512F-NEXT:    vmovd %xmm2, %eax
4183; AVX512F-NEXT:    movw %ax, 10(%rdi)
4184; AVX512F-NEXT:    vmovd %xmm0, %eax
4185; AVX512F-NEXT:    movw %ax, 6(%rdi)
4186; AVX512F-NEXT:    vmovd %xmm3, %eax
4187; AVX512F-NEXT:    movw %ax, 4(%rdi)
4188; AVX512F-NEXT:    vmovd %xmm4, %eax
4189; AVX512F-NEXT:    movw %ax, 2(%rdi)
4190; AVX512F-NEXT:    retq
4191;
4192; AVX512VL-LABEL: store_cvt_16f32_to_16i16:
4193; AVX512VL:       # BB#0:
4194; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
4195; AVX512VL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
4196; AVX512VL-NEXT:    vextractf128 $1, %ymm2, %xmm3
4197; AVX512VL-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
4198; AVX512VL-NEXT:    vmovd %xmm4, %eax
4199; AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
4200; AVX512VL-NEXT:    movw %ax, 24(%rdi)
4201; AVX512VL-NEXT:    vmovd %xmm4, %eax
4202; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
4203; AVX512VL-NEXT:    movw %ax, 16(%rdi)
4204; AVX512VL-NEXT:    vmovd %xmm4, %eax
4205; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
4206; AVX512VL-NEXT:    movw %ax, 8(%rdi)
4207; AVX512VL-NEXT:    vmovd %xmm4, %eax
4208; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
4209; AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
4210; AVX512VL-NEXT:    movw %ax, (%rdi)
4211; AVX512VL-NEXT:    vmovd %xmm4, %eax
4212; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
4213; AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
4214; AVX512VL-NEXT:    movw %ax, 30(%rdi)
4215; AVX512VL-NEXT:    vmovd %xmm4, %eax
4216; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
4217; AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
4218; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
4219; AVX512VL-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
4220; AVX512VL-NEXT:    movw %ax, 28(%rdi)
4221; AVX512VL-NEXT:    vmovd %xmm3, %eax
4222; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
4223; AVX512VL-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
4224; AVX512VL-NEXT:    movw %ax, 26(%rdi)
4225; AVX512VL-NEXT:    vmovd %xmm3, %eax
4226; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
4227; AVX512VL-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
4228; AVX512VL-NEXT:    movw %ax, 22(%rdi)
4229; AVX512VL-NEXT:    vmovd %xmm3, %eax
4230; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
4231; AVX512VL-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
4232; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
4233; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
4234; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
4235; AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
4236; AVX512VL-NEXT:    movw %ax, 20(%rdi)
4237; AVX512VL-NEXT:    vmovd %xmm2, %eax
4238; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
4239; AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
4240; AVX512VL-NEXT:    movw %ax, 18(%rdi)
4241; AVX512VL-NEXT:    vmovd %xmm2, %eax
4242; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
4243; AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
4244; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
4245; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
4246; AVX512VL-NEXT:    movw %ax, 14(%rdi)
4247; AVX512VL-NEXT:    vmovd %xmm1, %eax
4248; AVX512VL-NEXT:    movw %ax, 12(%rdi)
4249; AVX512VL-NEXT:    vmovd %xmm2, %eax
4250; AVX512VL-NEXT:    movw %ax, 10(%rdi)
4251; AVX512VL-NEXT:    vmovd %xmm0, %eax
4252; AVX512VL-NEXT:    movw %ax, 6(%rdi)
4253; AVX512VL-NEXT:    vmovd %xmm3, %eax
4254; AVX512VL-NEXT:    movw %ax, 4(%rdi)
4255; AVX512VL-NEXT:    vmovd %xmm4, %eax
4256; AVX512VL-NEXT:    movw %ax, 2(%rdi)
4257; AVX512VL-NEXT:    retq
4258  %1 = fptrunc <16 x float> %a0 to <16 x half>
4259  %2 = bitcast <16 x half> %1 to <16 x i16>
4260  store <16 x i16> %2, <16 x i16>* %a1
4261  ret void
4262}
4263
4264;
4265; Double to Half
4266;
4267
4268define i16 @cvt_f64_to_i16(double %a0) nounwind {
4269; ALL-LABEL: cvt_f64_to_i16:
4270; ALL:       # BB#0:
4271; ALL-NEXT:    jmp __truncdfhf2 # TAILCALL
4272  %1 = fptrunc double %a0 to half
4273  %2 = bitcast half %1 to i16
4274  ret i16 %2
4275}
4276
4277define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
4278; ALL-LABEL: cvt_2f64_to_2i16:
4279; ALL:       # BB#0:
4280; ALL-NEXT:    pushq %rbx
4281; ALL-NEXT:    subq $16, %rsp
4282; ALL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4283; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4284; ALL-NEXT:    callq __truncdfhf2
4285; ALL-NEXT:    movw %ax, %bx
4286; ALL-NEXT:    shll $16, %ebx
4287; ALL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4288; ALL-NEXT:    callq __truncdfhf2
4289; ALL-NEXT:    movzwl %ax, %eax
4290; ALL-NEXT:    orl %ebx, %eax
4291; ALL-NEXT:    vmovd %eax, %xmm0
4292; ALL-NEXT:    addq $16, %rsp
4293; ALL-NEXT:    popq %rbx
4294; ALL-NEXT:    retq
4295  %1 = fptrunc <2 x double> %a0 to <2 x half>
4296  %2 = bitcast <2 x half> %1 to <2 x i16>
4297  ret <2 x i16> %2
4298}
4299
4300define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
4301; AVX1-LABEL: cvt_4f64_to_4i16:
4302; AVX1:       # BB#0:
4303; AVX1-NEXT:    pushq %r14
4304; AVX1-NEXT:    pushq %rbx
4305; AVX1-NEXT:    subq $40, %rsp
4306; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4307; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4308; AVX1-NEXT:    vzeroupper
4309; AVX1-NEXT:    callq __truncdfhf2
4310; AVX1-NEXT:    movw %ax, %bx
4311; AVX1-NEXT:    shll $16, %ebx
4312; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4313; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4314; AVX1-NEXT:    vzeroupper
4315; AVX1-NEXT:    callq __truncdfhf2
4316; AVX1-NEXT:    movzwl %ax, %r14d
4317; AVX1-NEXT:    orl %ebx, %r14d
4318; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
4319; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4320; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4321; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4322; AVX1-NEXT:    vzeroupper
4323; AVX1-NEXT:    callq __truncdfhf2
4324; AVX1-NEXT:    movw %ax, %bx
4325; AVX1-NEXT:    shll $16, %ebx
4326; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4327; AVX1-NEXT:    callq __truncdfhf2
4328; AVX1-NEXT:    movzwl %ax, %eax
4329; AVX1-NEXT:    orl %ebx, %eax
4330; AVX1-NEXT:    shlq $32, %rax
4331; AVX1-NEXT:    orq %r14, %rax
4332; AVX1-NEXT:    vmovq %rax, %xmm0
4333; AVX1-NEXT:    addq $40, %rsp
4334; AVX1-NEXT:    popq %rbx
4335; AVX1-NEXT:    popq %r14
4336; AVX1-NEXT:    retq
4337;
4338; AVX2-LABEL: cvt_4f64_to_4i16:
4339; AVX2:       # BB#0:
4340; AVX2-NEXT:    pushq %r14
4341; AVX2-NEXT:    pushq %rbx
4342; AVX2-NEXT:    subq $40, %rsp
4343; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4344; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4345; AVX2-NEXT:    vzeroupper
4346; AVX2-NEXT:    callq __truncdfhf2
4347; AVX2-NEXT:    movw %ax, %bx
4348; AVX2-NEXT:    shll $16, %ebx
4349; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4350; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4351; AVX2-NEXT:    vzeroupper
4352; AVX2-NEXT:    callq __truncdfhf2
4353; AVX2-NEXT:    movzwl %ax, %r14d
4354; AVX2-NEXT:    orl %ebx, %r14d
4355; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
4356; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
4357; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4358; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4359; AVX2-NEXT:    vzeroupper
4360; AVX2-NEXT:    callq __truncdfhf2
4361; AVX2-NEXT:    movw %ax, %bx
4362; AVX2-NEXT:    shll $16, %ebx
4363; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4364; AVX2-NEXT:    callq __truncdfhf2
4365; AVX2-NEXT:    movzwl %ax, %eax
4366; AVX2-NEXT:    orl %ebx, %eax
4367; AVX2-NEXT:    shlq $32, %rax
4368; AVX2-NEXT:    orq %r14, %rax
4369; AVX2-NEXT:    vmovq %rax, %xmm0
4370; AVX2-NEXT:    addq $40, %rsp
4371; AVX2-NEXT:    popq %rbx
4372; AVX2-NEXT:    popq %r14
4373; AVX2-NEXT:    retq
4374;
4375; AVX512F-LABEL: cvt_4f64_to_4i16:
4376; AVX512F:       # BB#0:
4377; AVX512F-NEXT:    pushq %r14
4378; AVX512F-NEXT:    pushq %rbx
4379; AVX512F-NEXT:    subq $40, %rsp
4380; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4381; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4382; AVX512F-NEXT:    callq __truncdfhf2
4383; AVX512F-NEXT:    movw %ax, %bx
4384; AVX512F-NEXT:    shll $16, %ebx
4385; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4386; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4387; AVX512F-NEXT:    callq __truncdfhf2
4388; AVX512F-NEXT:    movzwl %ax, %r14d
4389; AVX512F-NEXT:    orl %ebx, %r14d
4390; AVX512F-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
4391; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
4392; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4393; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4394; AVX512F-NEXT:    callq __truncdfhf2
4395; AVX512F-NEXT:    movw %ax, %bx
4396; AVX512F-NEXT:    shll $16, %ebx
4397; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4398; AVX512F-NEXT:    callq __truncdfhf2
4399; AVX512F-NEXT:    movzwl %ax, %eax
4400; AVX512F-NEXT:    orl %ebx, %eax
4401; AVX512F-NEXT:    shlq $32, %rax
4402; AVX512F-NEXT:    orq %r14, %rax
4403; AVX512F-NEXT:    vmovq %rax, %xmm0
4404; AVX512F-NEXT:    addq $40, %rsp
4405; AVX512F-NEXT:    popq %rbx
4406; AVX512F-NEXT:    popq %r14
4407; AVX512F-NEXT:    retq
4408;
4409; AVX512VL-LABEL: cvt_4f64_to_4i16:
4410; AVX512VL:       # BB#0:
4411; AVX512VL-NEXT:    pushq %r14
4412; AVX512VL-NEXT:    pushq %rbx
4413; AVX512VL-NEXT:    subq $40, %rsp
4414; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4415; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4416; AVX512VL-NEXT:    callq __truncdfhf2
4417; AVX512VL-NEXT:    movw %ax, %bx
4418; AVX512VL-NEXT:    shll $16, %ebx
4419; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4420; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4421; AVX512VL-NEXT:    callq __truncdfhf2
4422; AVX512VL-NEXT:    movzwl %ax, %r14d
4423; AVX512VL-NEXT:    orl %ebx, %r14d
4424; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4425; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
4426; AVX512VL-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
4427; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4428; AVX512VL-NEXT:    callq __truncdfhf2
4429; AVX512VL-NEXT:    movw %ax, %bx
4430; AVX512VL-NEXT:    shll $16, %ebx
4431; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4432; AVX512VL-NEXT:    callq __truncdfhf2
4433; AVX512VL-NEXT:    movzwl %ax, %eax
4434; AVX512VL-NEXT:    orl %ebx, %eax
4435; AVX512VL-NEXT:    shlq $32, %rax
4436; AVX512VL-NEXT:    orq %r14, %rax
4437; AVX512VL-NEXT:    vmovq %rax, %xmm0
4438; AVX512VL-NEXT:    addq $40, %rsp
4439; AVX512VL-NEXT:    popq %rbx
4440; AVX512VL-NEXT:    popq %r14
4441; AVX512VL-NEXT:    retq
4442  %1 = fptrunc <4 x double> %a0 to <4 x half>
4443  %2 = bitcast <4 x half> %1 to <4 x i16>
4444  ret <4 x i16> %2
4445}
4446
4447define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
4448; AVX1-LABEL: cvt_4f64_to_8i16_undef:
4449; AVX1:       # BB#0:
4450; AVX1-NEXT:    pushq %r14
4451; AVX1-NEXT:    pushq %rbx
4452; AVX1-NEXT:    subq $40, %rsp
4453; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4454; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4455; AVX1-NEXT:    vzeroupper
4456; AVX1-NEXT:    callq __truncdfhf2
4457; AVX1-NEXT:    movw %ax, %bx
4458; AVX1-NEXT:    shll $16, %ebx
4459; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4460; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4461; AVX1-NEXT:    vzeroupper
4462; AVX1-NEXT:    callq __truncdfhf2
4463; AVX1-NEXT:    movzwl %ax, %r14d
4464; AVX1-NEXT:    orl %ebx, %r14d
4465; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
4466; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4467; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4468; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4469; AVX1-NEXT:    vzeroupper
4470; AVX1-NEXT:    callq __truncdfhf2
4471; AVX1-NEXT:    movw %ax, %bx
4472; AVX1-NEXT:    shll $16, %ebx
4473; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4474; AVX1-NEXT:    callq __truncdfhf2
4475; AVX1-NEXT:    movzwl %ax, %eax
4476; AVX1-NEXT:    orl %ebx, %eax
4477; AVX1-NEXT:    shlq $32, %rax
4478; AVX1-NEXT:    orq %r14, %rax
4479; AVX1-NEXT:    vmovq %rax, %xmm0
4480; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4481; AVX1-NEXT:    addq $40, %rsp
4482; AVX1-NEXT:    popq %rbx
4483; AVX1-NEXT:    popq %r14
4484; AVX1-NEXT:    retq
4485;
4486; AVX2-LABEL: cvt_4f64_to_8i16_undef:
4487; AVX2:       # BB#0:
4488; AVX2-NEXT:    pushq %r14
4489; AVX2-NEXT:    pushq %rbx
4490; AVX2-NEXT:    subq $40, %rsp
4491; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4492; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4493; AVX2-NEXT:    vzeroupper
4494; AVX2-NEXT:    callq __truncdfhf2
4495; AVX2-NEXT:    movw %ax, %bx
4496; AVX2-NEXT:    shll $16, %ebx
4497; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4498; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4499; AVX2-NEXT:    vzeroupper
4500; AVX2-NEXT:    callq __truncdfhf2
4501; AVX2-NEXT:    movzwl %ax, %r14d
4502; AVX2-NEXT:    orl %ebx, %r14d
4503; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
4504; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
4505; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4506; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4507; AVX2-NEXT:    vzeroupper
4508; AVX2-NEXT:    callq __truncdfhf2
4509; AVX2-NEXT:    movw %ax, %bx
4510; AVX2-NEXT:    shll $16, %ebx
4511; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4512; AVX2-NEXT:    callq __truncdfhf2
4513; AVX2-NEXT:    movzwl %ax, %eax
4514; AVX2-NEXT:    orl %ebx, %eax
4515; AVX2-NEXT:    shlq $32, %rax
4516; AVX2-NEXT:    orq %r14, %rax
4517; AVX2-NEXT:    vmovq %rax, %xmm0
4518; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4519; AVX2-NEXT:    addq $40, %rsp
4520; AVX2-NEXT:    popq %rbx
4521; AVX2-NEXT:    popq %r14
4522; AVX2-NEXT:    retq
4523;
4524; AVX512F-LABEL: cvt_4f64_to_8i16_undef:
4525; AVX512F:       # BB#0:
4526; AVX512F-NEXT:    pushq %r14
4527; AVX512F-NEXT:    pushq %rbx
4528; AVX512F-NEXT:    subq $40, %rsp
4529; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4530; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4531; AVX512F-NEXT:    callq __truncdfhf2
4532; AVX512F-NEXT:    movw %ax, %bx
4533; AVX512F-NEXT:    shll $16, %ebx
4534; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4535; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4536; AVX512F-NEXT:    callq __truncdfhf2
4537; AVX512F-NEXT:    movzwl %ax, %r14d
4538; AVX512F-NEXT:    orl %ebx, %r14d
4539; AVX512F-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
4540; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
4541; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4542; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4543; AVX512F-NEXT:    callq __truncdfhf2
4544; AVX512F-NEXT:    movw %ax, %bx
4545; AVX512F-NEXT:    shll $16, %ebx
4546; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4547; AVX512F-NEXT:    callq __truncdfhf2
4548; AVX512F-NEXT:    movzwl %ax, %eax
4549; AVX512F-NEXT:    orl %ebx, %eax
4550; AVX512F-NEXT:    shlq $32, %rax
4551; AVX512F-NEXT:    orq %r14, %rax
4552; AVX512F-NEXT:    vmovq %rax, %xmm0
4553; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4554; AVX512F-NEXT:    addq $40, %rsp
4555; AVX512F-NEXT:    popq %rbx
4556; AVX512F-NEXT:    popq %r14
4557; AVX512F-NEXT:    retq
4558;
4559; AVX512VL-LABEL: cvt_4f64_to_8i16_undef:
4560; AVX512VL:       # BB#0:
4561; AVX512VL-NEXT:    pushq %r14
4562; AVX512VL-NEXT:    pushq %rbx
4563; AVX512VL-NEXT:    subq $40, %rsp
4564; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4565; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4566; AVX512VL-NEXT:    callq __truncdfhf2
4567; AVX512VL-NEXT:    movw %ax, %bx
4568; AVX512VL-NEXT:    shll $16, %ebx
4569; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4570; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4571; AVX512VL-NEXT:    callq __truncdfhf2
4572; AVX512VL-NEXT:    movzwl %ax, %r14d
4573; AVX512VL-NEXT:    orl %ebx, %r14d
4574; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4575; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
4576; AVX512VL-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
4577; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4578; AVX512VL-NEXT:    callq __truncdfhf2
4579; AVX512VL-NEXT:    movw %ax, %bx
4580; AVX512VL-NEXT:    shll $16, %ebx
4581; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4582; AVX512VL-NEXT:    callq __truncdfhf2
4583; AVX512VL-NEXT:    movzwl %ax, %eax
4584; AVX512VL-NEXT:    orl %ebx, %eax
4585; AVX512VL-NEXT:    shlq $32, %rax
4586; AVX512VL-NEXT:    orq %r14, %rax
4587; AVX512VL-NEXT:    vmovq %rax, %xmm0
4588; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
4589; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
4590; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4591; AVX512VL-NEXT:    addq $40, %rsp
4592; AVX512VL-NEXT:    popq %rbx
4593; AVX512VL-NEXT:    popq %r14
4594; AVX512VL-NEXT:    retq
4595  %1 = fptrunc <4 x double> %a0 to <4 x half>
4596  %2 = bitcast <4 x half> %1 to <4 x i16>
4597  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4598  ret <8 x i16> %3
4599}
4600
4601define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
4602; AVX1-LABEL: cvt_4f64_to_8i16_zero:
4603; AVX1:       # BB#0:
4604; AVX1-NEXT:    pushq %r14
4605; AVX1-NEXT:    pushq %rbx
4606; AVX1-NEXT:    subq $40, %rsp
4607; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4608; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4609; AVX1-NEXT:    vzeroupper
4610; AVX1-NEXT:    callq __truncdfhf2
4611; AVX1-NEXT:    movw %ax, %bx
4612; AVX1-NEXT:    shll $16, %ebx
4613; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4614; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4615; AVX1-NEXT:    vzeroupper
4616; AVX1-NEXT:    callq __truncdfhf2
4617; AVX1-NEXT:    movzwl %ax, %r14d
4618; AVX1-NEXT:    orl %ebx, %r14d
4619; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
4620; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4621; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4622; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4623; AVX1-NEXT:    vzeroupper
4624; AVX1-NEXT:    callq __truncdfhf2
4625; AVX1-NEXT:    movw %ax, %bx
4626; AVX1-NEXT:    shll $16, %ebx
4627; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4628; AVX1-NEXT:    callq __truncdfhf2
4629; AVX1-NEXT:    movzwl %ax, %eax
4630; AVX1-NEXT:    orl %ebx, %eax
4631; AVX1-NEXT:    shlq $32, %rax
4632; AVX1-NEXT:    orq %r14, %rax
4633; AVX1-NEXT:    vmovq %rax, %xmm0
4634; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
4635; AVX1-NEXT:    addq $40, %rsp
4636; AVX1-NEXT:    popq %rbx
4637; AVX1-NEXT:    popq %r14
4638; AVX1-NEXT:    retq
4639;
4640; AVX2-LABEL: cvt_4f64_to_8i16_zero:
4641; AVX2:       # BB#0:
4642; AVX2-NEXT:    pushq %r14
4643; AVX2-NEXT:    pushq %rbx
4644; AVX2-NEXT:    subq $40, %rsp
4645; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4646; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4647; AVX2-NEXT:    vzeroupper
4648; AVX2-NEXT:    callq __truncdfhf2
4649; AVX2-NEXT:    movw %ax, %bx
4650; AVX2-NEXT:    shll $16, %ebx
4651; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4652; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4653; AVX2-NEXT:    vzeroupper
4654; AVX2-NEXT:    callq __truncdfhf2
4655; AVX2-NEXT:    movzwl %ax, %r14d
4656; AVX2-NEXT:    orl %ebx, %r14d
4657; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
4658; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
4659; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4660; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4661; AVX2-NEXT:    vzeroupper
4662; AVX2-NEXT:    callq __truncdfhf2
4663; AVX2-NEXT:    movw %ax, %bx
4664; AVX2-NEXT:    shll $16, %ebx
4665; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4666; AVX2-NEXT:    callq __truncdfhf2
4667; AVX2-NEXT:    movzwl %ax, %eax
4668; AVX2-NEXT:    orl %ebx, %eax
4669; AVX2-NEXT:    shlq $32, %rax
4670; AVX2-NEXT:    orq %r14, %rax
4671; AVX2-NEXT:    vmovq %rax, %xmm0
4672; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
4673; AVX2-NEXT:    addq $40, %rsp
4674; AVX2-NEXT:    popq %rbx
4675; AVX2-NEXT:    popq %r14
4676; AVX2-NEXT:    retq
4677;
4678; AVX512F-LABEL: cvt_4f64_to_8i16_zero:
4679; AVX512F:       # BB#0:
4680; AVX512F-NEXT:    pushq %r14
4681; AVX512F-NEXT:    pushq %rbx
4682; AVX512F-NEXT:    subq $40, %rsp
4683; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4684; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4685; AVX512F-NEXT:    callq __truncdfhf2
4686; AVX512F-NEXT:    movw %ax, %bx
4687; AVX512F-NEXT:    shll $16, %ebx
4688; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4689; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4690; AVX512F-NEXT:    callq __truncdfhf2
4691; AVX512F-NEXT:    movzwl %ax, %r14d
4692; AVX512F-NEXT:    orl %ebx, %r14d
4693; AVX512F-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
4694; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
4695; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4696; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4697; AVX512F-NEXT:    callq __truncdfhf2
4698; AVX512F-NEXT:    movw %ax, %bx
4699; AVX512F-NEXT:    shll $16, %ebx
4700; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4701; AVX512F-NEXT:    callq __truncdfhf2
4702; AVX512F-NEXT:    movzwl %ax, %eax
4703; AVX512F-NEXT:    orl %ebx, %eax
4704; AVX512F-NEXT:    shlq $32, %rax
4705; AVX512F-NEXT:    orq %r14, %rax
4706; AVX512F-NEXT:    vmovq %rax, %xmm0
4707; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
4708; AVX512F-NEXT:    addq $40, %rsp
4709; AVX512F-NEXT:    popq %rbx
4710; AVX512F-NEXT:    popq %r14
4711; AVX512F-NEXT:    retq
4712;
4713; AVX512VL-LABEL: cvt_4f64_to_8i16_zero:
4714; AVX512VL:       # BB#0:
4715; AVX512VL-NEXT:    pushq %r14
4716; AVX512VL-NEXT:    pushq %rbx
4717; AVX512VL-NEXT:    subq $40, %rsp
4718; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4719; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4720; AVX512VL-NEXT:    callq __truncdfhf2
4721; AVX512VL-NEXT:    movw %ax, %bx
4722; AVX512VL-NEXT:    shll $16, %ebx
4723; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4724; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4725; AVX512VL-NEXT:    callq __truncdfhf2
4726; AVX512VL-NEXT:    movzwl %ax, %r14d
4727; AVX512VL-NEXT:    orl %ebx, %r14d
4728; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4729; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
4730; AVX512VL-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
4731; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4732; AVX512VL-NEXT:    callq __truncdfhf2
4733; AVX512VL-NEXT:    movw %ax, %bx
4734; AVX512VL-NEXT:    shll $16, %ebx
4735; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4736; AVX512VL-NEXT:    callq __truncdfhf2
4737; AVX512VL-NEXT:    movzwl %ax, %eax
4738; AVX512VL-NEXT:    orl %ebx, %eax
4739; AVX512VL-NEXT:    shlq $32, %rax
4740; AVX512VL-NEXT:    orq %r14, %rax
4741; AVX512VL-NEXT:    vmovq %rax, %xmm0
4742; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
4743; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
4744; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
4745; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
4746; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
4747; AVX512VL-NEXT:    addq $40, %rsp
4748; AVX512VL-NEXT:    popq %rbx
4749; AVX512VL-NEXT:    popq %r14
4750; AVX512VL-NEXT:    retq
4751  %1 = fptrunc <4 x double> %a0 to <4 x half>
4752  %2 = bitcast <4 x half> %1 to <4 x i16>
4753  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4754  ret <8 x i16> %3
4755}
4756
4757define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
4758; AVX1-LABEL: cvt_8f64_to_8i16:
4759; AVX1:       # BB#0:
4760; AVX1-NEXT:    pushq %r15
4761; AVX1-NEXT:    pushq %r14
4762; AVX1-NEXT:    pushq %rbx
4763; AVX1-NEXT:    subq $64, %rsp
4764; AVX1-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
4765; AVX1-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
4766; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4767; AVX1-NEXT:    vzeroupper
4768; AVX1-NEXT:    callq __truncdfhf2
4769; AVX1-NEXT:    movw %ax, %bx
4770; AVX1-NEXT:    shll $16, %ebx
4771; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
4772; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4773; AVX1-NEXT:    vzeroupper
4774; AVX1-NEXT:    callq __truncdfhf2
4775; AVX1-NEXT:    movzwl %ax, %r15d
4776; AVX1-NEXT:    orl %ebx, %r15d
4777; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
4778; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4779; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
4780; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4781; AVX1-NEXT:    vzeroupper
4782; AVX1-NEXT:    callq __truncdfhf2
4783; AVX1-NEXT:    movw %ax, %bx
4784; AVX1-NEXT:    shll $16, %ebx
4785; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
4786; AVX1-NEXT:    callq __truncdfhf2
4787; AVX1-NEXT:    movzwl %ax, %r14d
4788; AVX1-NEXT:    orl %ebx, %r14d
4789; AVX1-NEXT:    shlq $32, %r14
4790; AVX1-NEXT:    orq %r15, %r14
4791; AVX1-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
4792; AVX1-NEXT:    # xmm0 = mem[1,0]
4793; AVX1-NEXT:    callq __truncdfhf2
4794; AVX1-NEXT:    movw %ax, %bx
4795; AVX1-NEXT:    shll $16, %ebx
4796; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4797; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4798; AVX1-NEXT:    vzeroupper
4799; AVX1-NEXT:    callq __truncdfhf2
4800; AVX1-NEXT:    movzwl %ax, %r15d
4801; AVX1-NEXT:    orl %ebx, %r15d
4802; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
4803; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4804; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4805; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4806; AVX1-NEXT:    vzeroupper
4807; AVX1-NEXT:    callq __truncdfhf2
4808; AVX1-NEXT:    movw %ax, %bx
4809; AVX1-NEXT:    shll $16, %ebx
4810; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4811; AVX1-NEXT:    callq __truncdfhf2
4812; AVX1-NEXT:    movzwl %ax, %eax
4813; AVX1-NEXT:    orl %ebx, %eax
4814; AVX1-NEXT:    shlq $32, %rax
4815; AVX1-NEXT:    orq %r15, %rax
4816; AVX1-NEXT:    vmovq %rax, %xmm0
4817; AVX1-NEXT:    vmovq %r14, %xmm1
4818; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
4819; AVX1-NEXT:    addq $64, %rsp
4820; AVX1-NEXT:    popq %rbx
4821; AVX1-NEXT:    popq %r14
4822; AVX1-NEXT:    popq %r15
4823; AVX1-NEXT:    retq
4824;
4825; AVX2-LABEL: cvt_8f64_to_8i16:
4826; AVX2:       # BB#0:
4827; AVX2-NEXT:    pushq %r15
4828; AVX2-NEXT:    pushq %r14
4829; AVX2-NEXT:    pushq %rbx
4830; AVX2-NEXT:    subq $64, %rsp
4831; AVX2-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
4832; AVX2-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
4833; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4834; AVX2-NEXT:    vzeroupper
4835; AVX2-NEXT:    callq __truncdfhf2
4836; AVX2-NEXT:    movw %ax, %bx
4837; AVX2-NEXT:    shll $16, %ebx
4838; AVX2-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
4839; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4840; AVX2-NEXT:    vzeroupper
4841; AVX2-NEXT:    callq __truncdfhf2
4842; AVX2-NEXT:    movzwl %ax, %r15d
4843; AVX2-NEXT:    orl %ebx, %r15d
4844; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
4845; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
4846; AVX2-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
4847; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4848; AVX2-NEXT:    vzeroupper
4849; AVX2-NEXT:    callq __truncdfhf2
4850; AVX2-NEXT:    movw %ax, %bx
4851; AVX2-NEXT:    shll $16, %ebx
4852; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
4853; AVX2-NEXT:    callq __truncdfhf2
4854; AVX2-NEXT:    movzwl %ax, %r14d
4855; AVX2-NEXT:    orl %ebx, %r14d
4856; AVX2-NEXT:    shlq $32, %r14
4857; AVX2-NEXT:    orq %r15, %r14
4858; AVX2-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
4859; AVX2-NEXT:    # xmm0 = mem[1,0]
4860; AVX2-NEXT:    callq __truncdfhf2
4861; AVX2-NEXT:    movw %ax, %bx
4862; AVX2-NEXT:    shll $16, %ebx
4863; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4864; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4865; AVX2-NEXT:    vzeroupper
4866; AVX2-NEXT:    callq __truncdfhf2
4867; AVX2-NEXT:    movzwl %ax, %r15d
4868; AVX2-NEXT:    orl %ebx, %r15d
4869; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
4870; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
4871; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4872; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4873; AVX2-NEXT:    vzeroupper
4874; AVX2-NEXT:    callq __truncdfhf2
4875; AVX2-NEXT:    movw %ax, %bx
4876; AVX2-NEXT:    shll $16, %ebx
4877; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4878; AVX2-NEXT:    callq __truncdfhf2
4879; AVX2-NEXT:    movzwl %ax, %eax
4880; AVX2-NEXT:    orl %ebx, %eax
4881; AVX2-NEXT:    shlq $32, %rax
4882; AVX2-NEXT:    orq %r15, %rax
4883; AVX2-NEXT:    vmovq %rax, %xmm0
4884; AVX2-NEXT:    vmovq %r14, %xmm1
4885; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
4886; AVX2-NEXT:    addq $64, %rsp
4887; AVX2-NEXT:    popq %rbx
4888; AVX2-NEXT:    popq %r14
4889; AVX2-NEXT:    popq %r15
4890; AVX2-NEXT:    retq
4891;
4892; AVX512F-LABEL: cvt_8f64_to_8i16:
4893; AVX512F:       # BB#0:
4894; AVX512F-NEXT:    pushq %r15
4895; AVX512F-NEXT:    pushq %r14
4896; AVX512F-NEXT:    pushq %rbx
4897; AVX512F-NEXT:    subq $96, %rsp
4898; AVX512F-NEXT:    vmovupd %zmm0, (%rsp) # 64-byte Spill
4899; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4900; AVX512F-NEXT:    callq __truncdfhf2
4901; AVX512F-NEXT:    movw %ax, %bx
4902; AVX512F-NEXT:    shll $16, %ebx
4903; AVX512F-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
4904; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
4905; AVX512F-NEXT:    callq __truncdfhf2
4906; AVX512F-NEXT:    movzwl %ax, %r15d
4907; AVX512F-NEXT:    orl %ebx, %r15d
4908; AVX512F-NEXT:    vmovupd (%rsp), %zmm0 # 64-byte Reload
4909; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
4910; AVX512F-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
4911; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4912; AVX512F-NEXT:    callq __truncdfhf2
4913; AVX512F-NEXT:    movw %ax, %bx
4914; AVX512F-NEXT:    shll $16, %ebx
4915; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
4916; AVX512F-NEXT:    callq __truncdfhf2
4917; AVX512F-NEXT:    movzwl %ax, %r14d
4918; AVX512F-NEXT:    orl %ebx, %r14d
4919; AVX512F-NEXT:    shlq $32, %r14
4920; AVX512F-NEXT:    orq %r15, %r14
4921; AVX512F-NEXT:    vmovupd (%rsp), %zmm0 # 64-byte Reload
4922; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
4923; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4924; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4925; AVX512F-NEXT:    callq __truncdfhf2
4926; AVX512F-NEXT:    movw %ax, %bx
4927; AVX512F-NEXT:    shll $16, %ebx
4928; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4929; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4930; AVX512F-NEXT:    callq __truncdfhf2
4931; AVX512F-NEXT:    movzwl %ax, %r15d
4932; AVX512F-NEXT:    orl %ebx, %r15d
4933; AVX512F-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
4934; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
4935; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4936; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4937; AVX512F-NEXT:    callq __truncdfhf2
4938; AVX512F-NEXT:    movw %ax, %bx
4939; AVX512F-NEXT:    shll $16, %ebx
4940; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4941; AVX512F-NEXT:    callq __truncdfhf2
4942; AVX512F-NEXT:    movzwl %ax, %eax
4943; AVX512F-NEXT:    orl %ebx, %eax
4944; AVX512F-NEXT:    shlq $32, %rax
4945; AVX512F-NEXT:    orq %r15, %rax
4946; AVX512F-NEXT:    vmovq %rax, %xmm0
4947; AVX512F-NEXT:    vmovq %r14, %xmm1
4948; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
4949; AVX512F-NEXT:    addq $96, %rsp
4950; AVX512F-NEXT:    popq %rbx
4951; AVX512F-NEXT:    popq %r14
4952; AVX512F-NEXT:    popq %r15
4953; AVX512F-NEXT:    retq
4954;
4955; AVX512VL-LABEL: cvt_8f64_to_8i16:
4956; AVX512VL:       # BB#0:
4957; AVX512VL-NEXT:    pushq %r15
4958; AVX512VL-NEXT:    pushq %r14
4959; AVX512VL-NEXT:    pushq %rbx
4960; AVX512VL-NEXT:    subq $96, %rsp
4961; AVX512VL-NEXT:    vmovupd %zmm0, (%rsp) # 64-byte Spill
4962; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4963; AVX512VL-NEXT:    callq __truncdfhf2
4964; AVX512VL-NEXT:    movw %ax, %bx
4965; AVX512VL-NEXT:    shll $16, %ebx
4966; AVX512VL-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
4967; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
4968; AVX512VL-NEXT:    callq __truncdfhf2
4969; AVX512VL-NEXT:    movzwl %ax, %r15d
4970; AVX512VL-NEXT:    orl %ebx, %r15d
4971; AVX512VL-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
4972; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
4973; AVX512VL-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
4974; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4975; AVX512VL-NEXT:    callq __truncdfhf2
4976; AVX512VL-NEXT:    movw %ax, %bx
4977; AVX512VL-NEXT:    shll $16, %ebx
4978; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
4979; AVX512VL-NEXT:    callq __truncdfhf2
4980; AVX512VL-NEXT:    movzwl %ax, %r14d
4981; AVX512VL-NEXT:    orl %ebx, %r14d
4982; AVX512VL-NEXT:    shlq $32, %r14
4983; AVX512VL-NEXT:    orq %r15, %r14
4984; AVX512VL-NEXT:    vmovupd (%rsp), %zmm0 # 64-byte Reload
4985; AVX512VL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
4986; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4987; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
4988; AVX512VL-NEXT:    callq __truncdfhf2
4989; AVX512VL-NEXT:    movw %ax, %bx
4990; AVX512VL-NEXT:    shll $16, %ebx
4991; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4992; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4993; AVX512VL-NEXT:    callq __truncdfhf2
4994; AVX512VL-NEXT:    movzwl %ax, %r15d
4995; AVX512VL-NEXT:    orl %ebx, %r15d
4996; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4997; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
4998; AVX512VL-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
4999; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5000; AVX512VL-NEXT:    callq __truncdfhf2
5001; AVX512VL-NEXT:    movw %ax, %bx
5002; AVX512VL-NEXT:    shll $16, %ebx
5003; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
5004; AVX512VL-NEXT:    callq __truncdfhf2
5005; AVX512VL-NEXT:    movzwl %ax, %eax
5006; AVX512VL-NEXT:    orl %ebx, %eax
5007; AVX512VL-NEXT:    shlq $32, %rax
5008; AVX512VL-NEXT:    orq %r15, %rax
5009; AVX512VL-NEXT:    vmovq %rax, %xmm0
5010; AVX512VL-NEXT:    vmovq %r14, %xmm1
5011; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
5012; AVX512VL-NEXT:    addq $96, %rsp
5013; AVX512VL-NEXT:    popq %rbx
5014; AVX512VL-NEXT:    popq %r14
5015; AVX512VL-NEXT:    popq %r15
5016; AVX512VL-NEXT:    retq
5017  %1 = fptrunc <8 x double> %a0 to <8 x half>
5018  %2 = bitcast <8 x half> %1 to <8 x i16>
5019  ret <8 x i16> %2
5020}
5021
5022;
5023; Double to Half (Store)
5024;
5025
5026define void @store_cvt_f64_to_i16(double %a0, i16* %a1) nounwind {
5027; ALL-LABEL: store_cvt_f64_to_i16:
5028; ALL:       # BB#0:
5029; ALL-NEXT:    pushq %rbx
5030; ALL-NEXT:    movq %rdi, %rbx
5031; ALL-NEXT:    callq __truncdfhf2
5032; ALL-NEXT:    movw %ax, (%rbx)
5033; ALL-NEXT:    popq %rbx
5034; ALL-NEXT:    retq
5035  %1 = fptrunc double %a0 to half
5036  %2 = bitcast half %1 to i16
5037  store i16 %2, i16* %a1
5038  ret void
5039}
5040
5041define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind {
5042; ALL-LABEL: store_cvt_2f64_to_2i16:
5043; ALL:       # BB#0:
5044; ALL-NEXT:    pushq %rbp
5045; ALL-NEXT:    pushq %rbx
5046; ALL-NEXT:    subq $24, %rsp
5047; ALL-NEXT:    movq %rdi, %rbx
5048; ALL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
5049; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5050; ALL-NEXT:    callq __truncdfhf2
5051; ALL-NEXT:    movl %eax, %ebp
5052; ALL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
5053; ALL-NEXT:    callq __truncdfhf2
5054; ALL-NEXT:    movw %ax, (%rbx)
5055; ALL-NEXT:    movw %bp, 2(%rbx)
5056; ALL-NEXT:    addq $24, %rsp
5057; ALL-NEXT:    popq %rbx
5058; ALL-NEXT:    popq %rbp
5059; ALL-NEXT:    retq
5060  %1 = fptrunc <2 x double> %a0 to <2 x half>
5061  %2 = bitcast <2 x half> %1 to <2 x i16>
5062  store <2 x i16> %2, <2 x i16>* %a1
5063  ret void
5064}
5065
5066define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
5067; AVX1-LABEL: store_cvt_4f64_to_4i16:
5068; AVX1:       # BB#0:
5069; AVX1-NEXT:    pushq %rbp
5070; AVX1-NEXT:    pushq %r15
5071; AVX1-NEXT:    pushq %r14
5072; AVX1-NEXT:    pushq %rbx
5073; AVX1-NEXT:    subq $88, %rsp
5074; AVX1-NEXT:    movq %rdi, %rbx
5075; AVX1-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
5076; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5077; AVX1-NEXT:    vzeroupper
5078; AVX1-NEXT:    callq __truncdfhf2
5079; AVX1-NEXT:    movl %eax, %r14d
5080; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5081; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
5082; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
5083; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5084; AVX1-NEXT:    vzeroupper
5085; AVX1-NEXT:    callq __truncdfhf2
5086; AVX1-NEXT:    movl %eax, %r15d
5087; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5088; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5089; AVX1-NEXT:    vzeroupper
5090; AVX1-NEXT:    callq __truncdfhf2
5091; AVX1-NEXT:    movl %eax, %ebp
5092; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
5093; AVX1-NEXT:    callq __truncdfhf2
5094; AVX1-NEXT:    movw %ax, 4(%rbx)
5095; AVX1-NEXT:    movw %bp, (%rbx)
5096; AVX1-NEXT:    movw %r15w, 6(%rbx)
5097; AVX1-NEXT:    movw %r14w, 2(%rbx)
5098; AVX1-NEXT:    addq $88, %rsp
5099; AVX1-NEXT:    popq %rbx
5100; AVX1-NEXT:    popq %r14
5101; AVX1-NEXT:    popq %r15
5102; AVX1-NEXT:    popq %rbp
5103; AVX1-NEXT:    retq
5104;
5105; AVX2-LABEL: store_cvt_4f64_to_4i16:
5106; AVX2:       # BB#0:
5107; AVX2-NEXT:    pushq %rbp
5108; AVX2-NEXT:    pushq %r15
5109; AVX2-NEXT:    pushq %r14
5110; AVX2-NEXT:    pushq %rbx
5111; AVX2-NEXT:    subq $88, %rsp
5112; AVX2-NEXT:    movq %rdi, %rbx
5113; AVX2-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
5114; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5115; AVX2-NEXT:    vzeroupper
5116; AVX2-NEXT:    callq __truncdfhf2
5117; AVX2-NEXT:    movl %eax, %r14d
5118; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5119; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
5120; AVX2-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
5121; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5122; AVX2-NEXT:    vzeroupper
5123; AVX2-NEXT:    callq __truncdfhf2
5124; AVX2-NEXT:    movl %eax, %r15d
5125; AVX2-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5126; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5127; AVX2-NEXT:    vzeroupper
5128; AVX2-NEXT:    callq __truncdfhf2
5129; AVX2-NEXT:    movl %eax, %ebp
5130; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
5131; AVX2-NEXT:    callq __truncdfhf2
5132; AVX2-NEXT:    movw %ax, 4(%rbx)
5133; AVX2-NEXT:    movw %bp, (%rbx)
5134; AVX2-NEXT:    movw %r15w, 6(%rbx)
5135; AVX2-NEXT:    movw %r14w, 2(%rbx)
5136; AVX2-NEXT:    addq $88, %rsp
5137; AVX2-NEXT:    popq %rbx
5138; AVX2-NEXT:    popq %r14
5139; AVX2-NEXT:    popq %r15
5140; AVX2-NEXT:    popq %rbp
5141; AVX2-NEXT:    retq
5142;
5143; AVX512F-LABEL: store_cvt_4f64_to_4i16:
5144; AVX512F:       # BB#0:
5145; AVX512F-NEXT:    pushq %rbp
5146; AVX512F-NEXT:    pushq %r15
5147; AVX512F-NEXT:    pushq %r14
5148; AVX512F-NEXT:    pushq %rbx
5149; AVX512F-NEXT:    subq $88, %rsp
5150; AVX512F-NEXT:    movq %rdi, %rbx
5151; AVX512F-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
5152; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5153; AVX512F-NEXT:    callq __truncdfhf2
5154; AVX512F-NEXT:    movl %eax, %r14d
5155; AVX512F-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5156; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
5157; AVX512F-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
5158; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5159; AVX512F-NEXT:    callq __truncdfhf2
5160; AVX512F-NEXT:    movl %eax, %r15d
5161; AVX512F-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5162; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5163; AVX512F-NEXT:    callq __truncdfhf2
5164; AVX512F-NEXT:    movl %eax, %ebp
5165; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
5166; AVX512F-NEXT:    callq __truncdfhf2
5167; AVX512F-NEXT:    movw %ax, 4(%rbx)
5168; AVX512F-NEXT:    movw %bp, (%rbx)
5169; AVX512F-NEXT:    movw %r15w, 6(%rbx)
5170; AVX512F-NEXT:    movw %r14w, 2(%rbx)
5171; AVX512F-NEXT:    addq $88, %rsp
5172; AVX512F-NEXT:    popq %rbx
5173; AVX512F-NEXT:    popq %r14
5174; AVX512F-NEXT:    popq %r15
5175; AVX512F-NEXT:    popq %rbp
5176; AVX512F-NEXT:    retq
5177;
5178; AVX512VL-LABEL: store_cvt_4f64_to_4i16:
5179; AVX512VL:       # BB#0:
5180; AVX512VL-NEXT:    pushq %rbp
5181; AVX512VL-NEXT:    pushq %r15
5182; AVX512VL-NEXT:    pushq %r14
5183; AVX512VL-NEXT:    pushq %rbx
5184; AVX512VL-NEXT:    subq $88, %rsp
5185; AVX512VL-NEXT:    movq %rdi, %rbx
5186; AVX512VL-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
5187; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5188; AVX512VL-NEXT:    callq __truncdfhf2
5189; AVX512VL-NEXT:    movl %eax, %r14d
5190; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5191; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
5192; AVX512VL-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
5193; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5194; AVX512VL-NEXT:    callq __truncdfhf2
5195; AVX512VL-NEXT:    movl %eax, %r15d
5196; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5197; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5198; AVX512VL-NEXT:    callq __truncdfhf2
5199; AVX512VL-NEXT:    movl %eax, %ebp
5200; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
5201; AVX512VL-NEXT:    callq __truncdfhf2
5202; AVX512VL-NEXT:    movw %ax, 4(%rbx)
5203; AVX512VL-NEXT:    movw %bp, (%rbx)
5204; AVX512VL-NEXT:    movw %r15w, 6(%rbx)
5205; AVX512VL-NEXT:    movw %r14w, 2(%rbx)
5206; AVX512VL-NEXT:    addq $88, %rsp
5207; AVX512VL-NEXT:    popq %rbx
5208; AVX512VL-NEXT:    popq %r14
5209; AVX512VL-NEXT:    popq %r15
5210; AVX512VL-NEXT:    popq %rbp
5211; AVX512VL-NEXT:    retq
5212  %1 = fptrunc <4 x double> %a0 to <4 x half>
5213  %2 = bitcast <4 x half> %1 to <4 x i16>
5214  store <4 x i16> %2, <4 x i16>* %a1
5215  ret void
5216}
5217
5218define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind {
5219; AVX1-LABEL: store_cvt_4f64_to_8i16_undef:
5220; AVX1:       # BB#0:
5221; AVX1-NEXT:    pushq %rbp
5222; AVX1-NEXT:    pushq %r14
5223; AVX1-NEXT:    pushq %rbx
5224; AVX1-NEXT:    subq $32, %rsp
5225; AVX1-NEXT:    movq %rdi, %r14
5226; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
5227; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5228; AVX1-NEXT:    vzeroupper
5229; AVX1-NEXT:    callq __truncdfhf2
5230; AVX1-NEXT:    movw %ax, %bp
5231; AVX1-NEXT:    shll $16, %ebp
5232; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
5233; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5234; AVX1-NEXT:    vzeroupper
5235; AVX1-NEXT:    callq __truncdfhf2
5236; AVX1-NEXT:    movzwl %ax, %ebx
5237; AVX1-NEXT:    orl %ebp, %ebx
5238; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
5239; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
5240; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
5241; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5242; AVX1-NEXT:    vzeroupper
5243; AVX1-NEXT:    callq __truncdfhf2
5244; AVX1-NEXT:    movw %ax, %bp
5245; AVX1-NEXT:    shll $16, %ebp
5246; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
5247; AVX1-NEXT:    callq __truncdfhf2
5248; AVX1-NEXT:    movzwl %ax, %eax
5249; AVX1-NEXT:    orl %ebp, %eax
5250; AVX1-NEXT:    shlq $32, %rax
5251; AVX1-NEXT:    orq %rbx, %rax
5252; AVX1-NEXT:    vmovq %rax, %xmm0
5253; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
5254; AVX1-NEXT:    vmovdqa %xmm0, (%r14)
5255; AVX1-NEXT:    addq $32, %rsp
5256; AVX1-NEXT:    popq %rbx
5257; AVX1-NEXT:    popq %r14
5258; AVX1-NEXT:    popq %rbp
5259; AVX1-NEXT:    retq
5260;
5261; AVX2-LABEL: store_cvt_4f64_to_8i16_undef:
5262; AVX2:       # BB#0:
5263; AVX2-NEXT:    pushq %rbp
5264; AVX2-NEXT:    pushq %r14
5265; AVX2-NEXT:    pushq %rbx
5266; AVX2-NEXT:    subq $32, %rsp
5267; AVX2-NEXT:    movq %rdi, %r14
5268; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
5269; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5270; AVX2-NEXT:    vzeroupper
5271; AVX2-NEXT:    callq __truncdfhf2
5272; AVX2-NEXT:    movw %ax, %bp
5273; AVX2-NEXT:    shll $16, %ebp
5274; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
5275; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5276; AVX2-NEXT:    vzeroupper
5277; AVX2-NEXT:    callq __truncdfhf2
5278; AVX2-NEXT:    movzwl %ax, %ebx
5279; AVX2-NEXT:    orl %ebp, %ebx
5280; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
5281; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
5282; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
5283; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5284; AVX2-NEXT:    vzeroupper
5285; AVX2-NEXT:    callq __truncdfhf2
5286; AVX2-NEXT:    movw %ax, %bp
5287; AVX2-NEXT:    shll $16, %ebp
5288; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
5289; AVX2-NEXT:    callq __truncdfhf2
5290; AVX2-NEXT:    movzwl %ax, %eax
5291; AVX2-NEXT:    orl %ebp, %eax
5292; AVX2-NEXT:    shlq $32, %rax
5293; AVX2-NEXT:    orq %rbx, %rax
5294; AVX2-NEXT:    vmovq %rax, %xmm0
5295; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
5296; AVX2-NEXT:    vmovdqa %xmm0, (%r14)
5297; AVX2-NEXT:    addq $32, %rsp
5298; AVX2-NEXT:    popq %rbx
5299; AVX2-NEXT:    popq %r14
5300; AVX2-NEXT:    popq %rbp
5301; AVX2-NEXT:    retq
5302;
5303; AVX512F-LABEL: store_cvt_4f64_to_8i16_undef:
5304; AVX512F:       # BB#0:
5305; AVX512F-NEXT:    pushq %rbp
5306; AVX512F-NEXT:    pushq %r14
5307; AVX512F-NEXT:    pushq %rbx
5308; AVX512F-NEXT:    subq $32, %rsp
5309; AVX512F-NEXT:    movq %rdi, %r14
5310; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
5311; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5312; AVX512F-NEXT:    callq __truncdfhf2
5313; AVX512F-NEXT:    movw %ax, %bp
5314; AVX512F-NEXT:    shll $16, %ebp
5315; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
5316; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5317; AVX512F-NEXT:    callq __truncdfhf2
5318; AVX512F-NEXT:    movzwl %ax, %ebx
5319; AVX512F-NEXT:    orl %ebp, %ebx
5320; AVX512F-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
5321; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
5322; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
5323; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5324; AVX512F-NEXT:    callq __truncdfhf2
5325; AVX512F-NEXT:    movw %ax, %bp
5326; AVX512F-NEXT:    shll $16, %ebp
5327; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
5328; AVX512F-NEXT:    callq __truncdfhf2
5329; AVX512F-NEXT:    movzwl %ax, %eax
5330; AVX512F-NEXT:    orl %ebp, %eax
5331; AVX512F-NEXT:    shlq $32, %rax
5332; AVX512F-NEXT:    orq %rbx, %rax
5333; AVX512F-NEXT:    vmovq %rax, %xmm0
5334; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
5335; AVX512F-NEXT:    vmovdqa %xmm0, (%r14)
5336; AVX512F-NEXT:    addq $32, %rsp
5337; AVX512F-NEXT:    popq %rbx
5338; AVX512F-NEXT:    popq %r14
5339; AVX512F-NEXT:    popq %rbp
5340; AVX512F-NEXT:    retq
5341;
5342; AVX512VL-LABEL: store_cvt_4f64_to_8i16_undef:
5343; AVX512VL:       # BB#0:
5344; AVX512VL-NEXT:    pushq %rbp
5345; AVX512VL-NEXT:    pushq %r14
5346; AVX512VL-NEXT:    pushq %rbx
5347; AVX512VL-NEXT:    subq $32, %rsp
5348; AVX512VL-NEXT:    movq %rdi, %r14
5349; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
5350; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5351; AVX512VL-NEXT:    callq __truncdfhf2
5352; AVX512VL-NEXT:    movw %ax, %bp
5353; AVX512VL-NEXT:    shll $16, %ebp
5354; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
5355; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5356; AVX512VL-NEXT:    callq __truncdfhf2
5357; AVX512VL-NEXT:    movzwl %ax, %ebx
5358; AVX512VL-NEXT:    orl %ebp, %ebx
5359; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
5360; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
5361; AVX512VL-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
5362; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5363; AVX512VL-NEXT:    callq __truncdfhf2
5364; AVX512VL-NEXT:    movw %ax, %bp
5365; AVX512VL-NEXT:    shll $16, %ebp
5366; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
5367; AVX512VL-NEXT:    callq __truncdfhf2
5368; AVX512VL-NEXT:    movzwl %ax, %eax
5369; AVX512VL-NEXT:    orl %ebp, %eax
5370; AVX512VL-NEXT:    shlq $32, %rax
5371; AVX512VL-NEXT:    orq %rbx, %rax
5372; AVX512VL-NEXT:    vmovq %rax, %xmm0
5373; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
5374; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
5375; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5376; AVX512VL-NEXT:    vmovdqa %xmm0, (%r14)
5377; AVX512VL-NEXT:    addq $32, %rsp
5378; AVX512VL-NEXT:    popq %rbx
5379; AVX512VL-NEXT:    popq %r14
5380; AVX512VL-NEXT:    popq %rbp
5381; AVX512VL-NEXT:    retq
5382  %1 = fptrunc <4 x double> %a0 to <4 x half>
5383  %2 = bitcast <4 x half> %1 to <4 x i16>
5384  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
5385  store <8 x i16> %3, <8 x i16>* %a1
5386  ret void
5387}
5388
5389define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind {
5390; AVX1-LABEL: store_cvt_4f64_to_8i16_zero:
5391; AVX1:       # BB#0:
5392; AVX1-NEXT:    pushq %rbp
5393; AVX1-NEXT:    pushq %r14
5394; AVX1-NEXT:    pushq %rbx
5395; AVX1-NEXT:    subq $32, %rsp
5396; AVX1-NEXT:    movq %rdi, %r14
5397; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
5398; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5399; AVX1-NEXT:    vzeroupper
5400; AVX1-NEXT:    callq __truncdfhf2
5401; AVX1-NEXT:    movw %ax, %bp
5402; AVX1-NEXT:    shll $16, %ebp
5403; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
5404; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5405; AVX1-NEXT:    vzeroupper
5406; AVX1-NEXT:    callq __truncdfhf2
5407; AVX1-NEXT:    movzwl %ax, %ebx
5408; AVX1-NEXT:    orl %ebp, %ebx
5409; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
5410; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
5411; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
5412; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5413; AVX1-NEXT:    vzeroupper
5414; AVX1-NEXT:    callq __truncdfhf2
5415; AVX1-NEXT:    movw %ax, %bp
5416; AVX1-NEXT:    shll $16, %ebp
5417; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
5418; AVX1-NEXT:    callq __truncdfhf2
5419; AVX1-NEXT:    movzwl %ax, %eax
5420; AVX1-NEXT:    orl %ebp, %eax
5421; AVX1-NEXT:    shlq $32, %rax
5422; AVX1-NEXT:    orq %rbx, %rax
5423; AVX1-NEXT:    vmovq %rax, %xmm0
5424; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
5425; AVX1-NEXT:    vmovdqa %xmm0, (%r14)
5426; AVX1-NEXT:    addq $32, %rsp
5427; AVX1-NEXT:    popq %rbx
5428; AVX1-NEXT:    popq %r14
5429; AVX1-NEXT:    popq %rbp
5430; AVX1-NEXT:    retq
5431;
5432; AVX2-LABEL: store_cvt_4f64_to_8i16_zero:
5433; AVX2:       # BB#0:
5434; AVX2-NEXT:    pushq %rbp
5435; AVX2-NEXT:    pushq %r14
5436; AVX2-NEXT:    pushq %rbx
5437; AVX2-NEXT:    subq $32, %rsp
5438; AVX2-NEXT:    movq %rdi, %r14
5439; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
5440; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5441; AVX2-NEXT:    vzeroupper
5442; AVX2-NEXT:    callq __truncdfhf2
5443; AVX2-NEXT:    movw %ax, %bp
5444; AVX2-NEXT:    shll $16, %ebp
5445; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
5446; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5447; AVX2-NEXT:    vzeroupper
5448; AVX2-NEXT:    callq __truncdfhf2
5449; AVX2-NEXT:    movzwl %ax, %ebx
5450; AVX2-NEXT:    orl %ebp, %ebx
5451; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
5452; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
5453; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
5454; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5455; AVX2-NEXT:    vzeroupper
5456; AVX2-NEXT:    callq __truncdfhf2
5457; AVX2-NEXT:    movw %ax, %bp
5458; AVX2-NEXT:    shll $16, %ebp
5459; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
5460; AVX2-NEXT:    callq __truncdfhf2
5461; AVX2-NEXT:    movzwl %ax, %eax
5462; AVX2-NEXT:    orl %ebp, %eax
5463; AVX2-NEXT:    shlq $32, %rax
5464; AVX2-NEXT:    orq %rbx, %rax
5465; AVX2-NEXT:    vmovq %rax, %xmm0
5466; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
5467; AVX2-NEXT:    vmovdqa %xmm0, (%r14)
5468; AVX2-NEXT:    addq $32, %rsp
5469; AVX2-NEXT:    popq %rbx
5470; AVX2-NEXT:    popq %r14
5471; AVX2-NEXT:    popq %rbp
5472; AVX2-NEXT:    retq
5473;
5474; AVX512F-LABEL: store_cvt_4f64_to_8i16_zero:
5475; AVX512F:       # BB#0:
5476; AVX512F-NEXT:    pushq %rbp
5477; AVX512F-NEXT:    pushq %r14
5478; AVX512F-NEXT:    pushq %rbx
5479; AVX512F-NEXT:    subq $32, %rsp
5480; AVX512F-NEXT:    movq %rdi, %r14
5481; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
5482; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5483; AVX512F-NEXT:    callq __truncdfhf2
5484; AVX512F-NEXT:    movw %ax, %bp
5485; AVX512F-NEXT:    shll $16, %ebp
5486; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
5487; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5488; AVX512F-NEXT:    callq __truncdfhf2
5489; AVX512F-NEXT:    movzwl %ax, %ebx
5490; AVX512F-NEXT:    orl %ebp, %ebx
5491; AVX512F-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
5492; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
5493; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
5494; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5495; AVX512F-NEXT:    callq __truncdfhf2
5496; AVX512F-NEXT:    movw %ax, %bp
5497; AVX512F-NEXT:    shll $16, %ebp
5498; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
5499; AVX512F-NEXT:    callq __truncdfhf2
5500; AVX512F-NEXT:    movzwl %ax, %eax
5501; AVX512F-NEXT:    orl %ebp, %eax
5502; AVX512F-NEXT:    shlq $32, %rax
5503; AVX512F-NEXT:    orq %rbx, %rax
5504; AVX512F-NEXT:    vmovq %rax, %xmm0
5505; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
5506; AVX512F-NEXT:    vmovdqa %xmm0, (%r14)
5507; AVX512F-NEXT:    addq $32, %rsp
5508; AVX512F-NEXT:    popq %rbx
5509; AVX512F-NEXT:    popq %r14
5510; AVX512F-NEXT:    popq %rbp
5511; AVX512F-NEXT:    retq
5512;
5513; AVX512VL-LABEL: store_cvt_4f64_to_8i16_zero:
5514; AVX512VL:       # BB#0:
5515; AVX512VL-NEXT:    pushq %rbp
5516; AVX512VL-NEXT:    pushq %r14
5517; AVX512VL-NEXT:    pushq %rbx
5518; AVX512VL-NEXT:    subq $32, %rsp
5519; AVX512VL-NEXT:    movq %rdi, %r14
5520; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
5521; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5522; AVX512VL-NEXT:    callq __truncdfhf2
5523; AVX512VL-NEXT:    movw %ax, %bp
5524; AVX512VL-NEXT:    shll $16, %ebp
5525; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
5526; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5527; AVX512VL-NEXT:    callq __truncdfhf2
5528; AVX512VL-NEXT:    movzwl %ax, %ebx
5529; AVX512VL-NEXT:    orl %ebp, %ebx
5530; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
5531; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
5532; AVX512VL-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
5533; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5534; AVX512VL-NEXT:    callq __truncdfhf2
5535; AVX512VL-NEXT:    movw %ax, %bp
5536; AVX512VL-NEXT:    shll $16, %ebp
5537; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
5538; AVX512VL-NEXT:    callq __truncdfhf2
5539; AVX512VL-NEXT:    movzwl %ax, %eax
5540; AVX512VL-NEXT:    orl %ebp, %eax
5541; AVX512VL-NEXT:    shlq $32, %rax
5542; AVX512VL-NEXT:    orq %rbx, %rax
5543; AVX512VL-NEXT:    vmovq %rax, %xmm0
5544; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
5545; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
5546; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
5547; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5548; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
5549; AVX512VL-NEXT:    vmovdqa %xmm0, (%r14)
5550; AVX512VL-NEXT:    addq $32, %rsp
5551; AVX512VL-NEXT:    popq %rbx
5552; AVX512VL-NEXT:    popq %r14
5553; AVX512VL-NEXT:    popq %rbp
5554; AVX512VL-NEXT:    retq
5555  %1 = fptrunc <4 x double> %a0 to <4 x half>
5556  %2 = bitcast <4 x half> %1 to <4 x i16>
5557  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
5558  store <8 x i16> %3, <8 x i16>* %a1
5559  ret void
5560}
5561
5562define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
5563; AVX1-LABEL: store_cvt_8f64_to_8i16:
5564; AVX1:       # BB#0:
5565; AVX1-NEXT:    pushq %rbp
5566; AVX1-NEXT:    pushq %r15
5567; AVX1-NEXT:    pushq %r14
5568; AVX1-NEXT:    pushq %r13
5569; AVX1-NEXT:    pushq %r12
5570; AVX1-NEXT:    pushq %rbx
5571; AVX1-NEXT:    subq $136, %rsp
5572; AVX1-NEXT:    movq %rdi, %rbx
5573; AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
5574; AVX1-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
5575; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5576; AVX1-NEXT:    vzeroupper
5577; AVX1-NEXT:    callq __truncdfhf2
5578; AVX1-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
5579; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5580; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
5581; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
5582; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5583; AVX1-NEXT:    vzeroupper
5584; AVX1-NEXT:    callq __truncdfhf2
5585; AVX1-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
5586; AVX1-NEXT:    vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
5587; AVX1-NEXT:    # xmm0 = mem[1,0]
5588; AVX1-NEXT:    callq __truncdfhf2
5589; AVX1-NEXT:    movl %eax, %r12d
5590; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5591; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
5592; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
5593; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5594; AVX1-NEXT:    vzeroupper
5595; AVX1-NEXT:    callq __truncdfhf2
5596; AVX1-NEXT:    movl %eax, %r13d
5597; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5598; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5599; AVX1-NEXT:    vzeroupper
5600; AVX1-NEXT:    callq __truncdfhf2
5601; AVX1-NEXT:    movl %eax, %ebp
5602; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
5603; AVX1-NEXT:    callq __truncdfhf2
5604; AVX1-NEXT:    movl %eax, %r14d
5605; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5606; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5607; AVX1-NEXT:    vzeroupper
5608; AVX1-NEXT:    callq __truncdfhf2
5609; AVX1-NEXT:    movl %eax, %r15d
5610; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
5611; AVX1-NEXT:    callq __truncdfhf2
5612; AVX1-NEXT:    movw %ax, 12(%rbx)
5613; AVX1-NEXT:    movw %r15w, 8(%rbx)
5614; AVX1-NEXT:    movw %r14w, 4(%rbx)
5615; AVX1-NEXT:    movw %bp, (%rbx)
5616; AVX1-NEXT:    movw %r13w, 14(%rbx)
5617; AVX1-NEXT:    movw %r12w, 10(%rbx)
5618; AVX1-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
5619; AVX1-NEXT:    movw %ax, 6(%rbx)
5620; AVX1-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
5621; AVX1-NEXT:    movw %ax, 2(%rbx)
5622; AVX1-NEXT:    addq $136, %rsp
5623; AVX1-NEXT:    popq %rbx
5624; AVX1-NEXT:    popq %r12
5625; AVX1-NEXT:    popq %r13
5626; AVX1-NEXT:    popq %r14
5627; AVX1-NEXT:    popq %r15
5628; AVX1-NEXT:    popq %rbp
5629; AVX1-NEXT:    retq
5630;
5631; AVX2-LABEL: store_cvt_8f64_to_8i16:
5632; AVX2:       # BB#0:
5633; AVX2-NEXT:    pushq %rbp
5634; AVX2-NEXT:    pushq %r15
5635; AVX2-NEXT:    pushq %r14
5636; AVX2-NEXT:    pushq %r13
5637; AVX2-NEXT:    pushq %r12
5638; AVX2-NEXT:    pushq %rbx
5639; AVX2-NEXT:    subq $136, %rsp
5640; AVX2-NEXT:    movq %rdi, %rbx
5641; AVX2-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
5642; AVX2-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
5643; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5644; AVX2-NEXT:    vzeroupper
5645; AVX2-NEXT:    callq __truncdfhf2
5646; AVX2-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
5647; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5648; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
5649; AVX2-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
5650; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5651; AVX2-NEXT:    vzeroupper
5652; AVX2-NEXT:    callq __truncdfhf2
5653; AVX2-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
5654; AVX2-NEXT:    vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
5655; AVX2-NEXT:    # xmm0 = mem[1,0]
5656; AVX2-NEXT:    callq __truncdfhf2
5657; AVX2-NEXT:    movl %eax, %r12d
5658; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5659; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
5660; AVX2-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
5661; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5662; AVX2-NEXT:    vzeroupper
5663; AVX2-NEXT:    callq __truncdfhf2
5664; AVX2-NEXT:    movl %eax, %r13d
5665; AVX2-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5666; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5667; AVX2-NEXT:    vzeroupper
5668; AVX2-NEXT:    callq __truncdfhf2
5669; AVX2-NEXT:    movl %eax, %ebp
5670; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
5671; AVX2-NEXT:    callq __truncdfhf2
5672; AVX2-NEXT:    movl %eax, %r14d
5673; AVX2-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5674; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5675; AVX2-NEXT:    vzeroupper
5676; AVX2-NEXT:    callq __truncdfhf2
5677; AVX2-NEXT:    movl %eax, %r15d
5678; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
5679; AVX2-NEXT:    callq __truncdfhf2
5680; AVX2-NEXT:    movw %ax, 12(%rbx)
5681; AVX2-NEXT:    movw %r15w, 8(%rbx)
5682; AVX2-NEXT:    movw %r14w, 4(%rbx)
5683; AVX2-NEXT:    movw %bp, (%rbx)
5684; AVX2-NEXT:    movw %r13w, 14(%rbx)
5685; AVX2-NEXT:    movw %r12w, 10(%rbx)
5686; AVX2-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
5687; AVX2-NEXT:    movw %ax, 6(%rbx)
5688; AVX2-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
5689; AVX2-NEXT:    movw %ax, 2(%rbx)
5690; AVX2-NEXT:    addq $136, %rsp
5691; AVX2-NEXT:    popq %rbx
5692; AVX2-NEXT:    popq %r12
5693; AVX2-NEXT:    popq %r13
5694; AVX2-NEXT:    popq %r14
5695; AVX2-NEXT:    popq %r15
5696; AVX2-NEXT:    popq %rbp
5697; AVX2-NEXT:    retq
5698;
5699; AVX512F-LABEL: store_cvt_8f64_to_8i16:
5700; AVX512F:       # BB#0:
5701; AVX512F-NEXT:    pushq %rbp
5702; AVX512F-NEXT:    pushq %r15
5703; AVX512F-NEXT:    pushq %r14
5704; AVX512F-NEXT:    pushq %r13
5705; AVX512F-NEXT:    pushq %r12
5706; AVX512F-NEXT:    pushq %rbx
5707; AVX512F-NEXT:    subq $200, %rsp
5708; AVX512F-NEXT:    movq %rdi, %rbx
5709; AVX512F-NEXT:    vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
5710; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5711; AVX512F-NEXT:    callq __truncdfhf2
5712; AVX512F-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
5713; AVX512F-NEXT:    vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
5714; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
5715; AVX512F-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
5716; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5717; AVX512F-NEXT:    callq __truncdfhf2
5718; AVX512F-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
5719; AVX512F-NEXT:    vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
5720; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
5721; AVX512F-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
5722; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5723; AVX512F-NEXT:    callq __truncdfhf2
5724; AVX512F-NEXT:    movl %eax, %r12d
5725; AVX512F-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5726; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
5727; AVX512F-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
5728; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5729; AVX512F-NEXT:    callq __truncdfhf2
5730; AVX512F-NEXT:    movl %eax, %r13d
5731; AVX512F-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
5732; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
5733; AVX512F-NEXT:    callq __truncdfhf2
5734; AVX512F-NEXT:    movl %eax, %ebp
5735; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
5736; AVX512F-NEXT:    callq __truncdfhf2
5737; AVX512F-NEXT:    movl %eax, %r14d
5738; AVX512F-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5739; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5740; AVX512F-NEXT:    callq __truncdfhf2
5741; AVX512F-NEXT:    movl %eax, %r15d
5742; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
5743; AVX512F-NEXT:    callq __truncdfhf2
5744; AVX512F-NEXT:    movw %ax, 12(%rbx)
5745; AVX512F-NEXT:    movw %r15w, 8(%rbx)
5746; AVX512F-NEXT:    movw %r14w, 4(%rbx)
5747; AVX512F-NEXT:    movw %bp, (%rbx)
5748; AVX512F-NEXT:    movw %r13w, 14(%rbx)
5749; AVX512F-NEXT:    movw %r12w, 10(%rbx)
5750; AVX512F-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
5751; AVX512F-NEXT:    movw %ax, 6(%rbx)
5752; AVX512F-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
5753; AVX512F-NEXT:    movw %ax, 2(%rbx)
5754; AVX512F-NEXT:    addq $200, %rsp
5755; AVX512F-NEXT:    popq %rbx
5756; AVX512F-NEXT:    popq %r12
5757; AVX512F-NEXT:    popq %r13
5758; AVX512F-NEXT:    popq %r14
5759; AVX512F-NEXT:    popq %r15
5760; AVX512F-NEXT:    popq %rbp
5761; AVX512F-NEXT:    retq
5762;
5763; AVX512VL-LABEL: store_cvt_8f64_to_8i16:
5764; AVX512VL:       # BB#0:
5765; AVX512VL-NEXT:    pushq %rbp
5766; AVX512VL-NEXT:    pushq %r15
5767; AVX512VL-NEXT:    pushq %r14
5768; AVX512VL-NEXT:    pushq %r13
5769; AVX512VL-NEXT:    pushq %r12
5770; AVX512VL-NEXT:    pushq %rbx
5771; AVX512VL-NEXT:    subq $200, %rsp
5772; AVX512VL-NEXT:    movq %rdi, %rbx
5773; AVX512VL-NEXT:    vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
5774; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5775; AVX512VL-NEXT:    callq __truncdfhf2
5776; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
5777; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
5778; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
5779; AVX512VL-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
5780; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5781; AVX512VL-NEXT:    callq __truncdfhf2
5782; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
5783; AVX512VL-NEXT:    vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
5784; AVX512VL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
5785; AVX512VL-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
5786; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5787; AVX512VL-NEXT:    callq __truncdfhf2
5788; AVX512VL-NEXT:    movl %eax, %r12d
5789; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5790; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
5791; AVX512VL-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
5792; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
5793; AVX512VL-NEXT:    callq __truncdfhf2
5794; AVX512VL-NEXT:    movl %eax, %r13d
5795; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
5796; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
5797; AVX512VL-NEXT:    callq __truncdfhf2
5798; AVX512VL-NEXT:    movl %eax, %ebp
5799; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
5800; AVX512VL-NEXT:    callq __truncdfhf2
5801; AVX512VL-NEXT:    movl %eax, %r14d
5802; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5803; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5804; AVX512VL-NEXT:    callq __truncdfhf2
5805; AVX512VL-NEXT:    movl %eax, %r15d
5806; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
5807; AVX512VL-NEXT:    callq __truncdfhf2
5808; AVX512VL-NEXT:    movw %ax, 12(%rbx)
5809; AVX512VL-NEXT:    movw %r15w, 8(%rbx)
5810; AVX512VL-NEXT:    movw %r14w, 4(%rbx)
5811; AVX512VL-NEXT:    movw %bp, (%rbx)
5812; AVX512VL-NEXT:    movw %r13w, 14(%rbx)
5813; AVX512VL-NEXT:    movw %r12w, 10(%rbx)
5814; AVX512VL-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
5815; AVX512VL-NEXT:    movw %ax, 6(%rbx)
5816; AVX512VL-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
5817; AVX512VL-NEXT:    movw %ax, 2(%rbx)
5818; AVX512VL-NEXT:    addq $200, %rsp
5819; AVX512VL-NEXT:    popq %rbx
5820; AVX512VL-NEXT:    popq %r12
5821; AVX512VL-NEXT:    popq %r13
5822; AVX512VL-NEXT:    popq %r14
5823; AVX512VL-NEXT:    popq %r15
5824; AVX512VL-NEXT:    popq %rbp
5825; AVX512VL-NEXT:    retq
5826  %1 = fptrunc <8 x double> %a0 to <8 x half>
5827  %2 = bitcast <8 x half> %1 to <8 x i16>
5828  store <8 x i16> %2, <8 x i16>* %a1
5829  ret void
5830}
5831