; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s

; GCN-LABEL: {{^}}fptrunc_f32_to_f16:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
; Load an f32, truncate it to f16 (single v_cvt_f16_f32), and store it.
define amdgpu_kernel void @fptrunc_f32_to_f16(
    half addrspace(1)* %r,
    float addrspace(1)* %a) {
entry:
  %a.val = load float, float addrspace(1)* %a
  %r.val = fptrunc float %a.val to half
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fptrunc_f64_to_f16:
; GCN: buffer_load_dwordx2 v[[[A_F64_0:[0-9]+]]:[[A_F64_1:[0-9]+]]]
; GCN: v_cvt_f32_f64_e32 v[[A_F32:[0-9]+]], v[[[A_F64_0]]:[[A_F64_1]]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
; f64 -> f16 lowers as a two-step conversion through f32 (f64->f32->f16).
define amdgpu_kernel void @fptrunc_f64_to_f16(
    half addrspace(1)* %r,
    double addrspace(1)* %a) {
entry:
  %a.val = load double, double addrspace(1)* %a
  %r.val = fptrunc double %a.val to half
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16:
; GCN:     buffer_load_dwordx2 v[[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]]
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI:      v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]

; GFX9-DAG:   v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; GFX9: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]

; GCN:     buffer_store_dword v[[R_V2_F16]]
; GCN:     s_endpgm

; <2 x f32> -> <2 x f16>: per-target packing differs (SI shift+or,
; VI sdwa+or, GFX9 v_pack_b32_f16), stored as a single dword.
define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x float> addrspace(1)* %a) {
entry:
  %a.val = load <2 x float>, <2 x float> addrspace(1)* %a
  %r.val = fptrunc <2 x float> %a.val to <2 x half>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16:
; GCN: buffer_load_dwordx4 v[[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]]
; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v[[[A_F64_0]]:{{[0-9]+}}]
; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v[{{[0-9]+}}:[[A_F64_3]]]
; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
;
; SI-DAG: v_cvt_f16_f32_e32 v[[CVTHI:[0-9]+]], v[[A_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[CVTHI]]

; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_0]]

; GCN: buffer_store_dword v[[R_V2_F16]]

; <2 x f64> -> <2 x f16>: each element goes f64->f32->f16, then the two
; halves are packed (SI/VI or, GFX9 v_lshl_or_b32) into one dword store.
define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x double> addrspace(1)* %a) {
entry:
  %a.val = load <2 x double>, <2 x double> addrspace(1)* %a
  %r.val = fptrunc <2 x double> %a.val to <2 x half>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fneg_fptrunc_f32_to_f16:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -v[[A_F32]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
; fneg folds into the conversion as a source-negate modifier (e64 form).
define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
    half addrspace(1)* %r,
    float addrspace(1)* %a) {
entry:
  %a.val = load float, float addrspace(1)* %a
  %a.fneg = fneg float %a.val
  %r.val = fptrunc float %a.fneg to half
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fabs_fptrunc_f32_to_f16:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
; llvm.fabs folds into the conversion as a source-abs modifier (e64 form).
define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
    half addrspace(1)* %r,
    float addrspace(1)* %a) {
entry:
  %a.val = load float, float addrspace(1)* %a
  %a.fabs = call float @llvm.fabs.f32(float %a.val)
  %r.val = fptrunc float %a.fabs to half
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fneg_fabs_fptrunc_f32_to_f16:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -|v[[A_F32]]|
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
; Combined fneg(fabs(x)) folds into a single -|src| modifier on the cvt.
define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
    half addrspace(1)* %r,
    float addrspace(1)* %a) #0 {
entry:
  %a.val = load float, float addrspace(1)* %a
  %a.fabs = call float @llvm.fabs.f32(float %a.val)
  %a.fneg.fabs = fneg float %a.fabs
  %r.val = fptrunc float %a.fneg.fabs to half
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fptrunc_f32_to_f16_zext_i32:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; SIVI-NOT: v[[R_F16]]
; GFX9-NOT: v_and_b32
; GCN: buffer_store_dword v[[R_F16]]
; zext of the f16 bits to i32 needs no extra instruction: v_cvt_f16_f32
; already leaves the upper 16 bits clear, so the result is stored directly.
define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
    i32 addrspace(1)* %r,
    float addrspace(1)* %a) #0 {
entry:
  %a.val = load float, float addrspace(1)* %a
  %r.val = fptrunc float %a.val to half
  %r.i16 = bitcast half %r.val to i16
  %zext = zext i16 %r.i16 to i32
  store i32 %zext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fptrunc_fabs_f32_to_f16_zext_i32:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
; SIVI-NOT: v[[R_F16]]
; GFX9-NOT: v_and_b32
; GCN: buffer_store_dword v[[R_F16]]
; Same as above but with a folded |src| modifier; still no masking needed
; for the zext to i32.
define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
    i32 addrspace(1)* %r,
    float addrspace(1)* %a) #0 {
entry:
  %a.val = load float, float addrspace(1)* %a
  %a.fabs = call float @llvm.fabs.f32(float %a.val)
  %r.val = fptrunc float %a.fabs to half
  %r.i16 = bitcast half %r.val to i16
  %zext = zext i16 %r.i16 to i32
  store i32 %zext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fptrunc_f32_to_f16_sext_i32:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; GCN: v_bfe_i32 v[[R_F16_SEXT:[0-9]+]], v[[R_F16]], 0, 16
; GCN: buffer_store_dword v[[R_F16_SEXT]]
; sext of the f16 bits to i32 requires an explicit sign-extension
; (v_bfe_i32 over the low 16 bits), unlike the zext cases above.
define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
    i32 addrspace(1)* %r,
    float addrspace(1)* %a) #0 {
entry:
  %a.val = load float, float addrspace(1)* %a
  %r.val = fptrunc float %a.val to half
  %r.i16 = bitcast half %r.val to i16
  %zext = sext i16 %r.i16 to i32
  store i32 %zext, i32 addrspace(1)* %r
  ret void
}

declare float @llvm.fabs.f32(float) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }