1; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
2; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
3
4; half args should be promoted to float for SI and lower.
5
6; GCN-LABEL: {{^}}load_f16_arg:
7; GCN: s_load_dword [[ARG:s[0-9]+]]
8; SI: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
9; VI: v_trunc_f16_e32 [[CVT:v[0-9]+]], [[ARG]]
10; GCN: buffer_store_short [[CVT]]
; Stores a by-value scalar half argument unchanged through the addrspace(1)
; pointer %out. The IR performs no conversion; any conversion instructions
; expected in the output come from how the backend passes the argument.
11define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
12  store half %arg, half addrspace(1)* %out
13  ret void
14}
15
16; GCN-LABEL: {{^}}load_v2f16_arg:
17; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
18; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
19; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
20; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]]
21; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
22; GCN: s_endpgm
; Stores a by-value <2 x half> argument unchanged through the addrspace(1)
; pointer %out.
23define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
24  store <2 x half> %arg, <2 x half> addrspace(1)* %out
25  ret void
26}
27
28; GCN-LABEL: {{^}}load_v3f16_arg:
29; GCN: buffer_load_ushort
30; GCN: buffer_load_ushort
31; GCN: buffer_load_ushort
32; GCN-NOT: buffer_load
33; GCN-DAG: buffer_store_dword
34; GCN-DAG: buffer_store_short
35; GCN-NOT: buffer_store
36; GCN: s_endpgm
; Stores a by-value <3 x half> argument unchanged through %out. Covers a
; vector whose element count is not a power of two.
37define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
38  store <3 x half> %arg, <3 x half> addrspace(1)* %out
39  ret void
40}
41
42; GCN-LABEL: {{^}}load_v4f16_arg:
43; GCN: buffer_load_ushort
44; GCN: buffer_load_ushort
45; GCN: buffer_load_ushort
46; GCN: buffer_load_ushort
47; GCN: buffer_store_dwordx2
48; GCN: s_endpgm
; Stores a by-value <4 x half> argument unchanged through the addrspace(1)
; pointer %out.
49define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
50  store <4 x half> %arg, <4 x half> addrspace(1)* %out
51  ret void
52}
53
54; GCN-LABEL: {{^}}load_v8f16_arg:
; Stores a by-value <8 x half> argument unchanged through %out. Only the
; function label is matched for this case; no instruction patterns are
; attached to it.
55define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
56  store <8 x half> %arg, <8 x half> addrspace(1)* %out
57  ret void
58}
59
60; GCN-LABEL: {{^}}extload_v2f16_arg:
; Extends a by-value <2 x half> argument to <2 x float> with fpext and stores
; the result through %out. Only the function label is matched for this case.
61define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
62  %fpext = fpext <2 x half> %in to <2 x float>
63  store <2 x float> %fpext, <2 x float> addrspace(1)* %out
64  ret void
65}
66
67; GCN-LABEL: {{^}}extload_f16_to_f32_arg:
; Extends a by-value scalar half argument to float with fpext and stores the
; result through %out. Only the function label is matched for this case.
68define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
69  %ext = fpext half %arg to float
70  store float %ext, float addrspace(1)* %out
71  ret void
72}
73
74; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg:
; Extends a by-value <2 x half> argument to <2 x float> and stores it through
; %out. The operations are the same as in @extload_v2f16_arg; only the value
; names differ.
75define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
76  %ext = fpext <2 x half> %arg to <2 x float>
77  store <2 x float> %ext, <2 x float> addrspace(1)* %out
78  ret void
79}
80
81; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
82; GCN: buffer_load_ushort
83; GCN: buffer_load_ushort
84; GCN: buffer_load_ushort
85; GCN-NOT: buffer_load
86; GCN: v_cvt_f32_f16_e32
87; GCN: v_cvt_f32_f16_e32
88; GCN: v_cvt_f32_f16_e32
89; GCN-NOT: v_cvt_f32_f16
90; GCN-DAG: buffer_store_dword
91; GCN-DAG: buffer_store_dwordx2
92; GCN: s_endpgm
; Extends a by-value <3 x half> argument to <3 x float> and stores it through
; %out. Covers a non-power-of-two element count for the argument extend path.
93define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
94  %ext = fpext <3 x half> %arg to <3 x float>
95  store <3 x float> %ext, <3 x float> addrspace(1)* %out
96  ret void
97}
98
99; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg:
; Extends a by-value <4 x half> argument to <4 x float> and stores it through
; %out. Only the function label is matched for this case.
100define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
101  %ext = fpext <4 x half> %arg to <4 x float>
102  store <4 x float> %ext, <4 x float> addrspace(1)* %out
103  ret void
104}
105
106; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
107; GCN: buffer_load_ushort
108; GCN: buffer_load_ushort
109; GCN: buffer_load_ushort
110; GCN: buffer_load_ushort
111; GCN: buffer_load_ushort
112; GCN: buffer_load_ushort
113; GCN: buffer_load_ushort
114; GCN: buffer_load_ushort
115
116; GCN: v_cvt_f32_f16_e32
117; GCN: v_cvt_f32_f16_e32
118; GCN: v_cvt_f32_f16_e32
119; GCN: v_cvt_f32_f16_e32
120; GCN: v_cvt_f32_f16_e32
121; GCN: v_cvt_f32_f16_e32
122; GCN: v_cvt_f32_f16_e32
123; GCN: v_cvt_f32_f16_e32
124
125; GCN: buffer_store_dwordx4
126; GCN: buffer_store_dwordx4
; Extends a by-value <8 x half> argument to <8 x float> and stores it through
; %out (eight element conversions, 32 bytes of float output).
127define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
128  %ext = fpext <8 x half> %arg to <8 x float>
129  store <8 x float> %ext, <8 x float> addrspace(1)* %out
130  ret void
131}
132
133; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
134; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
135; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]]
136; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}}
137; VI: v_trunc_f16_e32 v[[VARG:[0-9]+]], [[ARG]]
138; VI: v_cvt_f32_f16_e32 v[[VARG_F32:[0-9]+]], v[[VARG]]
139; VI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[VARG_F32]]
140; GCN: buffer_store_dwordx2 [[RESULT]]
; Extends a by-value scalar half argument directly to double with a single
; fpext and stores the result through %out.
141define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
142  %ext = fpext half %arg to double
143  store double %ext, double addrspace(1)* %out
144  ret void
145}
146
147; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
148; GCN-DAG: buffer_load_ushort v
149; GCN-DAG: buffer_load_ushort v
150; GCN-DAG: v_cvt_f32_f16_e32
151; GCN-DAG: v_cvt_f32_f16_e32
152; GCN-DAG: v_cvt_f64_f32_e32
153; GCN-DAG: v_cvt_f64_f32_e32
154; GCN: s_endpgm
; Extends a by-value <2 x half> argument directly to <2 x double> and stores
; the result through %out.
155define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
156  %ext = fpext <2 x half> %arg to <2 x double>
157  store <2 x double> %ext, <2 x double> addrspace(1)* %out
158  ret void
159}
160
161; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
162; GCN-DAG: buffer_load_ushort v
163; GCN-DAG: buffer_load_ushort v
164; GCN-DAG: buffer_load_ushort v
165; GCN-DAG: v_cvt_f32_f16_e32
166; GCN-DAG: v_cvt_f32_f16_e32
167; GCN-DAG: v_cvt_f32_f16_e32
168; GCN-DAG: v_cvt_f64_f32_e32
169; GCN-DAG: v_cvt_f64_f32_e32
170; GCN-DAG: v_cvt_f64_f32_e32
171; GCN: s_endpgm
; Extends a by-value <3 x half> argument directly to <3 x double> and stores
; the result through %out (non-power-of-two element count).
172define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
173  %ext = fpext <3 x half> %arg to <3 x double>
174  store <3 x double> %ext, <3 x double> addrspace(1)* %out
175  ret void
176}
177
178; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
179; GCN-DAG: buffer_load_ushort v
180; GCN-DAG: buffer_load_ushort v
181; GCN-DAG: buffer_load_ushort v
182; GCN-DAG: buffer_load_ushort v
183; GCN-DAG: v_cvt_f32_f16_e32
184; GCN-DAG: v_cvt_f32_f16_e32
185; GCN-DAG: v_cvt_f32_f16_e32
186; GCN-DAG: v_cvt_f32_f16_e32
187; GCN-DAG: v_cvt_f64_f32_e32
188; GCN-DAG: v_cvt_f64_f32_e32
189; GCN-DAG: v_cvt_f64_f32_e32
190; GCN-DAG: v_cvt_f64_f32_e32
191; GCN: s_endpgm
; Extends a by-value <4 x half> argument directly to <4 x double> and stores
; the result through %out.
192define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
193  %ext = fpext <4 x half> %arg to <4 x double>
194  store <4 x double> %ext, <4 x double> addrspace(1)* %out
195  ret void
196}
197
198; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
199; GCN-DAG: buffer_load_ushort v
200; GCN-DAG: buffer_load_ushort v
201; GCN-DAG: buffer_load_ushort v
202; GCN-DAG: buffer_load_ushort v
203
204; GCN-DAG: buffer_load_ushort v
205; GCN-DAG: buffer_load_ushort v
206; GCN-DAG: buffer_load_ushort v
207; GCN-DAG: buffer_load_ushort v
208
209; GCN-DAG: v_cvt_f32_f16_e32
210; GCN-DAG: v_cvt_f32_f16_e32
211; GCN-DAG: v_cvt_f32_f16_e32
212; GCN-DAG: v_cvt_f32_f16_e32
213
214; GCN-DAG: v_cvt_f32_f16_e32
215; GCN-DAG: v_cvt_f32_f16_e32
216; GCN-DAG: v_cvt_f32_f16_e32
217; GCN-DAG: v_cvt_f32_f16_e32
218
219; GCN-DAG: v_cvt_f64_f32_e32
220; GCN-DAG: v_cvt_f64_f32_e32
221; GCN-DAG: v_cvt_f64_f32_e32
222; GCN-DAG: v_cvt_f64_f32_e32
223
224; GCN-DAG: v_cvt_f64_f32_e32
225; GCN-DAG: v_cvt_f64_f32_e32
226; GCN-DAG: v_cvt_f64_f32_e32
227; GCN-DAG: v_cvt_f64_f32_e32
228
229; GCN: s_endpgm
; Extends a by-value <8 x half> argument directly to <8 x double> and stores
; the result through %out.
230define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
231  %ext = fpext <8 x half> %arg to <8 x double>
232  store <8 x double> %ext, <8 x double> addrspace(1)* %out
233  ret void
234}
235
236; GCN-LABEL: {{^}}global_load_store_f16:
237; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
238; GCN: buffer_store_short [[TMP]]
; Copies one half from the addrspace(1) pointer %in to the addrspace(1)
; pointer %out with a plain load followed by a plain store; no arithmetic or
; conversion is involved.
239define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
240  %val = load half, half addrspace(1)* %in
241  store half %val, half addrspace(1)* %out
242  ret void
243}
244
245; GCN-LABEL: {{^}}global_load_store_v2f16:
246; GCN: buffer_load_dword [[TMP:v[0-9]+]]
247; GCN: buffer_store_dword [[TMP]]
; Copies one <2 x half> (4 bytes) from %in to %out with a plain load and
; store; no conversion is involved.
248define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
249  %val = load <2 x half>, <2 x half> addrspace(1)* %in
250  store <2 x half> %val, <2 x half> addrspace(1)* %out
251  ret void
252}
253
254; GCN-LABEL: {{^}}global_load_store_v4f16:
255; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]]
256; GCN: buffer_store_dwordx2 [[TMP]]
; Copies one <4 x half> (8 bytes) from %in to %out with a plain load and
; store. Note that the parameters are declared as (%in, %out) here, the
; reverse of the (%out, %in) order used by the other global_load_store_*
; functions in this file.
257define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
258  %val = load <4 x half>, <4 x half> addrspace(1)* %in
259  store <4 x half> %val, <4 x half> addrspace(1)* %out
260  ret void
261}
262
263; GCN-LABEL: {{^}}global_load_store_v8f16:
264; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
265; GCN: buffer_store_dwordx4 [[TMP]]
266; GCN: s_endpgm
; Copies one <8 x half> (16 bytes) from %in to %out with a plain load and
; store. The store pattern above reuses the TMP register tuple captured by the
; load pattern (instead of defining a fresh capture), so the test verifies
; that the loaded registers are the ones that get stored, matching the
; <4 x half> variant of this test.
267define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
268  %val = load <8 x half>, <8 x half> addrspace(1)* %in
269  store <8 x half> %val, <8 x half> addrspace(1)* %out
270  ret void
271}
272
273; GCN-LABEL: {{^}}global_extload_f16_to_f32:
274; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
275; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]]
276; GCN: buffer_store_dword [[CVT]]
; Loads one half from %in, extends it to float with fpext, and stores the
; float through %out.
277define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
278  %val = load half, half addrspace(1)* %in
279  %cvt = fpext half %val to float
280  store float %cvt, float addrspace(1)* %out
281  ret void
282}
283
284; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
285; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
286; VI:  v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
287; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
288; SI:  v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
289; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
290; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
291; GCN: s_endpgm
; Loads one <2 x half> from %in, extends it to <2 x float>, and stores the
; result through %out.
292define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
293  %val = load <2 x half>, <2 x half> addrspace(1)* %in
294  %cvt = fpext <2 x half> %val to <2 x float>
295  store <2 x float> %cvt, <2 x float> addrspace(1)* %out
296  ret void
297}
298
299; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32:
; Loads one <3 x half> from %in, extends it to <3 x float>, and stores the
; result through %out. Only the function label is matched for this case.
300define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
301  %val = load <3 x half>, <3 x half> addrspace(1)* %in
302  %cvt = fpext <3 x half> %val to <3 x float>
303  store <3 x float> %cvt, <3 x float> addrspace(1)* %out
304  ret void
305}
306
307; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32:
; Loads one <4 x half> from %in, extends it to <4 x float>, and stores the
; result through %out. Only the function label is matched for this case.
308define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
309  %val = load <4 x half>, <4 x half> addrspace(1)* %in
310  %cvt = fpext <4 x half> %val to <4 x float>
311  store <4 x float> %cvt, <4 x float> addrspace(1)* %out
312  ret void
313}
314
315; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32:
; Loads one <8 x half> from %in, extends it to <8 x float>, and stores the
; result through %out. Only the function label is matched for this case.
316define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
317  %val = load <8 x half>, <8 x half> addrspace(1)* %in
318  %cvt = fpext <8 x half> %val to <8 x float>
319  store <8 x float> %cvt, <8 x float> addrspace(1)* %out
320  ret void
321}
322
323; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
324; GCN: buffer_load_dwordx4
325; GCN: buffer_load_dwordx4
326
327; GCN: v_cvt_f32_f16_e32
328; GCN: v_cvt_f32_f16_e32
329; GCN: v_cvt_f32_f16_e32
330; GCN: v_cvt_f32_f16_e32
331; GCN: v_cvt_f32_f16_e32
332; GCN: v_cvt_f32_f16_e32
333; GCN: v_cvt_f32_f16_e32
334; GCN: v_cvt_f32_f16_e32
335; GCN: v_cvt_f32_f16_e32
336; GCN: v_cvt_f32_f16_e32
337; GCN: v_cvt_f32_f16_e32
338; GCN: v_cvt_f32_f16_e32
339; GCN: v_cvt_f32_f16_e32
340; GCN: v_cvt_f32_f16_e32
341; GCN: v_cvt_f32_f16_e32
342; GCN: v_cvt_f32_f16_e32
343
344; GCN: buffer_store_dwordx4
345; GCN: buffer_store_dwordx4
346; GCN: buffer_store_dwordx4
347; GCN: buffer_store_dwordx4
348
349; GCN: s_endpgm
; Loads one <16 x half> (32 bytes) from %in, extends it to <16 x float>
; (64 bytes), and stores the result through %out.
350define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
351  %val = load <16 x half>, <16 x half> addrspace(1)* %in
352  %cvt = fpext <16 x half> %val to <16 x float>
353  store <16 x float> %cvt, <16 x float> addrspace(1)* %out
354  ret void
355}
356
357; GCN-LABEL: {{^}}global_extload_f16_to_f64:
358; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
359; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]]
360; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]]
361; GCN: buffer_store_dwordx2 [[CVT1]]
; Loads one half from %in, extends it directly to double with a single fpext,
; and stores the double through %out.
362define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
363  %val = load half, half addrspace(1)* %in
364  %cvt = fpext half %val to double
365  store double %cvt, double addrspace(1)* %out
366  ret void
367}
368
369; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
370; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
371; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
372; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
373; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
374; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
375; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
376; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
377; GCN: s_endpgm
; Loads one <2 x half> from %in, extends it directly to <2 x double>, and
; stores the result through %out.
378define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
379  %val = load <2 x half>, <2 x half> addrspace(1)* %in
380  %cvt = fpext <2 x half> %val to <2 x double>
381  store <2 x double> %cvt, <2 x double> addrspace(1)* %out
382  ret void
383}
384
385; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:
386
387; XSI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
388; XSI: v_cvt_f32_f16_e32
389; XSI: v_cvt_f32_f16_e32
390; XSI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
391; XSI: v_cvt_f32_f16_e32
392; XSI-NOT: v_cvt_f32_f16
393
394; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
395; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
396; XVI: v_cvt_f32_f16_e32
397; XVI: v_cvt_f32_f16_e32
398; XVI: v_cvt_f32_f16_e32
399; XVI-NOT: v_cvt_f32_f16
400
401; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
402; VI:  v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
403; GCN: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
404; GCN: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
405; SI:  v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
406; GCN: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
407
408; GCN: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
409; GCN: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
410; GCN: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
411; GCN-NOT: v_cvt_f64_f32_e32
412
413; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
414; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
415; GCN: s_endpgm
; Loads one <3 x half> from %in, extends it directly to <3 x double>, and
; stores the result through %out (non-power-of-two element count).
416define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
417  %val = load <3 x half>, <3 x half> addrspace(1)* %in
418  %cvt = fpext <3 x half> %val to <3 x double>
419  store <3 x double> %cvt, <3 x double> addrspace(1)* %out
420  ret void
421}
422
423; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64:
; Loads one <4 x half> from %in, extends it directly to <4 x double>, and
; stores the result through %out. Only the function label is matched for this
; case.
424define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
425  %val = load <4 x half>, <4 x half> addrspace(1)* %in
426  %cvt = fpext <4 x half> %val to <4 x double>
427  store <4 x double> %cvt, <4 x double> addrspace(1)* %out
428  ret void
429}
430
431; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64:
; Loads one <8 x half> from %in, extends it directly to <8 x double>, and
; stores the result through %out. Only the function label is matched for this
; case.
432define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
433  %val = load <8 x half>, <8 x half> addrspace(1)* %in
434  %cvt = fpext <8 x half> %val to <8 x double>
435  store <8 x double> %cvt, <8 x double> addrspace(1)* %out
436  ret void
437}
438
439; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64:
; Loads one <16 x half> from %in, extends it directly to <16 x double>, and
; stores the result through %out. Only the function label is matched for this
; case.
440define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
441  %val = load <16 x half>, <16 x half> addrspace(1)* %in
442  %cvt = fpext <16 x half> %val to <16 x double>
443  store <16 x double> %cvt, <16 x double> addrspace(1)* %out
444  ret void
445}
446
447; GCN-LABEL: {{^}}global_truncstore_f32_to_f16:
448; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
449; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]]
450; GCN: buffer_store_short [[CVT]]
; Loads one float from %in, narrows it to half with fptrunc, and stores the
; half through %out.
451define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
452  %val = load float, float addrspace(1)* %in
453  %cvt = fptrunc float %val to half
454  store half %cvt, half addrspace(1)* %out
455  ret void
456}
457
458; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16:
459; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
460; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
461; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
462; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
463; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
464; GCN-DAG: buffer_store_dword [[PACKED]]
465; GCN: s_endpgm
; Loads one <2 x float> from %in, narrows it to <2 x half> with fptrunc, and
; stores the 4-byte result through %out.
466define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
467  %val = load <2 x float>, <2 x float> addrspace(1)* %in
468  %cvt = fptrunc <2 x float> %val to <2 x half>
469  store <2 x half> %cvt, <2 x half> addrspace(1)* %out
470  ret void
471}
472
473; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
474; GCN: buffer_load_dwordx4
475; GCN: v_cvt_f16_f32_e32
476; GCN: v_cvt_f16_f32_e32
477; GCN: v_cvt_f16_f32_e32
478; GCN-NOT: v_cvt_f16_f32_e32
479; GCN: buffer_store_short
480; GCN: buffer_store_dword
481; GCN: s_endpgm
; Loads one <3 x float> from %in, narrows it to <3 x half> with fptrunc, and
; stores the result through %out (non-power-of-two element count).
482define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
483  %val = load <3 x float>, <3 x float> addrspace(1)* %in
484  %cvt = fptrunc <3 x float> %val to <3 x half>
485  store <3 x half> %cvt, <3 x half> addrspace(1)* %out
486  ret void
487}
488
489; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16:
490; GCN: buffer_load_dwordx4
491; GCN: v_cvt_f16_f32_e32
492; GCN: v_cvt_f16_f32_e32
493; GCN: v_cvt_f16_f32_e32
494; GCN: v_cvt_f16_f32_e32
495; GCN: buffer_store_dwordx2
496; GCN: s_endpgm
; Loads one <4 x float> from %in, narrows it to <4 x half> with fptrunc, and
; stores the 8-byte result through %out.
497define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
498  %val = load <4 x float>, <4 x float> addrspace(1)* %in
499  %cvt = fptrunc <4 x float> %val to <4 x half>
500  store <4 x half> %cvt, <4 x half> addrspace(1)* %out
501  ret void
502}
503
504; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
505; GCN: buffer_load_dwordx4
506; GCN: buffer_load_dwordx4
507; GCN: v_cvt_f16_f32_e32
508; GCN: v_cvt_f16_f32_e32
509; GCN: v_cvt_f16_f32_e32
510; GCN: v_cvt_f16_f32_e32
511; GCN: v_cvt_f16_f32_e32
512; GCN: v_cvt_f16_f32_e32
513; GCN: v_cvt_f16_f32_e32
514; GCN: v_cvt_f16_f32_e32
515; GCN: buffer_store_dwordx4
516; GCN: s_endpgm
; Loads one <8 x float> (32 bytes) from %in, narrows it to <8 x half>
; (16 bytes) with fptrunc, and stores the result through %out.
517define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
518  %val = load <8 x float>, <8 x float> addrspace(1)* %in
519  %cvt = fptrunc <8 x float> %val to <8 x half>
520  store <8 x half> %cvt, <8 x half> addrspace(1)* %out
521  ret void
522}
523
524; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16:
525; GCN: buffer_load_dwordx4
526; GCN: buffer_load_dwordx4
527; GCN: buffer_load_dwordx4
528; GCN: buffer_load_dwordx4
529; GCN-DAG: v_cvt_f16_f32_e32
530; GCN-DAG: v_cvt_f16_f32_e32
531; GCN-DAG: v_cvt_f16_f32_e32
532; GCN-DAG: v_cvt_f16_f32_e32
533; GCN-DAG: v_cvt_f16_f32_e32
534; GCN-DAG: v_cvt_f16_f32_e32
535; GCN-DAG: v_cvt_f16_f32_e32
536; GCN-DAG: v_cvt_f16_f32_e32
537; GCN-DAG: v_cvt_f16_f32_e32
538; GCN-DAG: v_cvt_f16_f32_e32
539; GCN-DAG: v_cvt_f16_f32_e32
540; GCN-DAG: v_cvt_f16_f32_e32
541; GCN-DAG: v_cvt_f16_f32_e32
542; GCN-DAG: v_cvt_f16_f32_e32
543; GCN-DAG: v_cvt_f16_f32_e32
544; GCN-DAG: v_cvt_f16_f32_e32
545; GCN-DAG: buffer_store_dwordx4
546; GCN-DAG: buffer_store_dwordx4
547; GCN: s_endpgm
; Loads one <16 x float> (64 bytes) from %in, narrows it to <16 x half>
; (32 bytes) with fptrunc, and stores the result through %out.
548define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
549  %val = load <16 x float>, <16 x float> addrspace(1)* %in
550  %cvt = fptrunc <16 x float> %val to <16 x half>
551  store <16 x half> %cvt, <16 x half> addrspace(1)* %out
552  ret void
553}
554
555; FIXME: Unsafe math should fold conversions away
556; GCN-LABEL: {{^}}fadd_f16:
557; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
558; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
559; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
560; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
561; SI: v_add_f32
562; GCN: s_endpgm
; Adds two by-value scalar half arguments with a half-typed fadd and stores
; the half result through %out. The store is annotated with align 4.
563define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
564   %add = fadd half %a, %b
565   store half %add, half addrspace(1)* %out, align 4
566   ret void
567}
568
569; GCN-LABEL: {{^}}fadd_v2f16:
570; SI: v_add_f32
571; SI: v_add_f32
572; GCN: s_endpgm
; Adds two by-value <2 x half> arguments element-wise and stores the result
; through %out. The store is annotated with align 8.
573define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
574  %add = fadd <2 x half> %a, %b
575  store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
576  ret void
577}
578
579; GCN-LABEL: {{^}}fadd_v4f16:
580; SI: v_add_f32
581; SI: v_add_f32
582; SI: v_add_f32
583; SI: v_add_f32
584; GCN: s_endpgm
; Adds two <4 x half> vectors read from memory and stores the sum through
; %out. Unlike the other fadd_* tests, the operands are loaded rather than
; passed by value: %a comes from %in and %b from the <4 x half> element
; immediately after it.
585define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
; Address of the second operand: one whole <4 x half> past %in.
586  %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
587  %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
588  %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
589  %result = fadd <4 x half> %a, %b
590  store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
591  ret void
592}
593
594; GCN-LABEL: {{^}}fadd_v8f16:
595; SI: v_add_f32
596; SI: v_add_f32
597; SI: v_add_f32
598; SI: v_add_f32
599; SI: v_add_f32
600; SI: v_add_f32
601; SI: v_add_f32
602; SI: v_add_f32
603; GCN: s_endpgm
; Adds two by-value <8 x half> arguments element-wise and stores the result
; through %out. The store is annotated with align 32.
604define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
605  %add = fadd <8 x half> %a, %b
606  store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
607  ret void
608}
609
610; GCN-LABEL: {{^}}test_bitcast_from_half:
611; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
612; GCN: buffer_store_short [[TMP]]
; Loads a half from %in, reinterprets its bits as i16 with bitcast (no value
; conversion), and stores the i16 through %out. Parameters are declared in
; (%in, %out) order here.
613define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
614  %val = load half, half addrspace(1)* %in
615  %val_int = bitcast half %val to i16
616  store i16 %val_int, i16 addrspace(1)* %out
617  ret void
618}
619
620; GCN-LABEL: {{^}}test_bitcast_to_half:
621; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
622; GCN: buffer_store_short [[TMP]]
; Loads an i16 from %in, reinterprets its bits as half with bitcast (no value
; conversion), and stores the half through %out.
623define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
624  %val = load i16, i16 addrspace(1)* %in
625  %val_fp = bitcast i16 %val to half
626  store half %val_fp, half addrspace(1)* %out
627  ret void
628}
629
630attributes #0 = { nounwind }
631