; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; half args should be promoted to float for SI and older targets, which lack 16-bit instructions.

; GCN-LABEL: {{^}}load_f16_arg:
; GCN: s_load_dword [[ARG:s[0-9]+]]
; SI: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
; VI: v_trunc_f16_e32 [[CVT:v[0-9]+]], [[ARG]]
; GCN: buffer_store_short [[CVT]]
define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
  store half %arg, half addrspace(1)* %out
  ret void
}

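; The two f16 elements are loaded as individual ushorts and repacked into a
; single dword with a shift and or before being stored.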
; GCN-LABEL: {{^}}load_v2f16_arg:
; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: s_endpgm
define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
  store <2 x half> %arg, <2 x half> addrspace(1)* %out
  ret void
}

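; v3f16 takes three separate ushort loads; the store is split into a dword
; plus a short, and the NOT checks guard against any extra memory operations.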
; GCN-LABEL: {{^}}load_v3f16_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN-NOT: buffer_load
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_short
; GCN-NOT: buffer_store
; GCN: s_endpgm
define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
  store <3 x half> %arg, <3 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v4f16_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
  store <4 x half> %arg, <4 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v8f16_arg:
define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
  store <8 x half> %arg, <8 x half> addrspace(1)* %out
  ret void
}

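; The extload tests widen half kernel arguments with fpext; the extension is
; done with v_cvt_f32_f16 (checked explicitly in the v3 and v8 cases below).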
; GCN-LABEL: {{^}}extload_v2f16_arg:
define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
  %fpext = fpext <2 x half> %in to <2 x float>
  store <2 x float> %fpext, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_f16_to_f32_arg:
define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
  %ext = fpext half %arg to float
  store float %ext, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg:
define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
  %ext = fpext <2 x half> %arg to <2 x float>
  store <2 x float> %ext, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN-NOT: buffer_load
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN-NOT: v_cvt_f32_f16
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_dwordx2
; GCN: s_endpgm
define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
  %ext = fpext <3 x half> %arg to <3 x float>
  store <3 x float> %ext, <3 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg:
define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
  %ext = fpext <4 x half> %arg to <4 x float>
  store <4 x float> %ext, <4 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort

; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
  %ext = fpext <8 x half> %arg to <8 x float>
  store <8 x float> %ext, <8 x float> addrspace(1)* %out
  ret void
}

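; There is no direct f16->f64 conversion instruction, so the fpext goes
; through f32 (v_cvt_f32_f16 then v_cvt_f64_f32); on VI the dword argument
; slot is first truncated to f16.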
; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]]
; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}}
; VI: v_trunc_f16_e32 v[[VARG:[0-9]+]], [[ARG]]
; VI: v_cvt_f32_f16_e32 v[[VARG_F32:[0-9]+]], v[[VARG]]
; VI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[VARG_F32]]
; GCN: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
  %ext = fpext half %arg to double
  store double %ext, double addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
  %ext = fpext <2 x half> %arg to <2 x double>
  store <2 x double> %ext, <2 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
  %ext = fpext <3 x half> %arg to <3 x double>
  store <3 x double> %ext, <3 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
  %ext = fpext <4 x half> %arg to <4 x double>
  store <4 x double> %ext, <4 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v

; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32

; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32

; GCN: s_endpgm
define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
  %ext = fpext <8 x half> %arg to <8 x double>
  store <8 x double> %ext, <8 x double> addrspace(1)* %out
  ret void
}

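; The global load/store round-trip tests should use the natural memory width
; (ushort, dword, dwordx2, dwordx4) with no conversion instructions.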
; GCN-LABEL: {{^}}global_load_store_f16:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  store half %val, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_load_store_v2f16:
; GCN: buffer_load_dword [[TMP:v[0-9]+]]
; GCN: buffer_store_dword [[TMP]]
define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  store <2 x half> %val, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_load_store_v4f16:
; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[TMP]]
define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  store <4 x half> %val, <4 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_load_store_v8f16:
; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[TMP]]
; GCN: s_endpgm
define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  store <8 x half> %val, <8 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_f16_to_f32:
; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_dword [[CVT]]
define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

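; When splitting a packed <2 x half>, SI must shift the high half down before
; converting it, while VI converts it in place with an SDWA v_cvt_f32_f16
; reading WORD_1.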
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x float>
  store <2 x float> %cvt, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32:
define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x float>
  store <3 x float> %cvt, <3 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32:
define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32:
define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x float>
  store <8 x float> %cvt, <8 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4

; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32

; VI: v_cvt_f32_f16_e32
; VI: v_cvt_f32_f16_sdwa
; ...

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4

; GCN: s_endpgm
define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x float>
  store <16 x float> %cvt, <16 x float> addrspace(1)* %out
  ret void
}

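; Extending global loads to f64 also go through f32, since there is no single
; f16->f64 conversion instruction on these targets.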
; GCN-LABEL: {{^}}global_extload_f16_to_f64:
; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]]
; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]]
; GCN: buffer_store_dwordx2 [[CVT1]]
define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to double
  store double %cvt, double addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}

; SI-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; SI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]

; VI-DAG: v_cvt_f32_f16_sdwa v[[CVT0:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD]]
; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT0]]
; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT1]]

; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x double>
  store <2 x double> %cvt, <2 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:

; XSI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; XSI: v_cvt_f32_f16_e32
; XSI: v_cvt_f32_f16_e32
; XSI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
; XSI: v_cvt_f32_f16_e32
; XSI-NOT: v_cvt_f32_f16

; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; XVI: v_cvt_f32_f16_e32
; XVI: v_cvt_f32_f16_e32
; XVI: v_cvt_f32_f16_sdwa
; XVI-NOT: v_cvt_f32_f16

; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
; SI:      v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
; SI-DAG:  v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
; VI-DAG:  v_cvt_f32_f16_sdwa [[Y32:v[0-9]+]], v[[IN_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1

; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
; GCN-DAG: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
; GCN-NOT: v_cvt_f64_f32_e32

; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
; GCN: s_endpgm
define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x double>
  store <3 x double> %cvt, <3 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64:
define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x double>
  store <4 x double> %cvt, <4 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64:
define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x double>
  store <8 x double> %cvt, <8 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64:
define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x double>
  store <16 x double> %cvt, <16 x double> addrspace(1)* %out
  ret void
}

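; Truncating stores convert with v_cvt_f16_f32; in the vector cases SI packs
; pairs with a shift and or, while VI writes the high half in place via an
; SDWA conversion (dst_sel:WORD_1).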
; GCN-LABEL: {{^}}global_truncstore_f32_to_f16:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_short [[CVT]]
define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %val = load float, float addrspace(1)* %in
  %cvt = fptrunc float %val to half
  store half %cvt, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16:
; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]

; SI-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
; SI-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
; SI:     v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]]

; VI-DAG: v_cvt_f16_f32_sdwa [[CVT1:v[0-9]+]], v[[HI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI:     v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[CVT1]]

; GCN-DAG: buffer_store_dword [[PACKED]]
; GCN: s_endpgm
define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
  %val = load <2 x float>, <2 x float> addrspace(1)* %in
  %cvt = fptrunc <2 x float> %val to <2 x half>
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
; GCN: buffer_load_dwordx4
; GCN-DAG: v_cvt_f16_f32_e32
; SI-DAG:  v_cvt_f16_f32_e32
; VI-DAG:  v_cvt_f16_f32_sdwa
; GCN-DAG: v_cvt_f16_f32_e32
; GCN: buffer_store_short
; GCN: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
  %val = load <3 x float>, <3 x float> addrspace(1)* %in
  %cvt = fptrunc <3 x float> %val to <3 x half>
  store <3 x half> %cvt, <3 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16:
; GCN: buffer_load_dwordx4
; GCN-DAG: v_cvt_f16_f32_e32
; SI-DAG:  v_cvt_f16_f32_e32
; SI-DAG:  v_cvt_f16_f32_e32
; VI-DAG:  v_cvt_f16_f32_sdwa
; VI-DAG:  v_cvt_f16_f32_sdwa
; GCN-DAG: v_cvt_f16_f32_e32
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %cvt = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %cvt, <4 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; VI-DAG:  v_cvt_f16_f32_e32
; VI-DAG:  v_cvt_f16_f32_e32
; VI-DAG:  v_cvt_f16_f32_e32
; VI-DAG:  v_cvt_f16_f32_e32
; VI-DAG:  v_cvt_f16_f32_sdwa
; VI-DAG:  v_cvt_f16_f32_sdwa
; VI-DAG:  v_cvt_f16_f32_sdwa
; VI-DAG:  v_cvt_f16_f32_sdwa
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
  %val = load <8 x float>, <8 x float> addrspace(1)* %in
  %cvt = fptrunc <8 x float> %val to <8 x half>
  store <8 x half> %cvt, <8 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: buffer_store_dwordx4
; GCN-DAG: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
  %val = load <16 x float>, <16 x float> addrspace(1)* %in
  %cvt = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %cvt, <16 x half> addrspace(1)* %out
  ret void
}

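; SI has no 16-bit arithmetic: each fadd operand is extended with
; v_cvt_f32_f16, added as f32, and the result truncated back for the store.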
; FIXME: Unsafe math should fold conversions away
; GCN-LABEL: {{^}}fadd_f16:
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI: v_add_f32
; GCN: s_endpgm
define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
  %add = fadd half %a, %b
  store half %add, half addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}fadd_v2f16:
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
  %add = fadd <2 x half> %a, %b
  store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v4f16:
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
  %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
  %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
  %result = fadd <4 x half> %a, %b
  store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}fadd_v8f16:
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
  %add = fadd <8 x half> %a, %b
  store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
  ret void
}

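; Bitcasts between half and i16 should be no-ops: a ushort load forwarded
; directly to a short store.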
; GCN-LABEL: {{^}}test_bitcast_from_half:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
  %val = load half, half addrspace(1)* %in
  %val_int = bitcast half %val to i16
  store i16 %val_int, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_bitcast_to_half:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
  %val = load i16, i16 addrspace(1)* %in
  %val_fp = bitcast i16 %val to half
  store half %val_fp, half addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }