1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s
4
5; half args should be promoted to float for SI and lower.
6
7define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
8; SI-LABEL: load_f16_arg:
9; SI:       ; %bb.0:
10; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11; SI-NEXT:    s_load_dword s2, s[4:5], 0x2
12; SI-NEXT:    s_waitcnt lgkmcnt(0)
13; SI-NEXT:    v_mov_b32_e32 v0, s0
14; SI-NEXT:    v_mov_b32_e32 v1, s1
15; SI-NEXT:    v_mov_b32_e32 v2, s2
16; SI-NEXT:    flat_store_short v[0:1], v2
17; SI-NEXT:    s_endpgm
18;
19; VI-LABEL: load_f16_arg:
20; VI:       ; %bb.0:
21; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
22; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
23; VI-NEXT:    s_waitcnt lgkmcnt(0)
24; VI-NEXT:    v_mov_b32_e32 v0, s0
25; VI-NEXT:    v_mov_b32_e32 v1, s1
26; VI-NEXT:    v_mov_b32_e32 v2, s2
27; VI-NEXT:    flat_store_short v[0:1], v2
28; VI-NEXT:    s_endpgm
29  store half %arg, half addrspace(1)* %out
30  ret void
31}
32
33define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
34; SI-LABEL: load_v2f16_arg:
35; SI:       ; %bb.0:
36; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
37; SI-NEXT:    s_load_dword s2, s[4:5], 0x2
38; SI-NEXT:    s_waitcnt lgkmcnt(0)
39; SI-NEXT:    v_mov_b32_e32 v0, s0
40; SI-NEXT:    v_mov_b32_e32 v1, s1
41; SI-NEXT:    v_mov_b32_e32 v2, s2
42; SI-NEXT:    flat_store_dword v[0:1], v2
43; SI-NEXT:    s_endpgm
44;
45; VI-LABEL: load_v2f16_arg:
46; VI:       ; %bb.0:
47; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
48; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
49; VI-NEXT:    s_waitcnt lgkmcnt(0)
50; VI-NEXT:    v_mov_b32_e32 v0, s0
51; VI-NEXT:    v_mov_b32_e32 v1, s1
52; VI-NEXT:    v_mov_b32_e32 v2, s2
53; VI-NEXT:    flat_store_dword v[0:1], v2
54; VI-NEXT:    s_endpgm
55  store <2 x half> %arg, <2 x half> addrspace(1)* %out
56  ret void
57}
58
59define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
60; SI-LABEL: load_v3f16_arg:
61; SI:       ; %bb.0:
62; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
63; SI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
64; SI-NEXT:    s_waitcnt lgkmcnt(0)
65; SI-NEXT:    s_add_u32 s4, s0, 4
66; SI-NEXT:    s_addc_u32 s5, s1, 0
67; SI-NEXT:    v_mov_b32_e32 v2, s4
68; SI-NEXT:    v_mov_b32_e32 v4, s3
69; SI-NEXT:    v_mov_b32_e32 v0, s0
70; SI-NEXT:    v_mov_b32_e32 v3, s5
71; SI-NEXT:    v_mov_b32_e32 v1, s1
72; SI-NEXT:    v_mov_b32_e32 v5, s2
73; SI-NEXT:    flat_store_short v[2:3], v4
74; SI-NEXT:    flat_store_dword v[0:1], v5
75; SI-NEXT:    s_endpgm
76;
77; VI-LABEL: load_v3f16_arg:
78; VI:       ; %bb.0:
79; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
80; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
81; VI-NEXT:    s_waitcnt lgkmcnt(0)
82; VI-NEXT:    s_add_u32 s4, s0, 4
83; VI-NEXT:    s_addc_u32 s5, s1, 0
84; VI-NEXT:    v_mov_b32_e32 v2, s4
85; VI-NEXT:    v_mov_b32_e32 v4, s3
86; VI-NEXT:    v_mov_b32_e32 v0, s0
87; VI-NEXT:    v_mov_b32_e32 v3, s5
88; VI-NEXT:    v_mov_b32_e32 v1, s1
89; VI-NEXT:    v_mov_b32_e32 v5, s2
90; VI-NEXT:    flat_store_short v[2:3], v4
91; VI-NEXT:    flat_store_dword v[0:1], v5
92; VI-NEXT:    s_endpgm
93  store <3 x half> %arg, <3 x half> addrspace(1)* %out
94  ret void
95}
96
97
98; FIXME: Why not one load?
99define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
100; SI-LABEL: load_v4f16_arg:
101; SI:       ; %bb.0:
102; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
103; SI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
104; SI-NEXT:    s_waitcnt lgkmcnt(0)
105; SI-NEXT:    v_mov_b32_e32 v0, s0
106; SI-NEXT:    v_mov_b32_e32 v2, s2
107; SI-NEXT:    v_mov_b32_e32 v1, s1
108; SI-NEXT:    v_mov_b32_e32 v3, s3
109; SI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
110; SI-NEXT:    s_endpgm
111;
112; VI-LABEL: load_v4f16_arg:
113; VI:       ; %bb.0:
114; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
115; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
116; VI-NEXT:    s_waitcnt lgkmcnt(0)
117; VI-NEXT:    v_mov_b32_e32 v0, s0
118; VI-NEXT:    v_mov_b32_e32 v2, s2
119; VI-NEXT:    v_mov_b32_e32 v1, s1
120; VI-NEXT:    v_mov_b32_e32 v3, s3
121; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
122; VI-NEXT:    s_endpgm
123  store <4 x half> %arg, <4 x half> addrspace(1)* %out
124  ret void
125}
126
127define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
128; SI-LABEL: load_v8f16_arg:
129; SI:       ; %bb.0:
130; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
131; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
132; SI-NEXT:    s_waitcnt lgkmcnt(0)
133; SI-NEXT:    v_mov_b32_e32 v4, s6
134; SI-NEXT:    v_mov_b32_e32 v0, s0
135; SI-NEXT:    v_mov_b32_e32 v5, s7
136; SI-NEXT:    v_mov_b32_e32 v1, s1
137; SI-NEXT:    v_mov_b32_e32 v2, s2
138; SI-NEXT:    v_mov_b32_e32 v3, s3
139; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
140; SI-NEXT:    s_endpgm
141;
142; VI-LABEL: load_v8f16_arg:
143; VI:       ; %bb.0:
144; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
145; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
146; VI-NEXT:    s_waitcnt lgkmcnt(0)
147; VI-NEXT:    v_mov_b32_e32 v4, s6
148; VI-NEXT:    v_mov_b32_e32 v0, s0
149; VI-NEXT:    v_mov_b32_e32 v5, s7
150; VI-NEXT:    v_mov_b32_e32 v1, s1
151; VI-NEXT:    v_mov_b32_e32 v2, s2
152; VI-NEXT:    v_mov_b32_e32 v3, s3
153; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
154; VI-NEXT:    s_endpgm
155  store <8 x half> %arg, <8 x half> addrspace(1)* %out
156  ret void
157}
158
159define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
160; SI-LABEL: extload_v2f16_arg:
161; SI:       ; %bb.0:
162; SI-NEXT:    s_load_dword s0, s[4:5], 0x2
163; SI-NEXT:    s_waitcnt lgkmcnt(0)
164; SI-NEXT:    s_lshr_b32 s1, s0, 16
165; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
166; SI-NEXT:    v_cvt_f32_f16_e32 v1, s1
167; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
168; SI-NEXT:    s_waitcnt lgkmcnt(0)
169; SI-NEXT:    v_mov_b32_e32 v3, s1
170; SI-NEXT:    v_mov_b32_e32 v2, s0
171; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
172; SI-NEXT:    s_endpgm
173;
174; VI-LABEL: extload_v2f16_arg:
175; VI:       ; %bb.0:
176; VI-NEXT:    s_load_dword s0, s[4:5], 0x8
177; VI-NEXT:    s_waitcnt lgkmcnt(0)
178; VI-NEXT:    s_lshr_b32 s1, s0, 16
179; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
180; VI-NEXT:    v_cvt_f32_f16_e32 v1, s1
181; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
182; VI-NEXT:    s_waitcnt lgkmcnt(0)
183; VI-NEXT:    v_mov_b32_e32 v3, s1
184; VI-NEXT:    v_mov_b32_e32 v2, s0
185; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
186; VI-NEXT:    s_endpgm
187  %fpext = fpext <2 x half> %in to <2 x float>
188  store <2 x float> %fpext, <2 x float> addrspace(1)* %out
189  ret void
190}
191
192define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
193; SI-LABEL: extload_f16_to_f32_arg:
194; SI:       ; %bb.0:
195; SI-NEXT:    s_load_dword s0, s[4:5], 0x2
196; SI-NEXT:    s_waitcnt lgkmcnt(0)
197; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
198; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
199; SI-NEXT:    s_waitcnt lgkmcnt(0)
200; SI-NEXT:    v_mov_b32_e32 v0, s0
201; SI-NEXT:    v_mov_b32_e32 v1, s1
202; SI-NEXT:    flat_store_dword v[0:1], v2
203; SI-NEXT:    s_endpgm
204;
205; VI-LABEL: extload_f16_to_f32_arg:
206; VI:       ; %bb.0:
207; VI-NEXT:    s_load_dword s0, s[4:5], 0x8
208; VI-NEXT:    s_waitcnt lgkmcnt(0)
209; VI-NEXT:    v_cvt_f32_f16_e32 v2, s0
210; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
211; VI-NEXT:    s_waitcnt lgkmcnt(0)
212; VI-NEXT:    v_mov_b32_e32 v0, s0
213; VI-NEXT:    v_mov_b32_e32 v1, s1
214; VI-NEXT:    flat_store_dword v[0:1], v2
215; VI-NEXT:    s_endpgm
216  %ext = fpext half %arg to float
217  store float %ext, float addrspace(1)* %out
218  ret void
219}
220
221define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
222; SI-LABEL: extload_v2f16_to_v2f32_arg:
223; SI:       ; %bb.0:
224; SI-NEXT:    s_load_dword s0, s[4:5], 0x2
225; SI-NEXT:    s_waitcnt lgkmcnt(0)
226; SI-NEXT:    s_lshr_b32 s1, s0, 16
227; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
228; SI-NEXT:    v_cvt_f32_f16_e32 v1, s1
229; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
230; SI-NEXT:    s_waitcnt lgkmcnt(0)
231; SI-NEXT:    v_mov_b32_e32 v3, s1
232; SI-NEXT:    v_mov_b32_e32 v2, s0
233; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
234; SI-NEXT:    s_endpgm
235;
236; VI-LABEL: extload_v2f16_to_v2f32_arg:
237; VI:       ; %bb.0:
238; VI-NEXT:    s_load_dword s0, s[4:5], 0x8
239; VI-NEXT:    s_waitcnt lgkmcnt(0)
240; VI-NEXT:    s_lshr_b32 s1, s0, 16
241; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
242; VI-NEXT:    v_cvt_f32_f16_e32 v1, s1
243; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
244; VI-NEXT:    s_waitcnt lgkmcnt(0)
245; VI-NEXT:    v_mov_b32_e32 v3, s1
246; VI-NEXT:    v_mov_b32_e32 v2, s0
247; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
248; VI-NEXT:    s_endpgm
249  %ext = fpext <2 x half> %arg to <2 x float>
250  store <2 x float> %ext, <2 x float> addrspace(1)* %out
251  ret void
252}
253
254define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
255; SI-LABEL: extload_v3f16_to_v3f32_arg:
256; SI:       ; %bb.0:
257; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
258; SI-NEXT:    s_waitcnt lgkmcnt(0)
259; SI-NEXT:    s_lshr_b32 s2, s0, 16
260; SI-NEXT:    v_cvt_f32_f16_e32 v2, s1
261; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
262; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
263; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
264; SI-NEXT:    s_waitcnt lgkmcnt(0)
265; SI-NEXT:    v_mov_b32_e32 v4, s1
266; SI-NEXT:    v_mov_b32_e32 v3, s0
267; SI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
268; SI-NEXT:    s_endpgm
269;
270; VI-LABEL: extload_v3f16_to_v3f32_arg:
271; VI:       ; %bb.0:
272; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
273; VI-NEXT:    s_waitcnt lgkmcnt(0)
274; VI-NEXT:    s_lshr_b32 s2, s0, 16
275; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
276; VI-NEXT:    v_cvt_f32_f16_e32 v2, s1
277; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
278; VI-NEXT:    v_cvt_f32_f16_e32 v1, s2
279; VI-NEXT:    s_waitcnt lgkmcnt(0)
280; VI-NEXT:    v_mov_b32_e32 v4, s1
281; VI-NEXT:    v_mov_b32_e32 v3, s0
282; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
283; VI-NEXT:    s_endpgm
284  %ext = fpext <3 x half> %arg to <3 x float>
285  store <3 x float> %ext, <3 x float> addrspace(1)* %out
286  ret void
287}
288
289define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
290; SI-LABEL: extload_v4f16_to_v4f32_arg:
291; SI:       ; %bb.0:
292; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
293; SI-NEXT:    s_waitcnt lgkmcnt(0)
294; SI-NEXT:    s_lshr_b32 s2, s1, 16
295; SI-NEXT:    s_lshr_b32 s3, s0, 16
296; SI-NEXT:    v_cvt_f32_f16_e32 v2, s1
297; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
298; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
299; SI-NEXT:    v_cvt_f32_f16_e32 v3, s2
300; SI-NEXT:    v_cvt_f32_f16_e32 v1, s3
301; SI-NEXT:    s_waitcnt lgkmcnt(0)
302; SI-NEXT:    v_mov_b32_e32 v5, s1
303; SI-NEXT:    v_mov_b32_e32 v4, s0
304; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
305; SI-NEXT:    s_endpgm
306;
307; VI-LABEL: extload_v4f16_to_v4f32_arg:
308; VI:       ; %bb.0:
309; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
310; VI-NEXT:    s_waitcnt lgkmcnt(0)
311; VI-NEXT:    s_lshr_b32 s2, s1, 16
312; VI-NEXT:    s_lshr_b32 s3, s0, 16
313; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
314; VI-NEXT:    v_cvt_f32_f16_e32 v2, s1
315; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
316; VI-NEXT:    v_cvt_f32_f16_e32 v3, s2
317; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
318; VI-NEXT:    s_waitcnt lgkmcnt(0)
319; VI-NEXT:    v_mov_b32_e32 v5, s1
320; VI-NEXT:    v_mov_b32_e32 v4, s0
321; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
322; VI-NEXT:    s_endpgm
323  %ext = fpext <4 x half> %arg to <4 x float>
324  store <4 x float> %ext, <4 x float> addrspace(1)* %out
325  ret void
326}
327
328define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
329; SI-LABEL: extload_v8f16_to_v8f32_arg:
330; SI:       ; %bb.0:
331; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
332; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
333; SI-NEXT:    s_waitcnt lgkmcnt(0)
334; SI-NEXT:    s_lshr_b32 s4, s1, 16
335; SI-NEXT:    s_lshr_b32 s5, s0, 16
336; SI-NEXT:    s_lshr_b32 s8, s3, 16
337; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
338; SI-NEXT:    s_lshr_b32 s4, s2, 16
339; SI-NEXT:    v_cvt_f32_f16_e32 v7, s8
340; SI-NEXT:    v_cvt_f32_f16_e32 v5, s4
341; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
342; SI-NEXT:    v_cvt_f32_f16_e32 v6, s3
343; SI-NEXT:    v_cvt_f32_f16_e32 v4, s2
344; SI-NEXT:    s_add_u32 s0, s6, 16
345; SI-NEXT:    v_cvt_f32_f16_e32 v2, s1
346; SI-NEXT:    s_addc_u32 s1, s7, 0
347; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
348; SI-NEXT:    v_mov_b32_e32 v9, s1
349; SI-NEXT:    v_mov_b32_e32 v8, s0
350; SI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
351; SI-NEXT:    s_nop 0
352; SI-NEXT:    v_mov_b32_e32 v4, s6
353; SI-NEXT:    v_mov_b32_e32 v5, s7
354; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
355; SI-NEXT:    s_endpgm
356;
357; VI-LABEL: extload_v8f16_to_v8f32_arg:
358; VI:       ; %bb.0:
359; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
360; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
361; VI-NEXT:    s_waitcnt lgkmcnt(0)
362; VI-NEXT:    s_lshr_b32 s4, s1, 16
363; VI-NEXT:    s_lshr_b32 s5, s0, 16
364; VI-NEXT:    s_lshr_b32 s8, s3, 16
365; VI-NEXT:    v_cvt_f32_f16_e32 v3, s4
366; VI-NEXT:    s_lshr_b32 s4, s2, 16
367; VI-NEXT:    v_cvt_f32_f16_e32 v7, s8
368; VI-NEXT:    v_cvt_f32_f16_e32 v5, s4
369; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
370; VI-NEXT:    v_cvt_f32_f16_e32 v6, s3
371; VI-NEXT:    v_cvt_f32_f16_e32 v4, s2
372; VI-NEXT:    s_add_u32 s0, s6, 16
373; VI-NEXT:    v_cvt_f32_f16_e32 v2, s1
374; VI-NEXT:    s_addc_u32 s1, s7, 0
375; VI-NEXT:    v_cvt_f32_f16_e32 v1, s5
376; VI-NEXT:    v_mov_b32_e32 v9, s1
377; VI-NEXT:    v_mov_b32_e32 v8, s0
378; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
379; VI-NEXT:    s_nop 0
380; VI-NEXT:    v_mov_b32_e32 v4, s6
381; VI-NEXT:    v_mov_b32_e32 v5, s7
382; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
383; VI-NEXT:    s_endpgm
384  %ext = fpext <8 x half> %arg to <8 x float>
385  store <8 x float> %ext, <8 x float> addrspace(1)* %out
386  ret void
387}
388
389define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
390; SI-LABEL: extload_f16_to_f64_arg:
391; SI:       ; %bb.0:
392; SI-NEXT:    s_load_dword s0, s[4:5], 0x2
393; SI-NEXT:    s_waitcnt lgkmcnt(0)
394; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
395; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
396; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
397; SI-NEXT:    s_waitcnt lgkmcnt(0)
398; SI-NEXT:    v_mov_b32_e32 v3, s1
399; SI-NEXT:    v_mov_b32_e32 v2, s0
400; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
401; SI-NEXT:    s_endpgm
402;
403; VI-LABEL: extload_f16_to_f64_arg:
404; VI:       ; %bb.0:
405; VI-NEXT:    s_load_dword s0, s[4:5], 0x8
406; VI-NEXT:    s_waitcnt lgkmcnt(0)
407; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
408; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
409; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
410; VI-NEXT:    s_waitcnt lgkmcnt(0)
411; VI-NEXT:    v_mov_b32_e32 v3, s1
412; VI-NEXT:    v_mov_b32_e32 v2, s0
413; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
414; VI-NEXT:    s_endpgm
415  %ext = fpext half %arg to double
416  store double %ext, double addrspace(1)* %out
417  ret void
418}
419
420define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
421; SI-LABEL: extload_v2f16_to_v2f64_arg:
422; SI:       ; %bb.0:
423; SI-NEXT:    s_load_dword s0, s[4:5], 0x2
424; SI-NEXT:    s_waitcnt lgkmcnt(0)
425; SI-NEXT:    s_lshr_b32 s1, s0, 16
426; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
427; SI-NEXT:    v_cvt_f32_f16_e32 v2, s1
428; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
429; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
430; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
431; SI-NEXT:    s_waitcnt lgkmcnt(0)
432; SI-NEXT:    v_mov_b32_e32 v5, s1
433; SI-NEXT:    v_mov_b32_e32 v4, s0
434; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
435; SI-NEXT:    s_endpgm
436;
437; VI-LABEL: extload_v2f16_to_v2f64_arg:
438; VI:       ; %bb.0:
439; VI-NEXT:    s_load_dword s0, s[4:5], 0x8
440; VI-NEXT:    s_waitcnt lgkmcnt(0)
441; VI-NEXT:    s_lshr_b32 s1, s0, 16
442; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
443; VI-NEXT:    v_cvt_f32_f16_e32 v2, s1
444; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
445; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
446; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
447; VI-NEXT:    s_waitcnt lgkmcnt(0)
448; VI-NEXT:    v_mov_b32_e32 v5, s1
449; VI-NEXT:    v_mov_b32_e32 v4, s0
450; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
451; VI-NEXT:    s_endpgm
452  %ext = fpext <2 x half> %arg to <2 x double>
453  store <2 x double> %ext, <2 x double> addrspace(1)* %out
454  ret void
455}
456
457define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
458; SI-LABEL: extload_v3f16_to_v3f64_arg:
459; SI:       ; %bb.0:
460; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
461; SI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
462; SI-NEXT:    s_waitcnt lgkmcnt(0)
463; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
464; SI-NEXT:    s_lshr_b32 s4, s2, 16
465; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
466; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
467; SI-NEXT:    s_add_u32 s2, s0, 16
468; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v0
469; SI-NEXT:    s_addc_u32 s3, s1, 0
470; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
471; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
472; SI-NEXT:    v_mov_b32_e32 v7, s3
473; SI-NEXT:    v_mov_b32_e32 v6, s2
474; SI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
475; SI-NEXT:    v_mov_b32_e32 v5, s1
476; SI-NEXT:    v_mov_b32_e32 v4, s0
477; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
478; SI-NEXT:    s_endpgm
479;
480; VI-LABEL: extload_v3f16_to_v3f64_arg:
481; VI:       ; %bb.0:
482; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
483; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
484; VI-NEXT:    s_waitcnt lgkmcnt(0)
485; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
486; VI-NEXT:    s_lshr_b32 s4, s2, 16
487; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
488; VI-NEXT:    v_cvt_f32_f16_e32 v2, s4
489; VI-NEXT:    s_add_u32 s2, s0, 16
490; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
491; VI-NEXT:    s_addc_u32 s3, s1, 0
492; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
493; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
494; VI-NEXT:    v_mov_b32_e32 v7, s3
495; VI-NEXT:    v_mov_b32_e32 v6, s2
496; VI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
497; VI-NEXT:    v_mov_b32_e32 v5, s1
498; VI-NEXT:    v_mov_b32_e32 v4, s0
499; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
500; VI-NEXT:    s_endpgm
501  %ext = fpext <3 x half> %arg to <3 x double>
502  store <3 x double> %ext, <3 x double> addrspace(1)* %out
503  ret void
504}
505
506define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
507; SI-LABEL: extload_v4f16_to_v4f64_arg:
508; SI:       ; %bb.0:
509; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
510; SI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
511; SI-NEXT:    s_waitcnt lgkmcnt(0)
512; SI-NEXT:    s_lshr_b32 s4, s3, 16
513; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
514; SI-NEXT:    v_cvt_f32_f16_e32 v5, s3
515; SI-NEXT:    s_lshr_b32 s5, s2, 16
516; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
517; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
518; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
519; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
520; SI-NEXT:    s_add_u32 s2, s0, 16
521; SI-NEXT:    s_addc_u32 s3, s1, 0
522; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
523; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
524; SI-NEXT:    v_mov_b32_e32 v9, s3
525; SI-NEXT:    v_mov_b32_e32 v8, s2
526; SI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
527; SI-NEXT:    s_nop 0
528; SI-NEXT:    v_mov_b32_e32 v5, s1
529; SI-NEXT:    v_mov_b32_e32 v4, s0
530; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
531; SI-NEXT:    s_endpgm
532;
533; VI-LABEL: extload_v4f16_to_v4f64_arg:
534; VI:       ; %bb.0:
535; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
536; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
537; VI-NEXT:    s_waitcnt lgkmcnt(0)
538; VI-NEXT:    s_lshr_b32 s5, s3, 16
539; VI-NEXT:    v_cvt_f32_f16_e32 v4, s5
540; VI-NEXT:    v_cvt_f32_f16_e32 v5, s3
541; VI-NEXT:    s_lshr_b32 s4, s2, 16
542; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
543; VI-NEXT:    v_cvt_f32_f16_e32 v2, s4
544; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
545; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
546; VI-NEXT:    s_add_u32 s2, s0, 16
547; VI-NEXT:    s_addc_u32 s3, s1, 0
548; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
549; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
550; VI-NEXT:    v_mov_b32_e32 v9, s3
551; VI-NEXT:    v_mov_b32_e32 v8, s2
552; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
553; VI-NEXT:    s_nop 0
554; VI-NEXT:    v_mov_b32_e32 v5, s1
555; VI-NEXT:    v_mov_b32_e32 v4, s0
556; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
557; VI-NEXT:    s_endpgm
558  %ext = fpext <4 x half> %arg to <4 x double>
559  store <4 x double> %ext, <4 x double> addrspace(1)* %out
560  ret void
561}
562
563define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
564; SI-LABEL: extload_v8f16_to_v8f64_arg:
565; SI:       ; %bb.0:
566; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
567; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
568; SI-NEXT:    s_waitcnt lgkmcnt(0)
569; SI-NEXT:    s_lshr_b32 s4, s3, 16
570; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
571; SI-NEXT:    v_cvt_f32_f16_e32 v12, s3
572; SI-NEXT:    s_lshr_b32 s5, s2, 16
573; SI-NEXT:    s_lshr_b32 s8, s1, 16
574; SI-NEXT:    s_lshr_b32 s4, s0, 16
575; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
576; SI-NEXT:    v_cvt_f32_f16_e32 v8, s2
577; SI-NEXT:    v_cvt_f32_f16_e32 v9, s0
578; SI-NEXT:    s_add_u32 s0, s6, 48
579; SI-NEXT:    v_cvt_f32_f16_e32 v5, s1
580; SI-NEXT:    v_cvt_f64_f32_e32 v[14:15], v0
581; SI-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
582; SI-NEXT:    s_addc_u32 s1, s7, 0
583; SI-NEXT:    v_cvt_f32_f16_e32 v4, s8
584; SI-NEXT:    v_mov_b32_e32 v17, s1
585; SI-NEXT:    v_mov_b32_e32 v16, s0
586; SI-NEXT:    s_add_u32 s0, s6, 32
587; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
588; SI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v1
589; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
590; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
591; SI-NEXT:    s_addc_u32 s1, s7, 0
592; SI-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
593; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
594; SI-NEXT:    v_mov_b32_e32 v13, s1
595; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
596; SI-NEXT:    v_mov_b32_e32 v12, s0
597; SI-NEXT:    s_add_u32 s0, s6, 16
598; SI-NEXT:    s_addc_u32 s1, s7, 0
599; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
600; SI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
601; SI-NEXT:    s_nop 0
602; SI-NEXT:    v_mov_b32_e32 v9, s1
603; SI-NEXT:    v_mov_b32_e32 v8, s0
604; SI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
605; SI-NEXT:    s_nop 0
606; SI-NEXT:    v_mov_b32_e32 v4, s6
607; SI-NEXT:    v_mov_b32_e32 v5, s7
608; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
609; SI-NEXT:    s_endpgm
610;
611; VI-LABEL: extload_v8f16_to_v8f64_arg:
612; VI:       ; %bb.0:
613; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
614; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
615; VI-NEXT:    s_waitcnt lgkmcnt(0)
616; VI-NEXT:    s_lshr_b32 s4, s0, 16
617; VI-NEXT:    s_lshr_b32 s8, s2, 16
618; VI-NEXT:    s_lshr_b32 s9, s3, 16
619; VI-NEXT:    v_cvt_f32_f16_e32 v0, s4
620; VI-NEXT:    v_cvt_f32_f16_e32 v4, s8
621; VI-NEXT:    v_cvt_f32_f16_e32 v5, s9
622; VI-NEXT:    v_cvt_f32_f16_e32 v12, s3
623; VI-NEXT:    s_lshr_b32 s5, s1, 16
624; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
625; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
626; VI-NEXT:    v_cvt_f32_f16_e32 v8, s2
627; VI-NEXT:    s_add_u32 s0, s6, 48
628; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v4
629; VI-NEXT:    v_cvt_f64_f32_e32 v[14:15], v5
630; VI-NEXT:    v_cvt_f32_f16_e32 v4, s1
631; VI-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
632; VI-NEXT:    s_addc_u32 s1, s7, 0
633; VI-NEXT:    v_cvt_f32_f16_e32 v1, s5
634; VI-NEXT:    v_mov_b32_e32 v17, s1
635; VI-NEXT:    v_mov_b32_e32 v16, s0
636; VI-NEXT:    s_add_u32 s0, s6, 32
637; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
638; VI-NEXT:    s_addc_u32 s1, s7, 0
639; VI-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
640; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v1
641; VI-NEXT:    v_mov_b32_e32 v13, s1
642; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
643; VI-NEXT:    v_mov_b32_e32 v12, s0
644; VI-NEXT:    s_add_u32 s0, s6, 16
645; VI-NEXT:    s_addc_u32 s1, s7, 0
646; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
647; VI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
648; VI-NEXT:    s_nop 0
649; VI-NEXT:    v_mov_b32_e32 v9, s1
650; VI-NEXT:    v_mov_b32_e32 v8, s0
651; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
652; VI-NEXT:    s_nop 0
653; VI-NEXT:    v_mov_b32_e32 v4, s6
654; VI-NEXT:    v_mov_b32_e32 v5, s7
655; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
656; VI-NEXT:    s_endpgm
657  %ext = fpext <8 x half> %arg to <8 x double>
658  store <8 x double> %ext, <8 x double> addrspace(1)* %out
659  ret void
660}
661
662define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
663; GCN-LABEL: global_load_store_f16:
664; GCN:       ; %bb.0:
665; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
666; GCN-NEXT:    s_waitcnt lgkmcnt(0)
667; GCN-NEXT:    v_mov_b32_e32 v2, s2
668; GCN-NEXT:    v_mov_b32_e32 v3, s3
669; GCN-NEXT:    flat_load_ushort v2, v[2:3]
670; GCN-NEXT:    v_mov_b32_e32 v0, s0
671; GCN-NEXT:    v_mov_b32_e32 v1, s1
672; GCN-NEXT:    s_waitcnt vmcnt(0)
673; GCN-NEXT:    flat_store_short v[0:1], v2
674; GCN-NEXT:    s_endpgm
675  %val = load half, half addrspace(1)* %in
676  store half %val, half addrspace(1)* %out
677  ret void
678}
679
680define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
681; GCN-LABEL: global_load_store_v2f16:
682; GCN:       ; %bb.0:
683; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
684; GCN-NEXT:    s_waitcnt lgkmcnt(0)
685; GCN-NEXT:    v_mov_b32_e32 v2, s2
686; GCN-NEXT:    v_mov_b32_e32 v3, s3
687; GCN-NEXT:    flat_load_dword v2, v[2:3]
688; GCN-NEXT:    v_mov_b32_e32 v0, s0
689; GCN-NEXT:    v_mov_b32_e32 v1, s1
690; GCN-NEXT:    s_waitcnt vmcnt(0)
691; GCN-NEXT:    flat_store_dword v[0:1], v2
692; GCN-NEXT:    s_endpgm
693  %val = load <2 x half>, <2 x half> addrspace(1)* %in
694  store <2 x half> %val, <2 x half> addrspace(1)* %out
695  ret void
696}
697
698define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
699; GCN-LABEL: global_load_store_v4f16:
700; GCN:       ; %bb.0:
701; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
702; GCN-NEXT:    s_waitcnt lgkmcnt(0)
703; GCN-NEXT:    v_mov_b32_e32 v0, s0
704; GCN-NEXT:    v_mov_b32_e32 v1, s1
705; GCN-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
706; GCN-NEXT:    v_mov_b32_e32 v2, s2
707; GCN-NEXT:    v_mov_b32_e32 v3, s3
708; GCN-NEXT:    s_waitcnt vmcnt(0)
709; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
710; GCN-NEXT:    s_endpgm
711  %val = load <4 x half>, <4 x half> addrspace(1)* %in
712  store <4 x half> %val, <4 x half> addrspace(1)* %out
713  ret void
714}
715
716define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
717; GCN-LABEL: global_load_store_v8f16:
718; GCN:       ; %bb.0:
719; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
720; GCN-NEXT:    s_waitcnt lgkmcnt(0)
721; GCN-NEXT:    v_mov_b32_e32 v0, s2
722; GCN-NEXT:    v_mov_b32_e32 v1, s3
723; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
724; GCN-NEXT:    v_mov_b32_e32 v4, s0
725; GCN-NEXT:    v_mov_b32_e32 v5, s1
726; GCN-NEXT:    s_waitcnt vmcnt(0)
727; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
728; GCN-NEXT:    s_endpgm
729  %val = load <8 x half>, <8 x half> addrspace(1)* %in
730  store <8 x half> %val, <8 x half> addrspace(1)* %out
731  ret void
732}
733
734define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
735; GCN-LABEL: global_extload_f16_to_f32:
736; GCN:       ; %bb.0:
737; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
738; GCN-NEXT:    s_waitcnt lgkmcnt(0)
739; GCN-NEXT:    v_mov_b32_e32 v0, s2
740; GCN-NEXT:    v_mov_b32_e32 v1, s3
741; GCN-NEXT:    flat_load_ushort v0, v[0:1]
742; GCN-NEXT:    v_mov_b32_e32 v1, s1
743; GCN-NEXT:    s_waitcnt vmcnt(0)
744; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v0
745; GCN-NEXT:    v_mov_b32_e32 v0, s0
746; GCN-NEXT:    flat_store_dword v[0:1], v2
747; GCN-NEXT:    s_endpgm
748  %val = load half, half addrspace(1)* %in
749  %cvt = fpext half %val to float
750  store float %cvt, float addrspace(1)* %out
751  ret void
752}
753
754define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
755; SI-LABEL: global_extload_v2f16_to_v2f32:
756; SI:       ; %bb.0:
757; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
758; SI-NEXT:    s_waitcnt lgkmcnt(0)
759; SI-NEXT:    v_mov_b32_e32 v0, s2
760; SI-NEXT:    v_mov_b32_e32 v1, s3
761; SI-NEXT:    flat_load_dword v1, v[0:1]
762; SI-NEXT:    v_mov_b32_e32 v2, s0
763; SI-NEXT:    v_mov_b32_e32 v3, s1
764; SI-NEXT:    s_waitcnt vmcnt(0)
765; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
766; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
767; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
768; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
769; SI-NEXT:    s_endpgm
770;
771; VI-LABEL: global_extload_v2f16_to_v2f32:
772; VI:       ; %bb.0:
773; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
774; VI-NEXT:    s_waitcnt lgkmcnt(0)
775; VI-NEXT:    v_mov_b32_e32 v0, s2
776; VI-NEXT:    v_mov_b32_e32 v1, s3
777; VI-NEXT:    flat_load_dword v1, v[0:1]
778; VI-NEXT:    v_mov_b32_e32 v2, s0
779; VI-NEXT:    v_mov_b32_e32 v3, s1
780; VI-NEXT:    s_waitcnt vmcnt(0)
781; VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
782; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
783; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
784; VI-NEXT:    s_endpgm
785  %val = load <2 x half>, <2 x half> addrspace(1)* %in
786  %cvt = fpext <2 x half> %val to <2 x float>
787  store <2 x float> %cvt, <2 x float> addrspace(1)* %out
788  ret void
789}
790
791define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
792; SI-LABEL: global_extload_v3f16_to_v3f32:
793; SI:       ; %bb.0:
794; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
795; SI-NEXT:    s_waitcnt lgkmcnt(0)
796; SI-NEXT:    v_mov_b32_e32 v0, s2
797; SI-NEXT:    v_mov_b32_e32 v1, s3
798; SI-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
799; SI-NEXT:    v_mov_b32_e32 v3, s0
800; SI-NEXT:    v_mov_b32_e32 v4, s1
801; SI-NEXT:    s_waitcnt vmcnt(0)
802; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
803; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
804; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
805; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
806; SI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
807; SI-NEXT:    s_endpgm
808;
809; VI-LABEL: global_extload_v3f16_to_v3f32:
810; VI:       ; %bb.0:
811; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
812; VI-NEXT:    s_waitcnt lgkmcnt(0)
813; VI-NEXT:    v_mov_b32_e32 v0, s2
814; VI-NEXT:    v_mov_b32_e32 v1, s3
815; VI-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
816; VI-NEXT:    v_mov_b32_e32 v3, s0
817; VI-NEXT:    v_mov_b32_e32 v4, s1
818; VI-NEXT:    s_waitcnt vmcnt(0)
819; VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
820; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
821; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
822; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
823; VI-NEXT:    s_endpgm
824  %val = load <3 x half>, <3 x half> addrspace(1)* %in
825  %cvt = fpext <3 x half> %val to <3 x float>
826  store <3 x float> %cvt, <3 x float> addrspace(1)* %out
827  ret void
828}
829
830define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
831; SI-LABEL: global_extload_v4f16_to_v4f32:
832; SI:       ; %bb.0:
833; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
834; SI-NEXT:    s_waitcnt lgkmcnt(0)
835; SI-NEXT:    v_mov_b32_e32 v0, s2
836; SI-NEXT:    v_mov_b32_e32 v1, s3
837; SI-NEXT:    flat_load_dwordx2 v[3:4], v[0:1]
838; SI-NEXT:    v_mov_b32_e32 v5, s1
839; SI-NEXT:    s_waitcnt vmcnt(0)
840; SI-NEXT:    v_cvt_f32_f16_e32 v2, v4
841; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
842; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
843; SI-NEXT:    v_cvt_f32_f16_e32 v0, v3
844; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
845; SI-NEXT:    v_cvt_f32_f16_e32 v1, v4
846; SI-NEXT:    v_mov_b32_e32 v4, s0
847; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
848; SI-NEXT:    s_endpgm
849;
850; VI-LABEL: global_extload_v4f16_to_v4f32:
851; VI:       ; %bb.0:
852; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
853; VI-NEXT:    s_waitcnt lgkmcnt(0)
854; VI-NEXT:    v_mov_b32_e32 v0, s2
855; VI-NEXT:    v_mov_b32_e32 v1, s3
856; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
857; VI-NEXT:    s_waitcnt vmcnt(0)
858; VI-NEXT:    v_cvt_f32_f16_e32 v0, v4
859; VI-NEXT:    v_cvt_f32_f16_e32 v2, v5
860; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
861; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
862; VI-NEXT:    v_mov_b32_e32 v4, s0
863; VI-NEXT:    v_mov_b32_e32 v5, s1
864; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
865; VI-NEXT:    s_endpgm
866  %val = load <4 x half>, <4 x half> addrspace(1)* %in
867  %cvt = fpext <4 x half> %val to <4 x float>
868  store <4 x float> %cvt, <4 x float> addrspace(1)* %out
869  ret void
870}
871
872define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
873; SI-LABEL: global_extload_v8f16_to_v8f32:
874; SI:       ; %bb.0:
875; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
876; SI-NEXT:    s_waitcnt lgkmcnt(0)
877; SI-NEXT:    v_mov_b32_e32 v0, s2
878; SI-NEXT:    v_mov_b32_e32 v1, s3
879; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
880; SI-NEXT:    s_add_u32 s2, s0, 16
881; SI-NEXT:    s_addc_u32 s3, s1, 0
882; SI-NEXT:    v_mov_b32_e32 v13, s1
883; SI-NEXT:    v_mov_b32_e32 v12, s0
884; SI-NEXT:    s_waitcnt vmcnt(0)
885; SI-NEXT:    v_cvt_f32_f16_e32 v10, v3
886; SI-NEXT:    v_cvt_f32_f16_e32 v8, v2
887; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
888; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
889; SI-NEXT:    v_cvt_f32_f16_e32 v6, v1
890; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
891; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
892; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
893; SI-NEXT:    v_cvt_f32_f16_e32 v11, v3
894; SI-NEXT:    v_cvt_f32_f16_e32 v9, v2
895; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
896; SI-NEXT:    v_cvt_f32_f16_e32 v5, v0
897; SI-NEXT:    v_mov_b32_e32 v0, s2
898; SI-NEXT:    v_mov_b32_e32 v1, s3
899; SI-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
900; SI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
901; SI-NEXT:    s_endpgm
902;
903; VI-LABEL: global_extload_v8f16_to_v8f32:
904; VI:       ; %bb.0:
905; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
906; VI-NEXT:    s_waitcnt lgkmcnt(0)
907; VI-NEXT:    v_mov_b32_e32 v0, s2
908; VI-NEXT:    v_mov_b32_e32 v1, s3
909; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
910; VI-NEXT:    s_add_u32 s2, s0, 16
911; VI-NEXT:    s_addc_u32 s3, s1, 0
912; VI-NEXT:    v_mov_b32_e32 v13, s1
913; VI-NEXT:    v_mov_b32_e32 v12, s0
914; VI-NEXT:    s_waitcnt vmcnt(0)
915; VI-NEXT:    v_cvt_f32_f16_e32 v10, v3
916; VI-NEXT:    v_cvt_f32_f16_e32 v8, v2
917; VI-NEXT:    v_cvt_f32_f16_sdwa v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
918; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
919; VI-NEXT:    v_cvt_f32_f16_e32 v6, v1
920; VI-NEXT:    v_cvt_f32_f16_e32 v4, v0
921; VI-NEXT:    v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
922; VI-NEXT:    v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
923; VI-NEXT:    v_mov_b32_e32 v0, s2
924; VI-NEXT:    v_mov_b32_e32 v1, s3
925; VI-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
926; VI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
927; VI-NEXT:    s_endpgm
928  %val = load <8 x half>, <8 x half> addrspace(1)* %in
929  %cvt = fpext <8 x half> %val to <8 x float>
930  store <8 x float> %cvt, <8 x float> addrspace(1)* %out
931  ret void
932}
933
934define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
935; SI-LABEL: global_extload_v16f16_to_v16f32:
936; SI:       ; %bb.0:
937; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
938; SI-NEXT:    s_waitcnt lgkmcnt(0)
939; SI-NEXT:    s_add_u32 s4, s2, 16
940; SI-NEXT:    v_mov_b32_e32 v5, s3
941; SI-NEXT:    s_addc_u32 s5, s3, 0
942; SI-NEXT:    v_mov_b32_e32 v0, s4
943; SI-NEXT:    v_mov_b32_e32 v4, s2
944; SI-NEXT:    v_mov_b32_e32 v1, s5
945; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
946; SI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
947; SI-NEXT:    s_add_u32 s2, s0, 16
948; SI-NEXT:    s_addc_u32 s3, s1, 0
949; SI-NEXT:    v_mov_b32_e32 v14, s3
950; SI-NEXT:    v_mov_b32_e32 v13, s2
951; SI-NEXT:    s_add_u32 s2, s0, 48
952; SI-NEXT:    s_addc_u32 s3, s1, 0
953; SI-NEXT:    s_waitcnt vmcnt(1)
954; SI-NEXT:    v_cvt_f32_f16_e32 v8, v1
955; SI-NEXT:    s_waitcnt vmcnt(0)
956; SI-NEXT:    v_cvt_f32_f16_e32 v11, v7
957; SI-NEXT:    v_cvt_f32_f16_e32 v9, v6
958; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
959; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
960; SI-NEXT:    v_cvt_f32_f16_e32 v12, v7
961; SI-NEXT:    v_cvt_f32_f16_e32 v10, v6
962; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
963; SI-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
964; SI-NEXT:    s_nop 0
965; SI-NEXT:    v_cvt_f32_f16_e32 v12, v3
966; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
967; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
968; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
969; SI-NEXT:    v_cvt_f32_f16_e32 v6, v0
970; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
971; SI-NEXT:    v_cvt_f32_f16_e32 v10, v2
972; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
973; SI-NEXT:    v_cvt_f32_f16_e32 v2, v5
974; SI-NEXT:    v_cvt_f32_f16_e32 v0, v4
975; SI-NEXT:    v_mov_b32_e32 v5, s1
976; SI-NEXT:    v_cvt_f32_f16_e32 v9, v1
977; SI-NEXT:    v_cvt_f32_f16_e32 v13, v3
978; SI-NEXT:    v_cvt_f32_f16_e32 v3, v16
979; SI-NEXT:    v_cvt_f32_f16_e32 v1, v17
980; SI-NEXT:    v_mov_b32_e32 v4, s0
981; SI-NEXT:    s_add_u32 s0, s0, 32
982; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
983; SI-NEXT:    s_addc_u32 s1, s1, 0
984; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
985; SI-NEXT:    v_mov_b32_e32 v15, s3
986; SI-NEXT:    v_mov_b32_e32 v17, s1
987; SI-NEXT:    v_mov_b32_e32 v14, s2
988; SI-NEXT:    v_mov_b32_e32 v16, s0
989; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
990; SI-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
991; SI-NEXT:    flat_store_dwordx4 v[16:17], v[6:9]
992; SI-NEXT:    s_endpgm
993;
994; VI-LABEL: global_extload_v16f16_to_v16f32:
995; VI:       ; %bb.0:
996; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
997; VI-NEXT:    s_waitcnt lgkmcnt(0)
998; VI-NEXT:    v_mov_b32_e32 v0, s2
999; VI-NEXT:    v_mov_b32_e32 v1, s3
1000; VI-NEXT:    s_add_u32 s2, s2, 16
1001; VI-NEXT:    s_addc_u32 s3, s3, 0
1002; VI-NEXT:    v_mov_b32_e32 v5, s3
1003; VI-NEXT:    v_mov_b32_e32 v4, s2
1004; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1005; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1006; VI-NEXT:    s_add_u32 s2, s0, 16
1007; VI-NEXT:    s_addc_u32 s3, s1, 0
1008; VI-NEXT:    v_mov_b32_e32 v19, s3
1009; VI-NEXT:    v_mov_b32_e32 v18, s2
1010; VI-NEXT:    s_add_u32 s2, s0, 48
1011; VI-NEXT:    v_mov_b32_e32 v17, s1
1012; VI-NEXT:    s_addc_u32 s3, s1, 0
1013; VI-NEXT:    v_mov_b32_e32 v16, s0
1014; VI-NEXT:    s_add_u32 s0, s0, 32
1015; VI-NEXT:    s_addc_u32 s1, s1, 0
1016; VI-NEXT:    v_mov_b32_e32 v21, s3
1017; VI-NEXT:    v_mov_b32_e32 v20, s2
1018; VI-NEXT:    s_waitcnt vmcnt(1)
1019; VI-NEXT:    v_cvt_f32_f16_e32 v14, v3
1020; VI-NEXT:    v_cvt_f32_f16_e32 v12, v2
1021; VI-NEXT:    v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1022; VI-NEXT:    v_cvt_f32_f16_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1023; VI-NEXT:    v_cvt_f32_f16_e32 v10, v1
1024; VI-NEXT:    v_cvt_f32_f16_e32 v8, v0
1025; VI-NEXT:    v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1026; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1027; VI-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
1028; VI-NEXT:    s_waitcnt vmcnt(1)
1029; VI-NEXT:    v_cvt_f32_f16_e32 v2, v5
1030; VI-NEXT:    v_cvt_f32_f16_e32 v14, v7
1031; VI-NEXT:    v_cvt_f32_f16_e32 v12, v6
1032; VI-NEXT:    v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1033; VI-NEXT:    v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1034; VI-NEXT:    v_cvt_f32_f16_e32 v0, v4
1035; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1036; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1037; VI-NEXT:    v_mov_b32_e32 v5, s1
1038; VI-NEXT:    v_mov_b32_e32 v4, s0
1039; VI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
1040; VI-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
1041; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1042; VI-NEXT:    s_endpgm
1043  %val = load <16 x half>, <16 x half> addrspace(1)* %in
1044  %cvt = fpext <16 x half> %val to <16 x float>
1045  store <16 x float> %cvt, <16 x float> addrspace(1)* %out
1046  ret void
1047}
1048
1049define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
1050; GCN-LABEL: global_extload_f16_to_f64:
1051; GCN:       ; %bb.0:
1052; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1053; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1054; GCN-NEXT:    v_mov_b32_e32 v0, s2
1055; GCN-NEXT:    v_mov_b32_e32 v1, s3
1056; GCN-NEXT:    flat_load_ushort v0, v[0:1]
1057; GCN-NEXT:    v_mov_b32_e32 v2, s0
1058; GCN-NEXT:    v_mov_b32_e32 v3, s1
1059; GCN-NEXT:    s_waitcnt vmcnt(0)
1060; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
1061; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
1062; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1063; GCN-NEXT:    s_endpgm
1064  %val = load half, half addrspace(1)* %in
1065  %cvt = fpext half %val to double
1066  store double %cvt, double addrspace(1)* %out
1067  ret void
1068}
1069
1070define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
1071; SI-LABEL: global_extload_v2f16_to_v2f64:
1072; SI:       ; %bb.0:
1073; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1074; SI-NEXT:    s_waitcnt lgkmcnt(0)
1075; SI-NEXT:    v_mov_b32_e32 v0, s2
1076; SI-NEXT:    v_mov_b32_e32 v1, s3
1077; SI-NEXT:    flat_load_dword v0, v[0:1]
1078; SI-NEXT:    v_mov_b32_e32 v4, s0
1079; SI-NEXT:    v_mov_b32_e32 v5, s1
1080; SI-NEXT:    s_waitcnt vmcnt(0)
1081; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1082; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1083; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
1084; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
1085; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1086; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1087; SI-NEXT:    s_endpgm
1088;
1089; VI-LABEL: global_extload_v2f16_to_v2f64:
1090; VI:       ; %bb.0:
1091; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1092; VI-NEXT:    s_waitcnt lgkmcnt(0)
1093; VI-NEXT:    v_mov_b32_e32 v0, s2
1094; VI-NEXT:    v_mov_b32_e32 v1, s3
1095; VI-NEXT:    flat_load_dword v0, v[0:1]
1096; VI-NEXT:    v_mov_b32_e32 v4, s0
1097; VI-NEXT:    v_mov_b32_e32 v5, s1
1098; VI-NEXT:    s_waitcnt vmcnt(0)
1099; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1100; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1101; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
1102; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1103; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1104; VI-NEXT:    s_endpgm
1105  %val = load <2 x half>, <2 x half> addrspace(1)* %in
1106  %cvt = fpext <2 x half> %val to <2 x double>
1107  store <2 x double> %cvt, <2 x double> addrspace(1)* %out
1108  ret void
1109}
1110
1111define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
1112; SI-LABEL: global_extload_v3f16_to_v3f64:
1113; SI:       ; %bb.0:
1114; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1115; SI-NEXT:    s_waitcnt lgkmcnt(0)
1116; SI-NEXT:    v_mov_b32_e32 v0, s2
1117; SI-NEXT:    v_mov_b32_e32 v1, s3
1118; SI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1119; SI-NEXT:    s_add_u32 s2, s0, 16
1120; SI-NEXT:    s_addc_u32 s3, s1, 0
1121; SI-NEXT:    v_mov_b32_e32 v7, s3
1122; SI-NEXT:    v_mov_b32_e32 v6, s2
1123; SI-NEXT:    s_waitcnt vmcnt(0)
1124; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1125; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
1126; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1127; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1128; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
1129; SI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
1130; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
1131; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1132; SI-NEXT:    v_mov_b32_e32 v5, s1
1133; SI-NEXT:    v_mov_b32_e32 v4, s0
1134; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1135; SI-NEXT:    s_endpgm
1136;
1137; VI-LABEL: global_extload_v3f16_to_v3f64:
1138; VI:       ; %bb.0:
1139; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1140; VI-NEXT:    s_waitcnt lgkmcnt(0)
1141; VI-NEXT:    v_mov_b32_e32 v0, s2
1142; VI-NEXT:    v_mov_b32_e32 v1, s3
1143; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1144; VI-NEXT:    s_add_u32 s2, s0, 16
1145; VI-NEXT:    s_addc_u32 s3, s1, 0
1146; VI-NEXT:    v_mov_b32_e32 v5, s1
1147; VI-NEXT:    v_mov_b32_e32 v4, s0
1148; VI-NEXT:    s_waitcnt vmcnt(0)
1149; VI-NEXT:    v_cvt_f32_f16_e32 v3, v1
1150; VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1151; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1152; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v3
1153; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
1154; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
1155; VI-NEXT:    v_mov_b32_e32 v9, s3
1156; VI-NEXT:    v_mov_b32_e32 v8, s2
1157; VI-NEXT:    flat_store_dwordx2 v[8:9], v[6:7]
1158; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1159; VI-NEXT:    s_endpgm
1160  %val = load <3 x half>, <3 x half> addrspace(1)* %in
1161  %cvt = fpext <3 x half> %val to <3 x double>
1162  store <3 x double> %cvt, <3 x double> addrspace(1)* %out
1163  ret void
1164}
1165
1166define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
1167; SI-LABEL: global_extload_v4f16_to_v4f64:
1168; SI:       ; %bb.0:
1169; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1170; SI-NEXT:    s_waitcnt lgkmcnt(0)
1171; SI-NEXT:    v_mov_b32_e32 v0, s2
1172; SI-NEXT:    v_mov_b32_e32 v1, s3
1173; SI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1174; SI-NEXT:    s_add_u32 s2, s0, 16
1175; SI-NEXT:    s_addc_u32 s3, s1, 0
1176; SI-NEXT:    v_mov_b32_e32 v9, s1
1177; SI-NEXT:    v_mov_b32_e32 v8, s0
1178; SI-NEXT:    s_waitcnt vmcnt(0)
1179; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
1180; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
1181; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1182; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1183; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1184; SI-NEXT:    v_cvt_f32_f16_e32 v10, v0
1185; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
1186; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
1187; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v2
1188; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v10
1189; SI-NEXT:    v_mov_b32_e32 v11, s3
1190; SI-NEXT:    v_mov_b32_e32 v10, s2
1191; SI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
1192; SI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1193; SI-NEXT:    s_endpgm
1194;
1195; VI-LABEL: global_extload_v4f16_to_v4f64:
1196; VI:       ; %bb.0:
1197; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1198; VI-NEXT:    s_waitcnt lgkmcnt(0)
1199; VI-NEXT:    v_mov_b32_e32 v0, s2
1200; VI-NEXT:    v_mov_b32_e32 v1, s3
1201; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1202; VI-NEXT:    s_add_u32 s2, s0, 16
1203; VI-NEXT:    s_addc_u32 s3, s1, 0
1204; VI-NEXT:    v_mov_b32_e32 v9, s1
1205; VI-NEXT:    v_mov_b32_e32 v8, s0
1206; VI-NEXT:    s_waitcnt vmcnt(0)
1207; VI-NEXT:    v_cvt_f32_f16_e32 v3, v1
1208; VI-NEXT:    v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1209; VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1210; VI-NEXT:    v_cvt_f32_f16_sdwa v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1211; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
1212; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
1213; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
1214; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v10
1215; VI-NEXT:    v_mov_b32_e32 v11, s3
1216; VI-NEXT:    v_mov_b32_e32 v10, s2
1217; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
1218; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1219; VI-NEXT:    s_endpgm
1220  %val = load <4 x half>, <4 x half> addrspace(1)* %in
1221  %cvt = fpext <4 x half> %val to <4 x double>
1222  store <4 x double> %cvt, <4 x double> addrspace(1)* %out
1223  ret void
1224}
1225
1226define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
1227; SI-LABEL: global_extload_v8f16_to_v8f64:
1228; SI:       ; %bb.0:
1229; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1230; SI-NEXT:    s_waitcnt lgkmcnt(0)
1231; SI-NEXT:    v_mov_b32_e32 v0, s2
1232; SI-NEXT:    v_mov_b32_e32 v1, s3
1233; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1234; SI-NEXT:    s_add_u32 s2, s0, 48
1235; SI-NEXT:    s_addc_u32 s3, s1, 0
1236; SI-NEXT:    v_mov_b32_e32 v7, s3
1237; SI-NEXT:    v_mov_b32_e32 v6, s2
1238; SI-NEXT:    s_add_u32 s2, s0, 32
1239; SI-NEXT:    v_mov_b32_e32 v13, s1
1240; SI-NEXT:    s_addc_u32 s3, s1, 0
1241; SI-NEXT:    v_mov_b32_e32 v12, s0
1242; SI-NEXT:    s_add_u32 s0, s0, 16
1243; SI-NEXT:    v_mov_b32_e32 v15, s3
1244; SI-NEXT:    s_addc_u32 s1, s1, 0
1245; SI-NEXT:    v_mov_b32_e32 v14, s2
1246; SI-NEXT:    s_waitcnt vmcnt(0)
1247; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
1248; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1249; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
1250; SI-NEXT:    v_cvt_f32_f16_e32 v8, v2
1251; SI-NEXT:    v_cvt_f32_f16_e32 v2, v4
1252; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
1253; SI-NEXT:    v_cvt_f32_f16_e32 v10, v1
1254; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
1255; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
1256; SI-NEXT:    v_cvt_f32_f16_e32 v16, v5
1257; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
1258; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1259; SI-NEXT:    v_cvt_f32_f16_e32 v17, v9
1260; SI-NEXT:    v_cvt_f32_f16_e32 v18, v11
1261; SI-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
1262; SI-NEXT:    s_nop 0
1263; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v10
1264; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
1265; SI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v16
1266; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
1267; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
1268; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v18
1269; SI-NEXT:    v_mov_b32_e32 v17, s1
1270; SI-NEXT:    v_mov_b32_e32 v16, s0
1271; SI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
1272; SI-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
1273; SI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
1274; SI-NEXT:    s_endpgm
1275;
1276; VI-LABEL: global_extload_v8f16_to_v8f64:
1277; VI:       ; %bb.0:
1278; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1279; VI-NEXT:    s_waitcnt lgkmcnt(0)
1280; VI-NEXT:    v_mov_b32_e32 v0, s2
1281; VI-NEXT:    v_mov_b32_e32 v1, s3
1282; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1283; VI-NEXT:    s_add_u32 s2, s0, 48
1284; VI-NEXT:    s_addc_u32 s3, s1, 0
1285; VI-NEXT:    v_mov_b32_e32 v8, s3
1286; VI-NEXT:    v_mov_b32_e32 v7, s2
1287; VI-NEXT:    s_add_u32 s2, s0, 32
1288; VI-NEXT:    v_mov_b32_e32 v13, s1
1289; VI-NEXT:    s_addc_u32 s3, s1, 0
1290; VI-NEXT:    v_mov_b32_e32 v12, s0
1291; VI-NEXT:    s_add_u32 s0, s0, 16
1292; VI-NEXT:    v_mov_b32_e32 v15, s3
1293; VI-NEXT:    s_addc_u32 s1, s1, 0
1294; VI-NEXT:    v_mov_b32_e32 v14, s2
1295; VI-NEXT:    s_waitcnt vmcnt(0)
1296; VI-NEXT:    v_cvt_f32_f16_e32 v9, v0
1297; VI-NEXT:    v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1298; VI-NEXT:    v_cvt_f32_f16_e32 v0, v3
1299; VI-NEXT:    v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1300; VI-NEXT:    v_cvt_f32_f16_e32 v10, v1
1301; VI-NEXT:    v_cvt_f32_f16_e32 v11, v2
1302; VI-NEXT:    v_cvt_f64_f32_e32 v[3:4], v0
1303; VI-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
1304; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1305; VI-NEXT:    v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1306; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
1307; VI-NEXT:    flat_store_dwordx4 v[7:8], v[3:6]
1308; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v11
1309; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
1310; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
1311; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v17
1312; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v16
1313; VI-NEXT:    v_mov_b32_e32 v17, s1
1314; VI-NEXT:    v_mov_b32_e32 v16, s0
1315; VI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
1316; VI-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
1317; VI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
1318; VI-NEXT:    s_endpgm
1319  %val = load <8 x half>, <8 x half> addrspace(1)* %in
1320  %cvt = fpext <8 x half> %val to <8 x double>
1321  store <8 x double> %cvt, <8 x double> addrspace(1)* %out
1322  ret void
1323}
1324
1325define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
1326; SI-LABEL: global_extload_v16f16_to_v16f64:
1327; SI:       ; %bb.0:
1328; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1329; SI-NEXT:    s_waitcnt lgkmcnt(0)
1330; SI-NEXT:    v_mov_b32_e32 v0, s2
1331; SI-NEXT:    v_mov_b32_e32 v1, s3
1332; SI-NEXT:    s_add_u32 s2, s2, 16
1333; SI-NEXT:    s_addc_u32 s3, s3, 0
1334; SI-NEXT:    v_mov_b32_e32 v2, s2
1335; SI-NEXT:    v_mov_b32_e32 v3, s3
1336; SI-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
1337; SI-NEXT:    flat_load_dwordx4 v[0:3], v[2:3]
1338; SI-NEXT:    s_add_u32 s2, s0, 48
1339; SI-NEXT:    s_addc_u32 s3, s1, 0
1340; SI-NEXT:    v_mov_b32_e32 v14, s3
1341; SI-NEXT:    v_mov_b32_e32 v13, s2
1342; SI-NEXT:    s_add_u32 s2, s0, 32
1343; SI-NEXT:    s_addc_u32 s3, s1, 0
1344; SI-NEXT:    v_mov_b32_e32 v16, s3
1345; SI-NEXT:    v_mov_b32_e32 v15, s2
1346; SI-NEXT:    s_add_u32 s2, s0, 16
1347; SI-NEXT:    s_addc_u32 s3, s1, 0
1348; SI-NEXT:    v_mov_b32_e32 v18, s3
1349; SI-NEXT:    v_mov_b32_e32 v17, s2
1350; SI-NEXT:    s_add_u32 s2, s0, 0x70
1351; SI-NEXT:    s_addc_u32 s3, s1, 0
1352; SI-NEXT:    v_mov_b32_e32 v12, s1
1353; SI-NEXT:    v_mov_b32_e32 v11, s0
1354; SI-NEXT:    s_waitcnt vmcnt(1)
1355; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
1356; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
1357; SI-NEXT:    v_cvt_f32_f16_e32 v9, v8
1358; SI-NEXT:    s_waitcnt vmcnt(0)
1359; SI-NEXT:    v_cvt_f32_f16_e32 v19, v3
1360; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v0
1361; SI-NEXT:    v_cvt_f64_f32_e32 v[7:8], v7
1362; SI-NEXT:    v_cvt_f64_f32_e32 v[9:10], v9
1363; SI-NEXT:    flat_store_dwordx4 v[13:14], v[7:10]
1364; SI-NEXT:    s_nop 0
1365; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
1366; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
1367; SI-NEXT:    v_cvt_f32_f16_e32 v8, v7
1368; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
1369; SI-NEXT:    v_cvt_f32_f16_e32 v21, v0
1370; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
1371; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
1372; SI-NEXT:    flat_store_dwordx4 v[15:16], v[6:9]
1373; SI-NEXT:    s_nop 0
1374; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
1375; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
1376; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
1377; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
1378; SI-NEXT:    v_cvt_f32_f16_e32 v9, v4
1379; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
1380; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
1381; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
1382; SI-NEXT:    flat_store_dwordx4 v[17:18], v[4:7]
1383; SI-NEXT:    s_nop 0
1384; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
1385; SI-NEXT:    v_cvt_f32_f16_e32 v6, v2
1386; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
1387; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
1388; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
1389; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
1390; SI-NEXT:    v_cvt_f32_f16_e32 v8, v10
1391; SI-NEXT:    v_mov_b32_e32 v14, s3
1392; SI-NEXT:    v_mov_b32_e32 v13, s2
1393; SI-NEXT:    s_add_u32 s2, s0, 0x60
1394; SI-NEXT:    v_cvt_f32_f16_e32 v10, v4
1395; SI-NEXT:    s_addc_u32 s3, s1, 0
1396; SI-NEXT:    flat_store_dwordx4 v[11:12], v[0:3]
1397; SI-NEXT:    v_cvt_f32_f16_e32 v12, v5
1398; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v19
1399; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
1400; SI-NEXT:    v_mov_b32_e32 v16, s3
1401; SI-NEXT:    v_cvt_f32_f16_e32 v19, v20
1402; SI-NEXT:    v_mov_b32_e32 v15, s2
1403; SI-NEXT:    s_add_u32 s2, s0, 0x50
1404; SI-NEXT:    s_addc_u32 s3, s1, 0
1405; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v6
1406; SI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
1407; SI-NEXT:    s_add_u32 s0, s0, 64
1408; SI-NEXT:    flat_store_dwordx4 v[13:14], v[0:3]
1409; SI-NEXT:    s_addc_u32 s1, s1, 0
1410; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v7
1411; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v12
1412; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v21
1413; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v19
1414; SI-NEXT:    v_mov_b32_e32 v18, s3
1415; SI-NEXT:    v_mov_b32_e32 v13, s1
1416; SI-NEXT:    v_mov_b32_e32 v17, s2
1417; SI-NEXT:    v_mov_b32_e32 v12, s0
1418; SI-NEXT:    flat_store_dwordx4 v[15:16], v[8:11]
1419; SI-NEXT:    flat_store_dwordx4 v[17:18], v[0:3]
1420; SI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
1421; SI-NEXT:    s_endpgm
1422;
1423; VI-LABEL: global_extload_v16f16_to_v16f64:
1424; VI:       ; %bb.0:
1425; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1426; VI-NEXT:    s_waitcnt lgkmcnt(0)
1427; VI-NEXT:    v_mov_b32_e32 v0, s2
1428; VI-NEXT:    v_mov_b32_e32 v1, s3
1429; VI-NEXT:    s_add_u32 s2, s2, 16
1430; VI-NEXT:    s_addc_u32 s3, s3, 0
1431; VI-NEXT:    v_mov_b32_e32 v2, s2
1432; VI-NEXT:    v_mov_b32_e32 v3, s3
1433; VI-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
1434; VI-NEXT:    flat_load_dwordx4 v[0:3], v[2:3]
1435; VI-NEXT:    s_add_u32 s2, s0, 48
1436; VI-NEXT:    s_addc_u32 s3, s1, 0
1437; VI-NEXT:    v_mov_b32_e32 v14, s3
1438; VI-NEXT:    v_mov_b32_e32 v13, s2
1439; VI-NEXT:    s_add_u32 s2, s0, 32
1440; VI-NEXT:    s_addc_u32 s3, s1, 0
1441; VI-NEXT:    v_mov_b32_e32 v16, s3
1442; VI-NEXT:    v_mov_b32_e32 v15, s2
1443; VI-NEXT:    s_add_u32 s2, s0, 16
1444; VI-NEXT:    s_addc_u32 s3, s1, 0
1445; VI-NEXT:    v_mov_b32_e32 v18, s3
1446; VI-NEXT:    v_mov_b32_e32 v17, s2
1447; VI-NEXT:    s_add_u32 s2, s0, 0x70
1448; VI-NEXT:    v_mov_b32_e32 v12, s1
1449; VI-NEXT:    s_addc_u32 s3, s1, 0
1450; VI-NEXT:    v_mov_b32_e32 v11, s0
1451; VI-NEXT:    s_waitcnt vmcnt(1)
1452; VI-NEXT:    v_cvt_f32_f16_e32 v8, v7
1453; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1454; VI-NEXT:    v_cvt_f64_f32_e32 v[7:8], v8
1455; VI-NEXT:    v_cvt_f64_f32_e32 v[9:10], v9
1456; VI-NEXT:    flat_store_dwordx4 v[13:14], v[7:10]
1457; VI-NEXT:    s_nop 0
1458; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
1459; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1460; VI-NEXT:    s_waitcnt vmcnt(1)
1461; VI-NEXT:    v_cvt_f32_f16_e32 v10, v0
1462; VI-NEXT:    v_mov_b32_e32 v14, s3
1463; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
1464; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
1465; VI-NEXT:    flat_store_dwordx4 v[15:16], v[6:9]
1466; VI-NEXT:    s_nop 0
1467; VI-NEXT:    v_cvt_f32_f16_e32 v6, v5
1468; VI-NEXT:    v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1469; VI-NEXT:    v_cvt_f32_f16_e32 v8, v4
1470; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1471; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
1472; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
1473; VI-NEXT:    flat_store_dwordx4 v[17:18], v[4:7]
1474; VI-NEXT:    s_nop 0
1475; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v8
1476; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v9
1477; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1478; VI-NEXT:    v_cvt_f32_f16_sdwa v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1479; VI-NEXT:    v_cvt_f32_f16_e32 v0, v3
1480; VI-NEXT:    v_mov_b32_e32 v13, s2
1481; VI-NEXT:    s_add_u32 s2, s0, 0x60
1482; VI-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
1483; VI-NEXT:    s_addc_u32 s3, s1, 0
1484; VI-NEXT:    v_cvt_f64_f32_e32 v[5:6], v8
1485; VI-NEXT:    v_cvt_f32_f16_e32 v8, v2
1486; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1487; VI-NEXT:    v_cvt_f32_f16_e32 v7, v1
1488; VI-NEXT:    v_cvt_f32_f16_sdwa v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1489; VI-NEXT:    v_cvt_f64_f32_e32 v[3:4], v0
1490; VI-NEXT:    v_mov_b32_e32 v16, s3
1491; VI-NEXT:    v_mov_b32_e32 v15, s2
1492; VI-NEXT:    s_add_u32 s2, s0, 0x50
1493; VI-NEXT:    s_addc_u32 s3, s1, 0
1494; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v10
1495; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
1496; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
1497; VI-NEXT:    s_add_u32 s0, s0, 64
1498; VI-NEXT:    flat_store_dwordx4 v[13:14], v[3:6]
1499; VI-NEXT:    s_addc_u32 s1, s1, 0
1500; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
1501; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v12
1502; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
1503; VI-NEXT:    v_mov_b32_e32 v20, s3
1504; VI-NEXT:    v_mov_b32_e32 v13, s1
1505; VI-NEXT:    v_mov_b32_e32 v19, s2
1506; VI-NEXT:    v_mov_b32_e32 v12, s0
1507; VI-NEXT:    flat_store_dwordx4 v[15:16], v[8:11]
1508; VI-NEXT:    flat_store_dwordx4 v[19:20], v[4:7]
1509; VI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
1510; VI-NEXT:    s_endpgm
1511  %val = load <16 x half>, <16 x half> addrspace(1)* %in
1512  %cvt = fpext <16 x half> %val to <16 x double>
1513  store <16 x double> %cvt, <16 x double> addrspace(1)* %out
1514  ret void
1515}
1516
1517define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
1518; GCN-LABEL: global_truncstore_f32_to_f16:
1519; GCN:       ; %bb.0:
1520; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1521; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1522; GCN-NEXT:    v_mov_b32_e32 v0, s2
1523; GCN-NEXT:    v_mov_b32_e32 v1, s3
1524; GCN-NEXT:    flat_load_dword v0, v[0:1]
1525; GCN-NEXT:    v_mov_b32_e32 v1, s1
1526; GCN-NEXT:    s_waitcnt vmcnt(0)
1527; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v0
1528; GCN-NEXT:    v_mov_b32_e32 v0, s0
1529; GCN-NEXT:    flat_store_short v[0:1], v2
1530; GCN-NEXT:    s_endpgm
1531  %val = load float, float addrspace(1)* %in
1532  %cvt = fptrunc float %val to half
1533  store half %cvt, half addrspace(1)* %out
1534  ret void
1535}
1536
1537define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
1538; SI-LABEL: global_truncstore_v2f32_to_v2f16:
1539; SI:       ; %bb.0:
1540; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1541; SI-NEXT:    s_waitcnt lgkmcnt(0)
1542; SI-NEXT:    v_mov_b32_e32 v0, s2
1543; SI-NEXT:    v_mov_b32_e32 v1, s3
1544; SI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1545; SI-NEXT:    s_waitcnt vmcnt(0)
1546; SI-NEXT:    v_cvt_f16_f32_e32 v2, v1
1547; SI-NEXT:    v_cvt_f16_f32_e32 v3, v0
1548; SI-NEXT:    v_mov_b32_e32 v0, s0
1549; SI-NEXT:    v_mov_b32_e32 v1, s1
1550; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1551; SI-NEXT:    v_or_b32_e32 v2, v3, v2
1552; SI-NEXT:    flat_store_dword v[0:1], v2
1553; SI-NEXT:    s_endpgm
1554;
1555; VI-LABEL: global_truncstore_v2f32_to_v2f16:
1556; VI:       ; %bb.0:
1557; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1558; VI-NEXT:    s_waitcnt lgkmcnt(0)
1559; VI-NEXT:    v_mov_b32_e32 v0, s2
1560; VI-NEXT:    v_mov_b32_e32 v1, s3
1561; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1562; VI-NEXT:    s_waitcnt vmcnt(0)
1563; VI-NEXT:    v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1564; VI-NEXT:    v_cvt_f16_f32_e32 v3, v0
1565; VI-NEXT:    v_mov_b32_e32 v0, s0
1566; VI-NEXT:    v_mov_b32_e32 v1, s1
1567; VI-NEXT:    v_or_b32_e32 v2, v3, v2
1568; VI-NEXT:    flat_store_dword v[0:1], v2
1569; VI-NEXT:    s_endpgm
1570  %val = load <2 x float>, <2 x float> addrspace(1)* %in
1571  %cvt = fptrunc <2 x float> %val to <2 x half>
1572  store <2 x half> %cvt, <2 x half> addrspace(1)* %out
1573  ret void
1574}
1575
1576define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
1577; SI-LABEL: global_truncstore_v3f32_to_v3f16:
1578; SI:       ; %bb.0:
1579; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1580; SI-NEXT:    s_waitcnt lgkmcnt(0)
1581; SI-NEXT:    v_mov_b32_e32 v0, s2
1582; SI-NEXT:    v_mov_b32_e32 v1, s3
1583; SI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
1584; SI-NEXT:    s_add_u32 s2, s0, 4
1585; SI-NEXT:    s_addc_u32 s3, s1, 0
1586; SI-NEXT:    s_waitcnt vmcnt(0)
1587; SI-NEXT:    v_cvt_f16_f32_e32 v3, v1
1588; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1589; SI-NEXT:    v_cvt_f16_f32_e32 v4, v0
1590; SI-NEXT:    v_mov_b32_e32 v0, s2
1591; SI-NEXT:    v_mov_b32_e32 v1, s3
1592; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1593; SI-NEXT:    flat_store_short v[0:1], v2
1594; SI-NEXT:    v_mov_b32_e32 v0, s0
1595; SI-NEXT:    v_or_b32_e32 v2, v4, v3
1596; SI-NEXT:    v_mov_b32_e32 v1, s1
1597; SI-NEXT:    flat_store_dword v[0:1], v2
1598; SI-NEXT:    s_endpgm
1599;
1600; VI-LABEL: global_truncstore_v3f32_to_v3f16:
1601; VI:       ; %bb.0:
1602; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1603; VI-NEXT:    s_waitcnt lgkmcnt(0)
1604; VI-NEXT:    v_mov_b32_e32 v0, s2
1605; VI-NEXT:    v_mov_b32_e32 v1, s3
1606; VI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
1607; VI-NEXT:    s_add_u32 s2, s0, 4
1608; VI-NEXT:    s_addc_u32 s3, s1, 0
1609; VI-NEXT:    s_waitcnt vmcnt(0)
1610; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1611; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1612; VI-NEXT:    v_cvt_f16_f32_e32 v4, v0
1613; VI-NEXT:    v_mov_b32_e32 v0, s2
1614; VI-NEXT:    v_mov_b32_e32 v1, s3
1615; VI-NEXT:    flat_store_short v[0:1], v2
1616; VI-NEXT:    v_mov_b32_e32 v0, s0
1617; VI-NEXT:    v_or_b32_e32 v3, v4, v3
1618; VI-NEXT:    v_mov_b32_e32 v1, s1
1619; VI-NEXT:    flat_store_dword v[0:1], v3
1620; VI-NEXT:    s_endpgm
1621  %val = load <3 x float>, <3 x float> addrspace(1)* %in
1622  %cvt = fptrunc <3 x float> %val to <3 x half>
1623  store <3 x half> %cvt, <3 x half> addrspace(1)* %out
1624  ret void
1625}
1626
1627define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
1628; SI-LABEL: global_truncstore_v4f32_to_v4f16:
1629; SI:       ; %bb.0:
1630; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1631; SI-NEXT:    s_waitcnt lgkmcnt(0)
1632; SI-NEXT:    v_mov_b32_e32 v0, s2
1633; SI-NEXT:    v_mov_b32_e32 v1, s3
1634; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1635; SI-NEXT:    v_mov_b32_e32 v4, s0
1636; SI-NEXT:    v_mov_b32_e32 v5, s1
1637; SI-NEXT:    s_waitcnt vmcnt(0)
1638; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1639; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1640; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1641; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1642; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1643; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
1644; SI-NEXT:    v_or_b32_e32 v1, v2, v3
1645; SI-NEXT:    v_or_b32_e32 v0, v0, v6
1646; SI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
1647; SI-NEXT:    s_endpgm
1648;
1649; VI-LABEL: global_truncstore_v4f32_to_v4f16:
1650; VI:       ; %bb.0:
1651; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1652; VI-NEXT:    s_waitcnt lgkmcnt(0)
1653; VI-NEXT:    v_mov_b32_e32 v0, s2
1654; VI-NEXT:    v_mov_b32_e32 v1, s3
1655; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1656; VI-NEXT:    s_waitcnt vmcnt(0)
1657; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1658; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1659; VI-NEXT:    v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1660; VI-NEXT:    v_cvt_f16_f32_e32 v5, v0
1661; VI-NEXT:    v_mov_b32_e32 v0, s0
1662; VI-NEXT:    v_mov_b32_e32 v1, s1
1663; VI-NEXT:    v_or_b32_e32 v3, v2, v3
1664; VI-NEXT:    v_or_b32_e32 v2, v5, v4
1665; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1666; VI-NEXT:    s_endpgm
1667  %val = load <4 x float>, <4 x float> addrspace(1)* %in
1668  %cvt = fptrunc <4 x float> %val to <4 x half>
1669  store <4 x half> %cvt, <4 x half> addrspace(1)* %out
1670  ret void
1671}
1672
1673define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
1674; SI-LABEL: global_truncstore_v8f32_to_v8f16:
1675; SI:       ; %bb.0:
1676; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1677; SI-NEXT:    s_waitcnt lgkmcnt(0)
1678; SI-NEXT:    s_add_u32 s4, s2, 16
1679; SI-NEXT:    v_mov_b32_e32 v5, s3
1680; SI-NEXT:    s_addc_u32 s5, s3, 0
1681; SI-NEXT:    v_mov_b32_e32 v0, s4
1682; SI-NEXT:    v_mov_b32_e32 v4, s2
1683; SI-NEXT:    v_mov_b32_e32 v1, s5
1684; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1685; SI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1686; SI-NEXT:    v_mov_b32_e32 v8, s0
1687; SI-NEXT:    v_mov_b32_e32 v9, s1
1688; SI-NEXT:    s_waitcnt vmcnt(1)
1689; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1690; SI-NEXT:    s_waitcnt vmcnt(0)
1691; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1692; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1693; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1694; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1695; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1696; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1697; SI-NEXT:    v_cvt_f16_f32_e32 v10, v0
1698; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
1699; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
1700; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1701; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
1702; SI-NEXT:    v_or_b32_e32 v1, v6, v0
1703; SI-NEXT:    v_or_b32_e32 v0, v4, v5
1704; SI-NEXT:    v_or_b32_e32 v3, v2, v3
1705; SI-NEXT:    v_or_b32_e32 v2, v10, v7
1706; SI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1707; SI-NEXT:    s_endpgm
1708;
1709; VI-LABEL: global_truncstore_v8f32_to_v8f16:
1710; VI:       ; %bb.0:
1711; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1712; VI-NEXT:    s_waitcnt lgkmcnt(0)
1713; VI-NEXT:    s_add_u32 s4, s2, 16
1714; VI-NEXT:    v_mov_b32_e32 v5, s3
1715; VI-NEXT:    s_addc_u32 s5, s3, 0
1716; VI-NEXT:    v_mov_b32_e32 v0, s4
1717; VI-NEXT:    v_mov_b32_e32 v4, s2
1718; VI-NEXT:    v_mov_b32_e32 v1, s5
1719; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1720; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1721; VI-NEXT:    v_mov_b32_e32 v8, s0
1722; VI-NEXT:    v_mov_b32_e32 v9, s1
1723; VI-NEXT:    s_waitcnt vmcnt(1)
1724; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1725; VI-NEXT:    s_waitcnt vmcnt(0)
1726; VI-NEXT:    v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1727; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1728; VI-NEXT:    v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1729; VI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1730; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1731; VI-NEXT:    v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1732; VI-NEXT:    v_cvt_f16_f32_e32 v11, v0
1733; VI-NEXT:    v_or_b32_e32 v1, v6, v7
1734; VI-NEXT:    v_or_b32_e32 v0, v4, v5
1735; VI-NEXT:    v_or_b32_e32 v3, v2, v3
1736; VI-NEXT:    v_or_b32_e32 v2, v11, v10
1737; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1738; VI-NEXT:    s_endpgm
1739  %val = load <8 x float>, <8 x float> addrspace(1)* %in
1740  %cvt = fptrunc <8 x float> %val to <8 x half>
1741  store <8 x half> %cvt, <8 x half> addrspace(1)* %out
1742  ret void
1743}
1744
1745define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
1746; SI-LABEL: global_truncstore_v16f32_to_v16f16:
1747; SI:       ; %bb.0:
1748; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1749; SI-NEXT:    s_waitcnt lgkmcnt(0)
1750; SI-NEXT:    s_add_u32 s4, s2, 32
1751; SI-NEXT:    s_addc_u32 s5, s3, 0
1752; SI-NEXT:    v_mov_b32_e32 v0, s4
1753; SI-NEXT:    v_mov_b32_e32 v1, s5
1754; SI-NEXT:    s_add_u32 s4, s2, 48
1755; SI-NEXT:    v_mov_b32_e32 v13, s3
1756; SI-NEXT:    s_addc_u32 s5, s3, 0
1757; SI-NEXT:    v_mov_b32_e32 v12, s2
1758; SI-NEXT:    v_mov_b32_e32 v4, s4
1759; SI-NEXT:    s_add_u32 s2, s2, 16
1760; SI-NEXT:    v_mov_b32_e32 v5, s5
1761; SI-NEXT:    s_addc_u32 s3, s3, 0
1762; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1763; SI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1764; SI-NEXT:    v_mov_b32_e32 v9, s3
1765; SI-NEXT:    v_mov_b32_e32 v8, s2
1766; SI-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
1767; SI-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
1768; SI-NEXT:    s_add_u32 s2, s0, 16
1769; SI-NEXT:    s_addc_u32 s3, s1, 0
1770; SI-NEXT:    s_waitcnt vmcnt(3)
1771; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1772; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1773; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1774; SI-NEXT:    s_waitcnt vmcnt(2)
1775; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1776; SI-NEXT:    v_cvt_f16_f32_e32 v16, v5
1777; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1778; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1779; SI-NEXT:    v_cvt_f16_f32_e32 v17, v4
1780; SI-NEXT:    s_waitcnt vmcnt(0)
1781; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
1782; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
1783; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
1784; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
1785; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
1786; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
1787; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
1788; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
1789; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1790; SI-NEXT:    v_mov_b32_e32 v5, s3
1791; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
1792; SI-NEXT:    v_or_b32_e32 v1, v2, v3
1793; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
1794; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v16
1795; SI-NEXT:    v_mov_b32_e32 v4, s2
1796; SI-NEXT:    v_or_b32_e32 v0, v0, v18
1797; SI-NEXT:    v_or_b32_e32 v3, v6, v2
1798; SI-NEXT:    v_or_b32_e32 v2, v17, v7
1799; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v15
1800; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v13
1801; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
1802; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
1803; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1804; SI-NEXT:    v_mov_b32_e32 v5, s1
1805; SI-NEXT:    v_or_b32_e32 v1, v14, v6
1806; SI-NEXT:    v_or_b32_e32 v0, v12, v7
1807; SI-NEXT:    v_or_b32_e32 v3, v10, v11
1808; SI-NEXT:    v_or_b32_e32 v2, v8, v9
1809; SI-NEXT:    v_mov_b32_e32 v4, s0
1810; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1811; SI-NEXT:    s_endpgm
1812;
1813; VI-LABEL: global_truncstore_v16f32_to_v16f16:
1814; VI:       ; %bb.0:
1815; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1816; VI-NEXT:    s_waitcnt lgkmcnt(0)
1817; VI-NEXT:    s_add_u32 s4, s2, 32
1818; VI-NEXT:    s_addc_u32 s5, s3, 0
1819; VI-NEXT:    v_mov_b32_e32 v0, s4
1820; VI-NEXT:    v_mov_b32_e32 v1, s5
1821; VI-NEXT:    s_add_u32 s4, s2, 48
1822; VI-NEXT:    v_mov_b32_e32 v13, s3
1823; VI-NEXT:    s_addc_u32 s5, s3, 0
1824; VI-NEXT:    v_mov_b32_e32 v12, s2
1825; VI-NEXT:    v_mov_b32_e32 v4, s4
1826; VI-NEXT:    s_add_u32 s2, s2, 16
1827; VI-NEXT:    v_mov_b32_e32 v5, s5
1828; VI-NEXT:    s_addc_u32 s3, s3, 0
1829; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1830; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1831; VI-NEXT:    v_mov_b32_e32 v9, s3
1832; VI-NEXT:    v_mov_b32_e32 v8, s2
1833; VI-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
1834; VI-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
1835; VI-NEXT:    s_add_u32 s2, s0, 16
1836; VI-NEXT:    s_addc_u32 s3, s1, 0
1837; VI-NEXT:    s_waitcnt vmcnt(3)
1838; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1839; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1840; VI-NEXT:    v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1841; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1842; VI-NEXT:    s_waitcnt vmcnt(2)
1843; VI-NEXT:    v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1844; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1845; VI-NEXT:    v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1846; VI-NEXT:    v_cvt_f16_f32_e32 v18, v4
1847; VI-NEXT:    s_waitcnt vmcnt(0)
1848; VI-NEXT:    v_cvt_f16_f32_sdwa v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1849; VI-NEXT:    v_cvt_f16_f32_e32 v14, v14
1850; VI-NEXT:    v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1851; VI-NEXT:    v_cvt_f16_f32_e32 v12, v12
1852; VI-NEXT:    v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1853; VI-NEXT:    v_cvt_f16_f32_e32 v10, v10
1854; VI-NEXT:    v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1855; VI-NEXT:    v_cvt_f16_f32_e32 v8, v8
1856; VI-NEXT:    v_mov_b32_e32 v5, s3
1857; VI-NEXT:    v_mov_b32_e32 v4, s2
1858; VI-NEXT:    v_or_b32_e32 v1, v2, v3
1859; VI-NEXT:    v_or_b32_e32 v0, v0, v16
1860; VI-NEXT:    v_or_b32_e32 v3, v6, v7
1861; VI-NEXT:    v_or_b32_e32 v2, v18, v17
1862; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1863; VI-NEXT:    v_mov_b32_e32 v5, s1
1864; VI-NEXT:    v_or_b32_e32 v1, v14, v15
1865; VI-NEXT:    v_or_b32_e32 v0, v12, v13
1866; VI-NEXT:    v_or_b32_e32 v3, v10, v11
1867; VI-NEXT:    v_or_b32_e32 v2, v8, v9
1868; VI-NEXT:    v_mov_b32_e32 v4, s0
1869; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1870; VI-NEXT:    s_endpgm
1871  %val = load <16 x float>, <16 x float> addrspace(1)* %in
1872  %cvt = fptrunc <16 x float> %val to <16 x half>
1873  store <16 x half> %cvt, <16 x half> addrspace(1)* %out
1874  ret void
1875}
1876
1877; FIXME: Unsafe math should fold conversions away
1878define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
1879; SI-LABEL: fadd_f16:
1880; SI:       ; %bb.0:
1881; SI-NEXT:    s_load_dword s0, s[4:5], 0x2
1882; SI-NEXT:    s_waitcnt lgkmcnt(0)
1883; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
1884; SI-NEXT:    s_lshr_b32 s0, s0, 16
1885; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
1886; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1887; SI-NEXT:    v_add_f32_e32 v0, v0, v1
1888; SI-NEXT:    v_cvt_f16_f32_e32 v2, v0
1889; SI-NEXT:    s_waitcnt lgkmcnt(0)
1890; SI-NEXT:    v_mov_b32_e32 v0, s0
1891; SI-NEXT:    v_mov_b32_e32 v1, s1
1892; SI-NEXT:    flat_store_short v[0:1], v2
1893; SI-NEXT:    s_endpgm
1894;
1895; VI-LABEL: fadd_f16:
1896; VI:       ; %bb.0:
1897; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1898; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
1899; VI-NEXT:    s_waitcnt lgkmcnt(0)
1900; VI-NEXT:    s_lshr_b32 s3, s2, 16
1901; VI-NEXT:    v_mov_b32_e32 v0, s3
1902; VI-NEXT:    v_add_f16_e32 v2, s2, v0
1903; VI-NEXT:    v_mov_b32_e32 v0, s0
1904; VI-NEXT:    v_mov_b32_e32 v1, s1
1905; VI-NEXT:    flat_store_short v[0:1], v2
1906; VI-NEXT:    s_endpgm
1907   %add = fadd half %a, %b
1908   store half %add, half addrspace(1)* %out, align 4
1909   ret void
1910}
1911
1912define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
1913; SI-LABEL: fadd_v2f16:
1914; SI:       ; %bb.0:
1915; SI-NEXT:    s_load_dword s0, s[4:5], 0x2
1916; SI-NEXT:    s_load_dword s1, s[4:5], 0x3
1917; SI-NEXT:    s_waitcnt lgkmcnt(0)
1918; SI-NEXT:    s_lshr_b32 s2, s0, 16
1919; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
1920; SI-NEXT:    s_lshr_b32 s0, s1, 16
1921; SI-NEXT:    v_cvt_f32_f16_e32 v1, s1
1922; SI-NEXT:    v_cvt_f32_f16_e32 v2, s2
1923; SI-NEXT:    v_cvt_f32_f16_e32 v3, s0
1924; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1925; SI-NEXT:    v_add_f32_e32 v0, v0, v1
1926; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1927; SI-NEXT:    v_add_f32_e32 v1, v2, v3
1928; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1929; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1930; SI-NEXT:    v_or_b32_e32 v2, v0, v1
1931; SI-NEXT:    s_waitcnt lgkmcnt(0)
1932; SI-NEXT:    v_mov_b32_e32 v0, s0
1933; SI-NEXT:    v_mov_b32_e32 v1, s1
1934; SI-NEXT:    flat_store_dword v[0:1], v2
1935; SI-NEXT:    s_endpgm
1936;
1937; VI-LABEL: fadd_v2f16:
1938; VI:       ; %bb.0:
1939; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1940; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
1941; VI-NEXT:    s_load_dword s3, s[4:5], 0xc
1942; VI-NEXT:    s_waitcnt lgkmcnt(0)
1943; VI-NEXT:    s_lshr_b32 s5, s2, 16
1944; VI-NEXT:    s_lshr_b32 s4, s3, 16
1945; VI-NEXT:    v_mov_b32_e32 v0, s4
1946; VI-NEXT:    v_mov_b32_e32 v1, s5
1947; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1948; VI-NEXT:    v_mov_b32_e32 v1, s3
1949; VI-NEXT:    v_add_f16_e32 v1, s2, v1
1950; VI-NEXT:    v_or_b32_e32 v2, v1, v0
1951; VI-NEXT:    v_mov_b32_e32 v0, s0
1952; VI-NEXT:    v_mov_b32_e32 v1, s1
1953; VI-NEXT:    flat_store_dword v[0:1], v2
1954; VI-NEXT:    s_endpgm
1955  %add = fadd <2 x half> %a, %b
1956  store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
1957  ret void
1958}
1959
1960define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
1961; SI-LABEL: fadd_v4f16:
1962; SI:       ; %bb.0:
1963; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1964; SI-NEXT:    s_waitcnt lgkmcnt(0)
1965; SI-NEXT:    s_add_u32 s4, s2, 8
1966; SI-NEXT:    v_mov_b32_e32 v0, s2
1967; SI-NEXT:    s_addc_u32 s5, s3, 0
1968; SI-NEXT:    v_mov_b32_e32 v2, s4
1969; SI-NEXT:    v_mov_b32_e32 v1, s3
1970; SI-NEXT:    v_mov_b32_e32 v3, s5
1971; SI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1972; SI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1973; SI-NEXT:    v_mov_b32_e32 v4, s0
1974; SI-NEXT:    v_mov_b32_e32 v5, s1
1975; SI-NEXT:    s_waitcnt vmcnt(1)
1976; SI-NEXT:    v_cvt_f32_f16_e32 v6, v0
1977; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1978; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
1979; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1980; SI-NEXT:    s_waitcnt vmcnt(0)
1981; SI-NEXT:    v_cvt_f32_f16_e32 v8, v2
1982; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1983; SI-NEXT:    v_cvt_f32_f16_e32 v9, v3
1984; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1985; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1986; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1987; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1988; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1989; SI-NEXT:    v_add_f32_e32 v7, v7, v9
1990; SI-NEXT:    v_add_f32_e32 v6, v6, v8
1991; SI-NEXT:    v_add_f32_e32 v1, v1, v3
1992; SI-NEXT:    v_add_f32_e32 v0, v0, v2
1993; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1994; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1995; SI-NEXT:    v_cvt_f16_f32_e32 v2, v7
1996; SI-NEXT:    v_cvt_f16_f32_e32 v3, v6
1997; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1998; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1999; SI-NEXT:    v_or_b32_e32 v1, v2, v1
2000; SI-NEXT:    v_or_b32_e32 v0, v3, v0
2001; SI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
2002; SI-NEXT:    s_endpgm
2003;
2004; VI-LABEL: fadd_v4f16:
2005; VI:       ; %bb.0:
2006; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2007; VI-NEXT:    s_waitcnt lgkmcnt(0)
2008; VI-NEXT:    s_add_u32 s4, s2, 8
2009; VI-NEXT:    v_mov_b32_e32 v0, s2
2010; VI-NEXT:    s_addc_u32 s5, s3, 0
2011; VI-NEXT:    v_mov_b32_e32 v2, s4
2012; VI-NEXT:    v_mov_b32_e32 v1, s3
2013; VI-NEXT:    v_mov_b32_e32 v3, s5
2014; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2015; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
2016; VI-NEXT:    v_mov_b32_e32 v4, s0
2017; VI-NEXT:    v_mov_b32_e32 v5, s1
2018; VI-NEXT:    s_waitcnt vmcnt(0)
2019; VI-NEXT:    v_add_f16_sdwa v6, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2020; VI-NEXT:    v_add_f16_e32 v1, v1, v3
2021; VI-NEXT:    v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2022; VI-NEXT:    v_add_f16_e32 v0, v0, v2
2023; VI-NEXT:    v_or_b32_e32 v1, v1, v6
2024; VI-NEXT:    v_or_b32_e32 v0, v0, v3
2025; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
2026; VI-NEXT:    s_endpgm
2027  %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
2028  %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
2029  %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
2030  %result = fadd <4 x half> %a, %b
2031  store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
2032  ret void
2033}
2034
2035define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
2036; SI-LABEL: fadd_v8f16:
2037; SI:       ; %bb.0:
2038; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2039; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
2040; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x8
2041; SI-NEXT:    s_waitcnt lgkmcnt(0)
2042; SI-NEXT:    s_lshr_b32 s10, s0, 16
2043; SI-NEXT:    v_cvt_f32_f16_e32 v4, s0
2044; SI-NEXT:    s_lshr_b32 s0, s4, 16
2045; SI-NEXT:    v_cvt_f32_f16_e32 v8, s0
2046; SI-NEXT:    s_lshr_b32 s0, s5, 16
2047; SI-NEXT:    s_lshr_b32 s11, s1, 16
2048; SI-NEXT:    v_cvt_f32_f16_e32 v0, s10
2049; SI-NEXT:    s_lshr_b32 s10, s2, 16
2050; SI-NEXT:    v_cvt_f32_f16_e32 v9, s0
2051; SI-NEXT:    s_lshr_b32 s0, s6, 16
2052; SI-NEXT:    v_cvt_f32_f16_e32 v1, s11
2053; SI-NEXT:    v_cvt_f32_f16_e32 v2, s10
2054; SI-NEXT:    s_lshr_b32 s10, s3, 16
2055; SI-NEXT:    v_cvt_f32_f16_e32 v10, s0
2056; SI-NEXT:    s_lshr_b32 s0, s7, 16
2057; SI-NEXT:    v_cvt_f32_f16_e32 v3, s10
2058; SI-NEXT:    v_cvt_f32_f16_e32 v5, s1
2059; SI-NEXT:    v_cvt_f32_f16_e32 v11, s0
2060; SI-NEXT:    v_cvt_f32_f16_e32 v12, s4
2061; SI-NEXT:    v_cvt_f32_f16_e32 v13, s5
2062; SI-NEXT:    v_cvt_f32_f16_e32 v6, s2
2063; SI-NEXT:    v_cvt_f32_f16_e32 v7, s3
2064; SI-NEXT:    v_cvt_f32_f16_e32 v14, s7
2065; SI-NEXT:    v_cvt_f32_f16_e32 v15, s6
2066; SI-NEXT:    v_add_f32_e32 v1, v1, v9
2067; SI-NEXT:    v_add_f32_e32 v0, v0, v8
2068; SI-NEXT:    v_add_f32_e32 v3, v3, v11
2069; SI-NEXT:    v_add_f32_e32 v2, v2, v10
2070; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2071; SI-NEXT:    v_add_f32_e32 v5, v5, v13
2072; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2073; SI-NEXT:    v_add_f32_e32 v4, v4, v12
2074; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2075; SI-NEXT:    v_add_f32_e32 v7, v7, v14
2076; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2077; SI-NEXT:    v_add_f32_e32 v6, v6, v15
2078; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
2079; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
2080; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2081; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
2082; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2083; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2084; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2085; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2086; SI-NEXT:    v_or_b32_e32 v1, v5, v1
2087; SI-NEXT:    v_or_b32_e32 v0, v4, v0
2088; SI-NEXT:    v_mov_b32_e32 v4, s8
2089; SI-NEXT:    v_or_b32_e32 v3, v7, v3
2090; SI-NEXT:    v_or_b32_e32 v2, v6, v2
2091; SI-NEXT:    v_mov_b32_e32 v5, s9
2092; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2093; SI-NEXT:    s_endpgm
2094;
2095; VI-LABEL: fadd_v8f16:
2096; VI:       ; %bb.0:
2097; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2098; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
2099; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x20
2100; VI-NEXT:    s_waitcnt lgkmcnt(0)
2101; VI-NEXT:    s_lshr_b32 s11, s3, 16
2102; VI-NEXT:    s_lshr_b32 s10, s7, 16
2103; VI-NEXT:    v_mov_b32_e32 v0, s10
2104; VI-NEXT:    v_mov_b32_e32 v1, s11
2105; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2106; VI-NEXT:    v_mov_b32_e32 v1, s7
2107; VI-NEXT:    v_add_f16_e32 v1, s3, v1
2108; VI-NEXT:    s_lshr_b32 s3, s6, 16
2109; VI-NEXT:    s_lshr_b32 s7, s2, 16
2110; VI-NEXT:    v_or_b32_e32 v3, v1, v0
2111; VI-NEXT:    v_mov_b32_e32 v0, s3
2112; VI-NEXT:    v_mov_b32_e32 v1, s7
2113; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2114; VI-NEXT:    v_mov_b32_e32 v1, s6
2115; VI-NEXT:    v_add_f16_e32 v1, s2, v1
2116; VI-NEXT:    s_lshr_b32 s2, s5, 16
2117; VI-NEXT:    s_lshr_b32 s3, s1, 16
2118; VI-NEXT:    v_or_b32_e32 v2, v1, v0
2119; VI-NEXT:    v_mov_b32_e32 v0, s2
2120; VI-NEXT:    v_mov_b32_e32 v1, s3
2121; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2122; VI-NEXT:    v_mov_b32_e32 v1, s5
2123; VI-NEXT:    v_add_f16_e32 v1, s1, v1
2124; VI-NEXT:    s_lshr_b32 s1, s4, 16
2125; VI-NEXT:    s_lshr_b32 s2, s0, 16
2126; VI-NEXT:    v_or_b32_e32 v1, v1, v0
2127; VI-NEXT:    v_mov_b32_e32 v0, s1
2128; VI-NEXT:    v_mov_b32_e32 v4, s2
2129; VI-NEXT:    v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2130; VI-NEXT:    v_mov_b32_e32 v4, s4
2131; VI-NEXT:    v_add_f16_e32 v4, s0, v4
2132; VI-NEXT:    v_or_b32_e32 v0, v4, v0
2133; VI-NEXT:    v_mov_b32_e32 v4, s8
2134; VI-NEXT:    v_mov_b32_e32 v5, s9
2135; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2136; VI-NEXT:    s_endpgm
2137  %add = fadd <8 x half> %a, %b
2138  store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
2139  ret void
2140}
2141
2142define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
2143; GCN-LABEL: test_bitcast_from_half:
2144; GCN:       ; %bb.0:
2145; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2146; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2147; GCN-NEXT:    v_mov_b32_e32 v0, s0
2148; GCN-NEXT:    v_mov_b32_e32 v1, s1
2149; GCN-NEXT:    flat_load_ushort v0, v[0:1]
2150; GCN-NEXT:    v_mov_b32_e32 v2, s2
2151; GCN-NEXT:    v_mov_b32_e32 v3, s3
2152; GCN-NEXT:    s_waitcnt vmcnt(0)
2153; GCN-NEXT:    flat_store_short v[2:3], v0
2154; GCN-NEXT:    s_endpgm
2155  %val = load half, half addrspace(1)* %in
2156  %val_int = bitcast half %val to i16
2157  store i16 %val_int, i16 addrspace(1)* %out
2158  ret void
2159}
2160
2161define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
2162; GCN-LABEL: test_bitcast_to_half:
2163; GCN:       ; %bb.0:
2164; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2165; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2166; GCN-NEXT:    v_mov_b32_e32 v2, s2
2167; GCN-NEXT:    v_mov_b32_e32 v3, s3
2168; GCN-NEXT:    flat_load_ushort v2, v[2:3]
2169; GCN-NEXT:    v_mov_b32_e32 v0, s0
2170; GCN-NEXT:    v_mov_b32_e32 v1, s1
2171; GCN-NEXT:    s_waitcnt vmcnt(0)
2172; GCN-NEXT:    flat_store_short v[0:1], v2
2173; GCN-NEXT:    s_endpgm
2174  %val = load i16, i16 addrspace(1)* %in
2175  %val_fp = bitcast i16 %val to half
2176  store half %val_fp, half addrspace(1)* %out
2177  ret void
2178}
2179
2180attributes #0 = { nounwind }
2181