1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s
4
5; half args should be promoted to float for SI and lower.
6
7define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
8; SI-LABEL: load_f16_arg:
9; SI:       ; %bb.0:
10; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11; SI-NEXT:    s_load_dword s2, s[4:5], 0x2
12; SI-NEXT:    s_waitcnt lgkmcnt(0)
13; SI-NEXT:    v_mov_b32_e32 v0, s0
14; SI-NEXT:    v_mov_b32_e32 v1, s1
15; SI-NEXT:    v_mov_b32_e32 v2, s2
16; SI-NEXT:    flat_store_short v[0:1], v2
17; SI-NEXT:    s_endpgm
18;
19; VI-LABEL: load_f16_arg:
20; VI:       ; %bb.0:
21; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
22; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
23; VI-NEXT:    s_waitcnt lgkmcnt(0)
24; VI-NEXT:    v_mov_b32_e32 v0, s0
25; VI-NEXT:    v_mov_b32_e32 v1, s1
26; VI-NEXT:    v_mov_b32_e32 v2, s2
27; VI-NEXT:    flat_store_short v[0:1], v2
28; VI-NEXT:    s_endpgm
29  store half %arg, half addrspace(1)* %out
30  ret void
31}
32
33define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
34; SI-LABEL: load_v2f16_arg:
35; SI:       ; %bb.0:
36; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
37; SI-NEXT:    s_load_dword s2, s[4:5], 0x2
38; SI-NEXT:    s_waitcnt lgkmcnt(0)
39; SI-NEXT:    v_mov_b32_e32 v0, s0
40; SI-NEXT:    v_mov_b32_e32 v1, s1
41; SI-NEXT:    v_mov_b32_e32 v2, s2
42; SI-NEXT:    flat_store_dword v[0:1], v2
43; SI-NEXT:    s_endpgm
44;
45; VI-LABEL: load_v2f16_arg:
46; VI:       ; %bb.0:
47; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
48; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
49; VI-NEXT:    s_waitcnt lgkmcnt(0)
50; VI-NEXT:    v_mov_b32_e32 v0, s0
51; VI-NEXT:    v_mov_b32_e32 v1, s1
52; VI-NEXT:    v_mov_b32_e32 v2, s2
53; VI-NEXT:    flat_store_dword v[0:1], v2
54; VI-NEXT:    s_endpgm
55  store <2 x half> %arg, <2 x half> addrspace(1)* %out
56  ret void
57}
58
59define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
60; SI-LABEL: load_v3f16_arg:
61; SI:       ; %bb.0:
62; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
63; SI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
64; SI-NEXT:    s_waitcnt lgkmcnt(0)
65; SI-NEXT:    s_add_u32 s4, s0, 4
66; SI-NEXT:    s_addc_u32 s5, s1, 0
67; SI-NEXT:    v_mov_b32_e32 v2, s4
68; SI-NEXT:    v_mov_b32_e32 v4, s3
69; SI-NEXT:    v_mov_b32_e32 v0, s0
70; SI-NEXT:    v_mov_b32_e32 v3, s5
71; SI-NEXT:    v_mov_b32_e32 v1, s1
72; SI-NEXT:    v_mov_b32_e32 v5, s2
73; SI-NEXT:    flat_store_short v[2:3], v4
74; SI-NEXT:    flat_store_dword v[0:1], v5
75; SI-NEXT:    s_endpgm
76;
77; VI-LABEL: load_v3f16_arg:
78; VI:       ; %bb.0:
79; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
80; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
81; VI-NEXT:    s_waitcnt lgkmcnt(0)
82; VI-NEXT:    s_add_u32 s4, s0, 4
83; VI-NEXT:    s_addc_u32 s5, s1, 0
84; VI-NEXT:    v_mov_b32_e32 v2, s4
85; VI-NEXT:    v_mov_b32_e32 v4, s3
86; VI-NEXT:    v_mov_b32_e32 v0, s0
87; VI-NEXT:    v_mov_b32_e32 v3, s5
88; VI-NEXT:    v_mov_b32_e32 v1, s1
89; VI-NEXT:    v_mov_b32_e32 v5, s2
90; VI-NEXT:    flat_store_short v[2:3], v4
91; VI-NEXT:    flat_store_dword v[0:1], v5
92; VI-NEXT:    s_endpgm
93  store <3 x half> %arg, <3 x half> addrspace(1)* %out
94  ret void
95}
96
97
98; FIXME: Why not one load?
99define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
100; SI-LABEL: load_v4f16_arg:
101; SI:       ; %bb.0:
102; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
103; SI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
104; SI-NEXT:    s_waitcnt lgkmcnt(0)
105; SI-NEXT:    v_mov_b32_e32 v0, s0
106; SI-NEXT:    v_mov_b32_e32 v2, s2
107; SI-NEXT:    v_mov_b32_e32 v1, s1
108; SI-NEXT:    v_mov_b32_e32 v3, s3
109; SI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
110; SI-NEXT:    s_endpgm
111;
112; VI-LABEL: load_v4f16_arg:
113; VI:       ; %bb.0:
114; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
115; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
116; VI-NEXT:    s_waitcnt lgkmcnt(0)
117; VI-NEXT:    v_mov_b32_e32 v0, s0
118; VI-NEXT:    v_mov_b32_e32 v2, s2
119; VI-NEXT:    v_mov_b32_e32 v1, s1
120; VI-NEXT:    v_mov_b32_e32 v3, s3
121; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
122; VI-NEXT:    s_endpgm
123  store <4 x half> %arg, <4 x half> addrspace(1)* %out
124  ret void
125}
126
127define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
128; SI-LABEL: load_v8f16_arg:
129; SI:       ; %bb.0:
130; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
131; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
132; SI-NEXT:    s_waitcnt lgkmcnt(0)
133; SI-NEXT:    v_mov_b32_e32 v4, s6
134; SI-NEXT:    v_mov_b32_e32 v0, s0
135; SI-NEXT:    v_mov_b32_e32 v5, s7
136; SI-NEXT:    v_mov_b32_e32 v1, s1
137; SI-NEXT:    v_mov_b32_e32 v2, s2
138; SI-NEXT:    v_mov_b32_e32 v3, s3
139; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
140; SI-NEXT:    s_endpgm
141;
142; VI-LABEL: load_v8f16_arg:
143; VI:       ; %bb.0:
144; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
145; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
146; VI-NEXT:    s_waitcnt lgkmcnt(0)
147; VI-NEXT:    v_mov_b32_e32 v4, s6
148; VI-NEXT:    v_mov_b32_e32 v0, s0
149; VI-NEXT:    v_mov_b32_e32 v5, s7
150; VI-NEXT:    v_mov_b32_e32 v1, s1
151; VI-NEXT:    v_mov_b32_e32 v2, s2
152; VI-NEXT:    v_mov_b32_e32 v3, s3
153; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
154; VI-NEXT:    s_endpgm
155  store <8 x half> %arg, <8 x half> addrspace(1)* %out
156  ret void
157}
158
159define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
160; SI-LABEL: extload_v2f16_arg:
161; SI:       ; %bb.0:
162; SI-NEXT:    s_load_dword s2, s[4:5], 0x2
163; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
164; SI-NEXT:    s_waitcnt lgkmcnt(0)
165; SI-NEXT:    s_lshr_b32 s3, s2, 16
166; SI-NEXT:    v_cvt_f32_f16_e32 v1, s3
167; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
168; SI-NEXT:    v_mov_b32_e32 v3, s1
169; SI-NEXT:    v_mov_b32_e32 v2, s0
170; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
171; SI-NEXT:    s_endpgm
172;
173; VI-LABEL: extload_v2f16_arg:
174; VI:       ; %bb.0:
175; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
176; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
177; VI-NEXT:    s_waitcnt lgkmcnt(0)
178; VI-NEXT:    s_lshr_b32 s3, s2, 16
179; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
180; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
181; VI-NEXT:    v_mov_b32_e32 v3, s1
182; VI-NEXT:    v_mov_b32_e32 v2, s0
183; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
184; VI-NEXT:    s_endpgm
185  %fpext = fpext <2 x half> %in to <2 x float>
186  store <2 x float> %fpext, <2 x float> addrspace(1)* %out
187  ret void
188}
189
190define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
191; SI-LABEL: extload_f16_to_f32_arg:
192; SI:       ; %bb.0:
193; SI-NEXT:    s_load_dword s2, s[4:5], 0x2
194; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
195; SI-NEXT:    s_waitcnt lgkmcnt(0)
196; SI-NEXT:    v_cvt_f32_f16_e32 v2, s2
197; SI-NEXT:    v_mov_b32_e32 v0, s0
198; SI-NEXT:    v_mov_b32_e32 v1, s1
199; SI-NEXT:    flat_store_dword v[0:1], v2
200; SI-NEXT:    s_endpgm
201;
202; VI-LABEL: extload_f16_to_f32_arg:
203; VI:       ; %bb.0:
204; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
205; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
206; VI-NEXT:    s_waitcnt lgkmcnt(0)
207; VI-NEXT:    v_cvt_f32_f16_e32 v2, s2
208; VI-NEXT:    v_mov_b32_e32 v0, s0
209; VI-NEXT:    v_mov_b32_e32 v1, s1
210; VI-NEXT:    flat_store_dword v[0:1], v2
211; VI-NEXT:    s_endpgm
212  %ext = fpext half %arg to float
213  store float %ext, float addrspace(1)* %out
214  ret void
215}
216
217define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
218; SI-LABEL: extload_v2f16_to_v2f32_arg:
219; SI:       ; %bb.0:
220; SI-NEXT:    s_load_dword s2, s[4:5], 0x2
221; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
222; SI-NEXT:    s_waitcnt lgkmcnt(0)
223; SI-NEXT:    s_lshr_b32 s3, s2, 16
224; SI-NEXT:    v_cvt_f32_f16_e32 v1, s3
225; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
226; SI-NEXT:    v_mov_b32_e32 v3, s1
227; SI-NEXT:    v_mov_b32_e32 v2, s0
228; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
229; SI-NEXT:    s_endpgm
230;
231; VI-LABEL: extload_v2f16_to_v2f32_arg:
232; VI:       ; %bb.0:
233; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
234; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
235; VI-NEXT:    s_waitcnt lgkmcnt(0)
236; VI-NEXT:    s_lshr_b32 s3, s2, 16
237; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
238; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
239; VI-NEXT:    v_mov_b32_e32 v3, s1
240; VI-NEXT:    v_mov_b32_e32 v2, s0
241; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
242; VI-NEXT:    s_endpgm
243  %ext = fpext <2 x half> %arg to <2 x float>
244  store <2 x float> %ext, <2 x float> addrspace(1)* %out
245  ret void
246}
247
248define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
249; SI-LABEL: extload_v3f16_to_v3f32_arg:
250; SI:       ; %bb.0:
251; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
252; SI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
253; SI-NEXT:    s_waitcnt lgkmcnt(0)
254; SI-NEXT:    s_lshr_b32 s4, s0, 16
255; SI-NEXT:    v_cvt_f32_f16_e32 v2, s1
256; SI-NEXT:    v_cvt_f32_f16_e32 v1, s4
257; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
258; SI-NEXT:    v_mov_b32_e32 v4, s3
259; SI-NEXT:    v_mov_b32_e32 v3, s2
260; SI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
261; SI-NEXT:    s_endpgm
262;
263; VI-LABEL: extload_v3f16_to_v3f32_arg:
264; VI:       ; %bb.0:
265; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
266; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
267; VI-NEXT:    s_waitcnt lgkmcnt(0)
268; VI-NEXT:    s_lshr_b32 s4, s0, 16
269; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
270; VI-NEXT:    v_cvt_f32_f16_e32 v1, s4
271; VI-NEXT:    v_cvt_f32_f16_e32 v2, s1
272; VI-NEXT:    v_mov_b32_e32 v4, s3
273; VI-NEXT:    v_mov_b32_e32 v3, s2
274; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
275; VI-NEXT:    s_endpgm
276  %ext = fpext <3 x half> %arg to <3 x float>
277  store <3 x float> %ext, <3 x float> addrspace(1)* %out
278  ret void
279}
280
281define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
282; SI-LABEL: extload_v4f16_to_v4f32_arg:
283; SI:       ; %bb.0:
284; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
285; SI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
286; SI-NEXT:    s_waitcnt lgkmcnt(0)
287; SI-NEXT:    s_lshr_b32 s4, s1, 16
288; SI-NEXT:    s_lshr_b32 s5, s0, 16
289; SI-NEXT:    v_cvt_f32_f16_e32 v2, s1
290; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
291; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
292; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
293; SI-NEXT:    v_mov_b32_e32 v5, s3
294; SI-NEXT:    v_mov_b32_e32 v4, s2
295; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
296; SI-NEXT:    s_endpgm
297;
298; VI-LABEL: extload_v4f16_to_v4f32_arg:
299; VI:       ; %bb.0:
300; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
301; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
302; VI-NEXT:    s_waitcnt lgkmcnt(0)
303; VI-NEXT:    s_lshr_b32 s4, s1, 16
304; VI-NEXT:    s_lshr_b32 s5, s0, 16
305; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
306; VI-NEXT:    v_cvt_f32_f16_e32 v3, s4
307; VI-NEXT:    v_cvt_f32_f16_e32 v1, s5
308; VI-NEXT:    v_cvt_f32_f16_e32 v2, s1
309; VI-NEXT:    v_mov_b32_e32 v5, s3
310; VI-NEXT:    v_mov_b32_e32 v4, s2
311; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
312; VI-NEXT:    s_endpgm
313  %ext = fpext <4 x half> %arg to <4 x float>
314  store <4 x float> %ext, <4 x float> addrspace(1)* %out
315  ret void
316}
317
318define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
319; SI-LABEL: extload_v8f16_to_v8f32_arg:
320; SI:       ; %bb.0:
321; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
322; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
323; SI-NEXT:    s_waitcnt lgkmcnt(0)
324; SI-NEXT:    s_lshr_b32 s6, s1, 16
325; SI-NEXT:    s_lshr_b32 s7, s0, 16
326; SI-NEXT:    s_lshr_b32 s8, s3, 16
327; SI-NEXT:    v_cvt_f32_f16_e32 v3, s6
328; SI-NEXT:    s_lshr_b32 s6, s2, 16
329; SI-NEXT:    v_cvt_f32_f16_e32 v7, s8
330; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
331; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
332; SI-NEXT:    v_cvt_f32_f16_e32 v6, s3
333; SI-NEXT:    v_cvt_f32_f16_e32 v4, s2
334; SI-NEXT:    s_add_u32 s0, s4, 16
335; SI-NEXT:    v_cvt_f32_f16_e32 v2, s1
336; SI-NEXT:    s_addc_u32 s1, s5, 0
337; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
338; SI-NEXT:    v_mov_b32_e32 v9, s1
339; SI-NEXT:    v_mov_b32_e32 v8, s0
340; SI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
341; SI-NEXT:    s_nop 0
342; SI-NEXT:    v_mov_b32_e32 v4, s4
343; SI-NEXT:    v_mov_b32_e32 v5, s5
344; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
345; SI-NEXT:    s_endpgm
346;
347; VI-LABEL: extload_v8f16_to_v8f32_arg:
348; VI:       ; %bb.0:
349; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
350; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
351; VI-NEXT:    s_waitcnt lgkmcnt(0)
352; VI-NEXT:    s_lshr_b32 s6, s1, 16
353; VI-NEXT:    s_lshr_b32 s7, s0, 16
354; VI-NEXT:    s_lshr_b32 s8, s3, 16
355; VI-NEXT:    v_cvt_f32_f16_e32 v3, s6
356; VI-NEXT:    s_lshr_b32 s6, s2, 16
357; VI-NEXT:    v_cvt_f32_f16_e32 v7, s8
358; VI-NEXT:    v_cvt_f32_f16_e32 v5, s6
359; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
360; VI-NEXT:    v_cvt_f32_f16_e32 v6, s3
361; VI-NEXT:    v_cvt_f32_f16_e32 v4, s2
362; VI-NEXT:    s_add_u32 s0, s4, 16
363; VI-NEXT:    v_cvt_f32_f16_e32 v2, s1
364; VI-NEXT:    s_addc_u32 s1, s5, 0
365; VI-NEXT:    v_cvt_f32_f16_e32 v1, s7
366; VI-NEXT:    v_mov_b32_e32 v9, s1
367; VI-NEXT:    v_mov_b32_e32 v8, s0
368; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
369; VI-NEXT:    s_nop 0
370; VI-NEXT:    v_mov_b32_e32 v4, s4
371; VI-NEXT:    v_mov_b32_e32 v5, s5
372; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
373; VI-NEXT:    s_endpgm
374  %ext = fpext <8 x half> %arg to <8 x float>
375  store <8 x float> %ext, <8 x float> addrspace(1)* %out
376  ret void
377}
378
379define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
380; SI-LABEL: extload_f16_to_f64_arg:
381; SI:       ; %bb.0:
382; SI-NEXT:    s_load_dword s0, s[4:5], 0x2
383; SI-NEXT:    s_waitcnt lgkmcnt(0)
384; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
385; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
386; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
387; SI-NEXT:    s_waitcnt lgkmcnt(0)
388; SI-NEXT:    v_mov_b32_e32 v3, s1
389; SI-NEXT:    v_mov_b32_e32 v2, s0
390; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
391; SI-NEXT:    s_endpgm
392;
393; VI-LABEL: extload_f16_to_f64_arg:
394; VI:       ; %bb.0:
395; VI-NEXT:    s_load_dword s0, s[4:5], 0x8
396; VI-NEXT:    s_waitcnt lgkmcnt(0)
397; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
398; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
399; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
400; VI-NEXT:    s_waitcnt lgkmcnt(0)
401; VI-NEXT:    v_mov_b32_e32 v3, s1
402; VI-NEXT:    v_mov_b32_e32 v2, s0
403; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
404; VI-NEXT:    s_endpgm
405  %ext = fpext half %arg to double
406  store double %ext, double addrspace(1)* %out
407  ret void
408}
409
410define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
411; SI-LABEL: extload_v2f16_to_v2f64_arg:
412; SI:       ; %bb.0:
413; SI-NEXT:    s_load_dword s0, s[4:5], 0x2
414; SI-NEXT:    s_waitcnt lgkmcnt(0)
415; SI-NEXT:    s_lshr_b32 s1, s0, 16
416; SI-NEXT:    v_cvt_f32_f16_e32 v0, s1
417; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
418; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
419; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
420; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
421; SI-NEXT:    s_waitcnt lgkmcnt(0)
422; SI-NEXT:    v_mov_b32_e32 v5, s1
423; SI-NEXT:    v_mov_b32_e32 v4, s0
424; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
425; SI-NEXT:    s_endpgm
426;
427; VI-LABEL: extload_v2f16_to_v2f64_arg:
428; VI:       ; %bb.0:
429; VI-NEXT:    s_load_dword s0, s[4:5], 0x8
430; VI-NEXT:    s_waitcnt lgkmcnt(0)
431; VI-NEXT:    s_lshr_b32 s1, s0, 16
432; VI-NEXT:    v_cvt_f32_f16_e32 v0, s1
433; VI-NEXT:    v_cvt_f32_f16_e32 v1, s0
434; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
435; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
436; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
437; VI-NEXT:    s_waitcnt lgkmcnt(0)
438; VI-NEXT:    v_mov_b32_e32 v5, s1
439; VI-NEXT:    v_mov_b32_e32 v4, s0
440; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
441; VI-NEXT:    s_endpgm
442  %ext = fpext <2 x half> %arg to <2 x double>
443  store <2 x double> %ext, <2 x double> addrspace(1)* %out
444  ret void
445}
446
447define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
448; SI-LABEL: extload_v3f16_to_v3f64_arg:
449; SI:       ; %bb.0:
450; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
451; SI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
452; SI-NEXT:    s_waitcnt lgkmcnt(0)
453; SI-NEXT:    v_cvt_f32_f16_e32 v0, s1
454; SI-NEXT:    s_lshr_b32 s4, s0, 16
455; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
456; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
457; SI-NEXT:    s_add_u32 s0, s2, 16
458; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v0
459; SI-NEXT:    s_addc_u32 s1, s3, 0
460; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
461; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
462; SI-NEXT:    v_mov_b32_e32 v7, s1
463; SI-NEXT:    v_mov_b32_e32 v6, s0
464; SI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
465; SI-NEXT:    v_mov_b32_e32 v5, s3
466; SI-NEXT:    v_mov_b32_e32 v4, s2
467; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
468; SI-NEXT:    s_endpgm
469;
470; VI-LABEL: extload_v3f16_to_v3f64_arg:
471; VI:       ; %bb.0:
472; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
473; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
474; VI-NEXT:    s_waitcnt lgkmcnt(0)
475; VI-NEXT:    v_cvt_f32_f16_e32 v1, s1
476; VI-NEXT:    s_lshr_b32 s4, s0, 16
477; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
478; VI-NEXT:    v_cvt_f32_f16_e32 v2, s4
479; VI-NEXT:    s_add_u32 s0, s2, 16
480; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
481; VI-NEXT:    s_addc_u32 s1, s3, 0
482; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
483; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
484; VI-NEXT:    v_mov_b32_e32 v7, s1
485; VI-NEXT:    v_mov_b32_e32 v6, s0
486; VI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
487; VI-NEXT:    v_mov_b32_e32 v5, s3
488; VI-NEXT:    v_mov_b32_e32 v4, s2
489; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
490; VI-NEXT:    s_endpgm
491  %ext = fpext <3 x half> %arg to <3 x double>
492  store <3 x double> %ext, <3 x double> addrspace(1)* %out
493  ret void
494}
495
496define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
497; SI-LABEL: extload_v4f16_to_v4f64_arg:
498; SI:       ; %bb.0:
499; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
500; SI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
501; SI-NEXT:    s_waitcnt lgkmcnt(0)
502; SI-NEXT:    s_lshr_b32 s4, s1, 16
503; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
504; SI-NEXT:    v_cvt_f32_f16_e32 v5, s1
505; SI-NEXT:    s_lshr_b32 s5, s0, 16
506; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
507; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
508; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
509; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
510; SI-NEXT:    s_add_u32 s0, s2, 16
511; SI-NEXT:    s_addc_u32 s1, s3, 0
512; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
513; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
514; SI-NEXT:    v_mov_b32_e32 v9, s1
515; SI-NEXT:    v_mov_b32_e32 v8, s0
516; SI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
517; SI-NEXT:    s_nop 0
518; SI-NEXT:    v_mov_b32_e32 v5, s3
519; SI-NEXT:    v_mov_b32_e32 v4, s2
520; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
521; SI-NEXT:    s_endpgm
522;
523; VI-LABEL: extload_v4f16_to_v4f64_arg:
524; VI:       ; %bb.0:
525; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
526; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
527; VI-NEXT:    s_waitcnt lgkmcnt(0)
528; VI-NEXT:    s_lshr_b32 s5, s1, 16
529; VI-NEXT:    v_cvt_f32_f16_e32 v4, s5
530; VI-NEXT:    v_cvt_f32_f16_e32 v5, s1
531; VI-NEXT:    s_lshr_b32 s4, s0, 16
532; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
533; VI-NEXT:    v_cvt_f32_f16_e32 v2, s4
534; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
535; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
536; VI-NEXT:    s_add_u32 s0, s2, 16
537; VI-NEXT:    s_addc_u32 s1, s3, 0
538; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
539; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
540; VI-NEXT:    v_mov_b32_e32 v9, s1
541; VI-NEXT:    v_mov_b32_e32 v8, s0
542; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
543; VI-NEXT:    s_nop 0
544; VI-NEXT:    v_mov_b32_e32 v5, s3
545; VI-NEXT:    v_mov_b32_e32 v4, s2
546; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
547; VI-NEXT:    s_endpgm
548  %ext = fpext <4 x half> %arg to <4 x double>
549  store <4 x double> %ext, <4 x double> addrspace(1)* %out
550  ret void
551}
552
553define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
554; SI-LABEL: extload_v8f16_to_v8f64_arg:
555; SI:       ; %bb.0:
556; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
557; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
558; SI-NEXT:    s_waitcnt lgkmcnt(0)
559; SI-NEXT:    s_lshr_b32 s6, s3, 16
560; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
561; SI-NEXT:    v_cvt_f32_f16_e32 v12, s3
562; SI-NEXT:    s_lshr_b32 s7, s2, 16
563; SI-NEXT:    s_lshr_b32 s8, s1, 16
564; SI-NEXT:    s_lshr_b32 s6, s0, 16
565; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
566; SI-NEXT:    v_cvt_f32_f16_e32 v8, s2
567; SI-NEXT:    v_cvt_f32_f16_e32 v9, s0
568; SI-NEXT:    s_add_u32 s0, s4, 48
569; SI-NEXT:    v_cvt_f32_f16_e32 v5, s1
570; SI-NEXT:    v_cvt_f64_f32_e32 v[14:15], v0
571; SI-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
572; SI-NEXT:    s_addc_u32 s1, s5, 0
573; SI-NEXT:    v_cvt_f32_f16_e32 v4, s8
574; SI-NEXT:    v_mov_b32_e32 v17, s1
575; SI-NEXT:    v_mov_b32_e32 v16, s0
576; SI-NEXT:    s_add_u32 s0, s4, 32
577; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
578; SI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v1
579; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
580; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
581; SI-NEXT:    s_addc_u32 s1, s5, 0
582; SI-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
583; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
584; SI-NEXT:    v_mov_b32_e32 v13, s1
585; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
586; SI-NEXT:    v_mov_b32_e32 v12, s0
587; SI-NEXT:    s_add_u32 s0, s4, 16
588; SI-NEXT:    s_addc_u32 s1, s5, 0
589; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
590; SI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
591; SI-NEXT:    s_nop 0
592; SI-NEXT:    v_mov_b32_e32 v9, s1
593; SI-NEXT:    v_mov_b32_e32 v8, s0
594; SI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
595; SI-NEXT:    s_nop 0
596; SI-NEXT:    v_mov_b32_e32 v4, s4
597; SI-NEXT:    v_mov_b32_e32 v5, s5
598; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
599; SI-NEXT:    s_endpgm
600;
601; VI-LABEL: extload_v8f16_to_v8f64_arg:
602; VI:       ; %bb.0:
603; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
604; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
605; VI-NEXT:    s_waitcnt lgkmcnt(0)
606; VI-NEXT:    s_lshr_b32 s6, s0, 16
607; VI-NEXT:    s_lshr_b32 s8, s2, 16
608; VI-NEXT:    s_lshr_b32 s9, s3, 16
609; VI-NEXT:    v_cvt_f32_f16_e32 v0, s6
610; VI-NEXT:    v_cvt_f32_f16_e32 v4, s8
611; VI-NEXT:    v_cvt_f32_f16_e32 v5, s9
612; VI-NEXT:    v_cvt_f32_f16_e32 v12, s3
613; VI-NEXT:    s_lshr_b32 s7, s1, 16
614; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
615; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
616; VI-NEXT:    v_cvt_f32_f16_e32 v8, s2
617; VI-NEXT:    s_add_u32 s0, s4, 48
618; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v4
619; VI-NEXT:    v_cvt_f64_f32_e32 v[14:15], v5
620; VI-NEXT:    v_cvt_f32_f16_e32 v4, s1
621; VI-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
622; VI-NEXT:    s_addc_u32 s1, s5, 0
623; VI-NEXT:    v_cvt_f32_f16_e32 v1, s7
624; VI-NEXT:    v_mov_b32_e32 v17, s1
625; VI-NEXT:    v_mov_b32_e32 v16, s0
626; VI-NEXT:    s_add_u32 s0, s4, 32
627; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
628; VI-NEXT:    s_addc_u32 s1, s5, 0
629; VI-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
630; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v1
631; VI-NEXT:    v_mov_b32_e32 v13, s1
632; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
633; VI-NEXT:    v_mov_b32_e32 v12, s0
634; VI-NEXT:    s_add_u32 s0, s4, 16
635; VI-NEXT:    s_addc_u32 s1, s5, 0
636; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
637; VI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
638; VI-NEXT:    s_nop 0
639; VI-NEXT:    v_mov_b32_e32 v9, s1
640; VI-NEXT:    v_mov_b32_e32 v8, s0
641; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
642; VI-NEXT:    s_nop 0
643; VI-NEXT:    v_mov_b32_e32 v4, s4
644; VI-NEXT:    v_mov_b32_e32 v5, s5
645; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
646; VI-NEXT:    s_endpgm
647  %ext = fpext <8 x half> %arg to <8 x double>
648  store <8 x double> %ext, <8 x double> addrspace(1)* %out
649  ret void
650}
651
652define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
653; GCN-LABEL: global_load_store_f16:
654; GCN:       ; %bb.0:
655; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
656; GCN-NEXT:    s_waitcnt lgkmcnt(0)
657; GCN-NEXT:    v_mov_b32_e32 v0, s2
658; GCN-NEXT:    v_mov_b32_e32 v1, s3
659; GCN-NEXT:    flat_load_ushort v2, v[0:1]
660; GCN-NEXT:    v_mov_b32_e32 v0, s0
661; GCN-NEXT:    v_mov_b32_e32 v1, s1
662; GCN-NEXT:    s_waitcnt vmcnt(0)
663; GCN-NEXT:    flat_store_short v[0:1], v2
664; GCN-NEXT:    s_endpgm
665  %val = load half, half addrspace(1)* %in
666  store half %val, half addrspace(1)* %out
667  ret void
668}
669
670define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
671; GCN-LABEL: global_load_store_v2f16:
672; GCN:       ; %bb.0:
673; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
674; GCN-NEXT:    s_waitcnt lgkmcnt(0)
675; GCN-NEXT:    v_mov_b32_e32 v0, s2
676; GCN-NEXT:    v_mov_b32_e32 v1, s3
677; GCN-NEXT:    flat_load_dword v2, v[0:1]
678; GCN-NEXT:    v_mov_b32_e32 v0, s0
679; GCN-NEXT:    v_mov_b32_e32 v1, s1
680; GCN-NEXT:    s_waitcnt vmcnt(0)
681; GCN-NEXT:    flat_store_dword v[0:1], v2
682; GCN-NEXT:    s_endpgm
683  %val = load <2 x half>, <2 x half> addrspace(1)* %in
684  store <2 x half> %val, <2 x half> addrspace(1)* %out
685  ret void
686}
687
688define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
689; GCN-LABEL: global_load_store_v4f16:
690; GCN:       ; %bb.0:
691; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
692; GCN-NEXT:    s_waitcnt lgkmcnt(0)
693; GCN-NEXT:    v_mov_b32_e32 v0, s0
694; GCN-NEXT:    v_mov_b32_e32 v1, s1
695; GCN-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
696; GCN-NEXT:    v_mov_b32_e32 v2, s2
697; GCN-NEXT:    v_mov_b32_e32 v3, s3
698; GCN-NEXT:    s_waitcnt vmcnt(0)
699; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
700; GCN-NEXT:    s_endpgm
701  %val = load <4 x half>, <4 x half> addrspace(1)* %in
702  store <4 x half> %val, <4 x half> addrspace(1)* %out
703  ret void
704}
705
706define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
707; GCN-LABEL: global_load_store_v8f16:
708; GCN:       ; %bb.0:
709; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
710; GCN-NEXT:    s_waitcnt lgkmcnt(0)
711; GCN-NEXT:    v_mov_b32_e32 v0, s2
712; GCN-NEXT:    v_mov_b32_e32 v1, s3
713; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
714; GCN-NEXT:    v_mov_b32_e32 v4, s0
715; GCN-NEXT:    v_mov_b32_e32 v5, s1
716; GCN-NEXT:    s_waitcnt vmcnt(0)
717; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
718; GCN-NEXT:    s_endpgm
719  %val = load <8 x half>, <8 x half> addrspace(1)* %in
720  store <8 x half> %val, <8 x half> addrspace(1)* %out
721  ret void
722}
723
724define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
725; GCN-LABEL: global_extload_f16_to_f32:
726; GCN:       ; %bb.0:
727; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
728; GCN-NEXT:    s_waitcnt lgkmcnt(0)
729; GCN-NEXT:    v_mov_b32_e32 v0, s2
730; GCN-NEXT:    v_mov_b32_e32 v1, s3
731; GCN-NEXT:    flat_load_ushort v0, v[0:1]
732; GCN-NEXT:    v_mov_b32_e32 v1, s1
733; GCN-NEXT:    s_waitcnt vmcnt(0)
734; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v0
735; GCN-NEXT:    v_mov_b32_e32 v0, s0
736; GCN-NEXT:    flat_store_dword v[0:1], v2
737; GCN-NEXT:    s_endpgm
738  %val = load half, half addrspace(1)* %in
739  %cvt = fpext half %val to float
740  store float %cvt, float addrspace(1)* %out
741  ret void
742}
743
744define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
745; SI-LABEL: global_extload_v2f16_to_v2f32:
746; SI:       ; %bb.0:
747; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
748; SI-NEXT:    s_waitcnt lgkmcnt(0)
749; SI-NEXT:    v_mov_b32_e32 v0, s2
750; SI-NEXT:    v_mov_b32_e32 v1, s3
751; SI-NEXT:    flat_load_dword v1, v[0:1]
752; SI-NEXT:    v_mov_b32_e32 v2, s0
753; SI-NEXT:    v_mov_b32_e32 v3, s1
754; SI-NEXT:    s_waitcnt vmcnt(0)
755; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
756; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
757; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
758; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
759; SI-NEXT:    s_endpgm
760;
761; VI-LABEL: global_extload_v2f16_to_v2f32:
762; VI:       ; %bb.0:
763; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
764; VI-NEXT:    s_waitcnt lgkmcnt(0)
765; VI-NEXT:    v_mov_b32_e32 v0, s2
766; VI-NEXT:    v_mov_b32_e32 v1, s3
767; VI-NEXT:    flat_load_dword v1, v[0:1]
768; VI-NEXT:    v_mov_b32_e32 v2, s0
769; VI-NEXT:    v_mov_b32_e32 v3, s1
770; VI-NEXT:    s_waitcnt vmcnt(0)
771; VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
772; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
773; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
774; VI-NEXT:    s_endpgm
775  %val = load <2 x half>, <2 x half> addrspace(1)* %in
776  %cvt = fpext <2 x half> %val to <2 x float>
777  store <2 x float> %cvt, <2 x float> addrspace(1)* %out
778  ret void
779}
780
781define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
782; SI-LABEL: global_extload_v3f16_to_v3f32:
783; SI:       ; %bb.0:
784; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
785; SI-NEXT:    s_waitcnt lgkmcnt(0)
786; SI-NEXT:    v_mov_b32_e32 v0, s2
787; SI-NEXT:    v_mov_b32_e32 v1, s3
788; SI-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
789; SI-NEXT:    v_mov_b32_e32 v3, s0
790; SI-NEXT:    v_mov_b32_e32 v4, s1
791; SI-NEXT:    s_waitcnt vmcnt(0)
792; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
793; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
794; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
795; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
796; SI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
797; SI-NEXT:    s_endpgm
798;
799; VI-LABEL: global_extload_v3f16_to_v3f32:
800; VI:       ; %bb.0:
801; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
802; VI-NEXT:    s_waitcnt lgkmcnt(0)
803; VI-NEXT:    v_mov_b32_e32 v0, s2
804; VI-NEXT:    v_mov_b32_e32 v1, s3
805; VI-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
806; VI-NEXT:    v_mov_b32_e32 v3, s0
807; VI-NEXT:    v_mov_b32_e32 v4, s1
808; VI-NEXT:    s_waitcnt vmcnt(0)
809; VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
810; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
811; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
812; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
813; VI-NEXT:    s_endpgm
814  %val = load <3 x half>, <3 x half> addrspace(1)* %in
815  %cvt = fpext <3 x half> %val to <3 x float>
816  store <3 x float> %cvt, <3 x float> addrspace(1)* %out
817  ret void
818}
819
820define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
821; SI-LABEL: global_extload_v4f16_to_v4f32:
822; SI:       ; %bb.0:
823; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
824; SI-NEXT:    s_waitcnt lgkmcnt(0)
825; SI-NEXT:    v_mov_b32_e32 v0, s2
826; SI-NEXT:    v_mov_b32_e32 v1, s3
827; SI-NEXT:    flat_load_dwordx2 v[3:4], v[0:1]
828; SI-NEXT:    v_mov_b32_e32 v5, s1
829; SI-NEXT:    s_waitcnt vmcnt(0)
830; SI-NEXT:    v_cvt_f32_f16_e32 v2, v4
831; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
832; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
833; SI-NEXT:    v_cvt_f32_f16_e32 v0, v3
834; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
835; SI-NEXT:    v_cvt_f32_f16_e32 v1, v4
836; SI-NEXT:    v_mov_b32_e32 v4, s0
837; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
838; SI-NEXT:    s_endpgm
839;
840; VI-LABEL: global_extload_v4f16_to_v4f32:
841; VI:       ; %bb.0:
842; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
843; VI-NEXT:    s_waitcnt lgkmcnt(0)
844; VI-NEXT:    v_mov_b32_e32 v0, s2
845; VI-NEXT:    v_mov_b32_e32 v1, s3
846; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
847; VI-NEXT:    s_waitcnt vmcnt(0)
848; VI-NEXT:    v_cvt_f32_f16_e32 v0, v4
849; VI-NEXT:    v_cvt_f32_f16_e32 v2, v5
850; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
851; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
852; VI-NEXT:    v_mov_b32_e32 v4, s0
853; VI-NEXT:    v_mov_b32_e32 v5, s1
854; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
855; VI-NEXT:    s_endpgm
856  %val = load <4 x half>, <4 x half> addrspace(1)* %in
857  %cvt = fpext <4 x half> %val to <4 x float>
858  store <4 x float> %cvt, <4 x float> addrspace(1)* %out
859  ret void
860}
861
862define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
863; SI-LABEL: global_extload_v8f16_to_v8f32:
864; SI:       ; %bb.0:
865; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
866; SI-NEXT:    s_waitcnt lgkmcnt(0)
867; SI-NEXT:    v_mov_b32_e32 v0, s2
868; SI-NEXT:    v_mov_b32_e32 v1, s3
869; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
870; SI-NEXT:    s_add_u32 s2, s0, 16
871; SI-NEXT:    s_addc_u32 s3, s1, 0
872; SI-NEXT:    v_mov_b32_e32 v13, s1
873; SI-NEXT:    v_mov_b32_e32 v12, s0
874; SI-NEXT:    s_waitcnt vmcnt(0)
875; SI-NEXT:    v_cvt_f32_f16_e32 v10, v3
876; SI-NEXT:    v_cvt_f32_f16_e32 v8, v2
877; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
878; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
879; SI-NEXT:    v_cvt_f32_f16_e32 v6, v1
880; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
881; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
882; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
883; SI-NEXT:    v_cvt_f32_f16_e32 v11, v3
884; SI-NEXT:    v_cvt_f32_f16_e32 v9, v2
885; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
886; SI-NEXT:    v_cvt_f32_f16_e32 v5, v0
887; SI-NEXT:    v_mov_b32_e32 v0, s2
888; SI-NEXT:    v_mov_b32_e32 v1, s3
889; SI-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
890; SI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
891; SI-NEXT:    s_endpgm
892;
893; VI-LABEL: global_extload_v8f16_to_v8f32:
894; VI:       ; %bb.0:
895; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
896; VI-NEXT:    s_waitcnt lgkmcnt(0)
897; VI-NEXT:    v_mov_b32_e32 v0, s2
898; VI-NEXT:    v_mov_b32_e32 v1, s3
899; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
900; VI-NEXT:    s_add_u32 s2, s0, 16
901; VI-NEXT:    s_addc_u32 s3, s1, 0
902; VI-NEXT:    v_mov_b32_e32 v13, s1
903; VI-NEXT:    v_mov_b32_e32 v12, s0
904; VI-NEXT:    s_waitcnt vmcnt(0)
905; VI-NEXT:    v_cvt_f32_f16_e32 v10, v3
906; VI-NEXT:    v_cvt_f32_f16_e32 v8, v2
907; VI-NEXT:    v_cvt_f32_f16_sdwa v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
908; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
909; VI-NEXT:    v_cvt_f32_f16_e32 v6, v1
910; VI-NEXT:    v_cvt_f32_f16_e32 v4, v0
911; VI-NEXT:    v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
912; VI-NEXT:    v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
913; VI-NEXT:    v_mov_b32_e32 v0, s2
914; VI-NEXT:    v_mov_b32_e32 v1, s3
915; VI-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
916; VI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
917; VI-NEXT:    s_endpgm
918  %val = load <8 x half>, <8 x half> addrspace(1)* %in
919  %cvt = fpext <8 x half> %val to <8 x float>
920  store <8 x float> %cvt, <8 x float> addrspace(1)* %out
921  ret void
922}
923
924define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
925; SI-LABEL: global_extload_v16f16_to_v16f32:
926; SI:       ; %bb.0:
927; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
928; SI-NEXT:    s_waitcnt lgkmcnt(0)
929; SI-NEXT:    s_add_u32 s4, s2, 16
930; SI-NEXT:    v_mov_b32_e32 v5, s3
931; SI-NEXT:    s_addc_u32 s5, s3, 0
932; SI-NEXT:    v_mov_b32_e32 v0, s4
933; SI-NEXT:    v_mov_b32_e32 v4, s2
934; SI-NEXT:    v_mov_b32_e32 v1, s5
935; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
936; SI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
937; SI-NEXT:    s_add_u32 s2, s0, 16
938; SI-NEXT:    s_addc_u32 s3, s1, 0
939; SI-NEXT:    v_mov_b32_e32 v14, s3
940; SI-NEXT:    v_mov_b32_e32 v13, s2
941; SI-NEXT:    s_add_u32 s2, s0, 48
942; SI-NEXT:    s_addc_u32 s3, s1, 0
943; SI-NEXT:    s_waitcnt vmcnt(1)
944; SI-NEXT:    v_cvt_f32_f16_e32 v8, v1
945; SI-NEXT:    s_waitcnt vmcnt(0)
946; SI-NEXT:    v_cvt_f32_f16_e32 v11, v7
947; SI-NEXT:    v_cvt_f32_f16_e32 v9, v6
948; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
949; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
950; SI-NEXT:    v_cvt_f32_f16_e32 v12, v7
951; SI-NEXT:    v_cvt_f32_f16_e32 v10, v6
952; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
953; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
954; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
955; SI-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
956; SI-NEXT:    v_cvt_f32_f16_e32 v6, v0
957; SI-NEXT:    v_cvt_f32_f16_e32 v12, v3
958; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
959; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
960; SI-NEXT:    v_cvt_f32_f16_e32 v10, v2
961; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
962; SI-NEXT:    v_cvt_f32_f16_e32 v2, v5
963; SI-NEXT:    v_cvt_f32_f16_e32 v0, v4
964; SI-NEXT:    v_mov_b32_e32 v5, s1
965; SI-NEXT:    v_cvt_f32_f16_e32 v9, v1
966; SI-NEXT:    v_cvt_f32_f16_e32 v13, v3
967; SI-NEXT:    v_cvt_f32_f16_e32 v3, v16
968; SI-NEXT:    v_cvt_f32_f16_e32 v1, v17
969; SI-NEXT:    v_mov_b32_e32 v4, s0
970; SI-NEXT:    s_add_u32 s0, s0, 32
971; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
972; SI-NEXT:    s_addc_u32 s1, s1, 0
973; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
974; SI-NEXT:    v_mov_b32_e32 v15, s3
975; SI-NEXT:    v_mov_b32_e32 v17, s1
976; SI-NEXT:    v_mov_b32_e32 v14, s2
977; SI-NEXT:    v_mov_b32_e32 v16, s0
978; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
979; SI-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
980; SI-NEXT:    flat_store_dwordx4 v[16:17], v[6:9]
981; SI-NEXT:    s_endpgm
982;
983; VI-LABEL: global_extload_v16f16_to_v16f32:
984; VI:       ; %bb.0:
985; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
986; VI-NEXT:    s_waitcnt lgkmcnt(0)
987; VI-NEXT:    v_mov_b32_e32 v0, s2
988; VI-NEXT:    v_mov_b32_e32 v1, s3
989; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
990; VI-NEXT:    s_add_u32 s2, s2, 16
991; VI-NEXT:    s_addc_u32 s3, s3, 0
992; VI-NEXT:    v_mov_b32_e32 v5, s3
993; VI-NEXT:    v_mov_b32_e32 v4, s2
994; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
995; VI-NEXT:    s_add_u32 s2, s0, 16
996; VI-NEXT:    s_addc_u32 s3, s1, 0
997; VI-NEXT:    v_mov_b32_e32 v19, s3
998; VI-NEXT:    v_mov_b32_e32 v18, s2
999; VI-NEXT:    s_add_u32 s2, s0, 48
1000; VI-NEXT:    v_mov_b32_e32 v17, s1
1001; VI-NEXT:    s_addc_u32 s3, s1, 0
1002; VI-NEXT:    v_mov_b32_e32 v16, s0
1003; VI-NEXT:    s_add_u32 s0, s0, 32
1004; VI-NEXT:    s_addc_u32 s1, s1, 0
1005; VI-NEXT:    v_mov_b32_e32 v21, s3
1006; VI-NEXT:    v_mov_b32_e32 v20, s2
1007; VI-NEXT:    s_waitcnt vmcnt(1)
1008; VI-NEXT:    v_cvt_f32_f16_e32 v14, v3
1009; VI-NEXT:    v_cvt_f32_f16_e32 v12, v2
1010; VI-NEXT:    v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1011; VI-NEXT:    v_cvt_f32_f16_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1012; VI-NEXT:    v_cvt_f32_f16_e32 v10, v1
1013; VI-NEXT:    v_cvt_f32_f16_e32 v8, v0
1014; VI-NEXT:    v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1015; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1016; VI-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
1017; VI-NEXT:    s_waitcnt vmcnt(1)
1018; VI-NEXT:    v_cvt_f32_f16_e32 v2, v5
1019; VI-NEXT:    v_cvt_f32_f16_e32 v14, v7
1020; VI-NEXT:    v_cvt_f32_f16_e32 v12, v6
1021; VI-NEXT:    v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1022; VI-NEXT:    v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1023; VI-NEXT:    v_cvt_f32_f16_e32 v0, v4
1024; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1025; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1026; VI-NEXT:    v_mov_b32_e32 v5, s1
1027; VI-NEXT:    v_mov_b32_e32 v4, s0
1028; VI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
1029; VI-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
1030; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1031; VI-NEXT:    s_endpgm
1032  %val = load <16 x half>, <16 x half> addrspace(1)* %in
1033  %cvt = fpext <16 x half> %val to <16 x float>
1034  store <16 x float> %cvt, <16 x float> addrspace(1)* %out
1035  ret void
1036}
1037
1038define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
1039; GCN-LABEL: global_extload_f16_to_f64:
1040; GCN:       ; %bb.0:
1041; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1042; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1043; GCN-NEXT:    v_mov_b32_e32 v0, s2
1044; GCN-NEXT:    v_mov_b32_e32 v1, s3
1045; GCN-NEXT:    flat_load_ushort v0, v[0:1]
1046; GCN-NEXT:    v_mov_b32_e32 v2, s0
1047; GCN-NEXT:    v_mov_b32_e32 v3, s1
1048; GCN-NEXT:    s_waitcnt vmcnt(0)
1049; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
1050; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
1051; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1052; GCN-NEXT:    s_endpgm
1053  %val = load half, half addrspace(1)* %in
1054  %cvt = fpext half %val to double
1055  store double %cvt, double addrspace(1)* %out
1056  ret void
1057}
1058
1059define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
1060; SI-LABEL: global_extload_v2f16_to_v2f64:
1061; SI:       ; %bb.0:
1062; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1063; SI-NEXT:    s_waitcnt lgkmcnt(0)
1064; SI-NEXT:    v_mov_b32_e32 v0, s2
1065; SI-NEXT:    v_mov_b32_e32 v1, s3
1066; SI-NEXT:    flat_load_dword v0, v[0:1]
1067; SI-NEXT:    v_mov_b32_e32 v4, s0
1068; SI-NEXT:    v_mov_b32_e32 v5, s1
1069; SI-NEXT:    s_waitcnt vmcnt(0)
1070; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1071; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1072; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
1073; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
1074; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1075; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1076; SI-NEXT:    s_endpgm
1077;
1078; VI-LABEL: global_extload_v2f16_to_v2f64:
1079; VI:       ; %bb.0:
1080; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1081; VI-NEXT:    s_waitcnt lgkmcnt(0)
1082; VI-NEXT:    v_mov_b32_e32 v0, s2
1083; VI-NEXT:    v_mov_b32_e32 v1, s3
1084; VI-NEXT:    flat_load_dword v0, v[0:1]
1085; VI-NEXT:    v_mov_b32_e32 v4, s0
1086; VI-NEXT:    v_mov_b32_e32 v5, s1
1087; VI-NEXT:    s_waitcnt vmcnt(0)
1088; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1089; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1090; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
1091; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1092; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1093; VI-NEXT:    s_endpgm
1094  %val = load <2 x half>, <2 x half> addrspace(1)* %in
1095  %cvt = fpext <2 x half> %val to <2 x double>
1096  store <2 x double> %cvt, <2 x double> addrspace(1)* %out
1097  ret void
1098}
1099
1100define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
1101; SI-LABEL: global_extload_v3f16_to_v3f64:
1102; SI:       ; %bb.0:
1103; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1104; SI-NEXT:    s_waitcnt lgkmcnt(0)
1105; SI-NEXT:    v_mov_b32_e32 v0, s2
1106; SI-NEXT:    v_mov_b32_e32 v1, s3
1107; SI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1108; SI-NEXT:    s_add_u32 s2, s0, 16
1109; SI-NEXT:    s_addc_u32 s3, s1, 0
1110; SI-NEXT:    v_mov_b32_e32 v7, s3
1111; SI-NEXT:    v_mov_b32_e32 v6, s2
1112; SI-NEXT:    s_waitcnt vmcnt(0)
1113; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1114; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
1115; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1116; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1117; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
1118; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
1119; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1120; SI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
1121; SI-NEXT:    v_mov_b32_e32 v5, s1
1122; SI-NEXT:    v_mov_b32_e32 v4, s0
1123; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1124; SI-NEXT:    s_endpgm
1125;
1126; VI-LABEL: global_extload_v3f16_to_v3f64:
1127; VI:       ; %bb.0:
1128; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1129; VI-NEXT:    s_waitcnt lgkmcnt(0)
1130; VI-NEXT:    v_mov_b32_e32 v0, s2
1131; VI-NEXT:    v_mov_b32_e32 v1, s3
1132; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1133; VI-NEXT:    s_add_u32 s2, s0, 16
1134; VI-NEXT:    s_addc_u32 s3, s1, 0
1135; VI-NEXT:    v_mov_b32_e32 v5, s1
1136; VI-NEXT:    v_mov_b32_e32 v4, s0
1137; VI-NEXT:    s_waitcnt vmcnt(0)
1138; VI-NEXT:    v_cvt_f32_f16_e32 v3, v1
1139; VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1140; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1141; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v3
1142; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
1143; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
1144; VI-NEXT:    v_mov_b32_e32 v9, s3
1145; VI-NEXT:    v_mov_b32_e32 v8, s2
1146; VI-NEXT:    flat_store_dwordx2 v[8:9], v[6:7]
1147; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1148; VI-NEXT:    s_endpgm
1149  %val = load <3 x half>, <3 x half> addrspace(1)* %in
1150  %cvt = fpext <3 x half> %val to <3 x double>
1151  store <3 x double> %cvt, <3 x double> addrspace(1)* %out
1152  ret void
1153}
1154
1155define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
1156; SI-LABEL: global_extload_v4f16_to_v4f64:
1157; SI:       ; %bb.0:
1158; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1159; SI-NEXT:    s_waitcnt lgkmcnt(0)
1160; SI-NEXT:    v_mov_b32_e32 v0, s2
1161; SI-NEXT:    v_mov_b32_e32 v1, s3
1162; SI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1163; SI-NEXT:    s_add_u32 s2, s0, 16
1164; SI-NEXT:    s_addc_u32 s3, s1, 0
1165; SI-NEXT:    v_mov_b32_e32 v9, s1
1166; SI-NEXT:    v_mov_b32_e32 v8, s0
1167; SI-NEXT:    s_waitcnt vmcnt(0)
1168; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
1169; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
1170; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1171; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1172; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1173; SI-NEXT:    v_cvt_f32_f16_e32 v10, v0
1174; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
1175; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
1176; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v2
1177; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v10
1178; SI-NEXT:    v_mov_b32_e32 v11, s3
1179; SI-NEXT:    v_mov_b32_e32 v10, s2
1180; SI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
1181; SI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1182; SI-NEXT:    s_endpgm
1183;
1184; VI-LABEL: global_extload_v4f16_to_v4f64:
1185; VI:       ; %bb.0:
1186; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1187; VI-NEXT:    s_waitcnt lgkmcnt(0)
1188; VI-NEXT:    v_mov_b32_e32 v0, s2
1189; VI-NEXT:    v_mov_b32_e32 v1, s3
1190; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1191; VI-NEXT:    s_add_u32 s2, s0, 16
1192; VI-NEXT:    s_addc_u32 s3, s1, 0
1193; VI-NEXT:    v_mov_b32_e32 v9, s1
1194; VI-NEXT:    v_mov_b32_e32 v8, s0
1195; VI-NEXT:    s_waitcnt vmcnt(0)
1196; VI-NEXT:    v_cvt_f32_f16_e32 v3, v1
1197; VI-NEXT:    v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1198; VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1199; VI-NEXT:    v_cvt_f32_f16_sdwa v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1200; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
1201; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
1202; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
1203; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v10
1204; VI-NEXT:    v_mov_b32_e32 v11, s3
1205; VI-NEXT:    v_mov_b32_e32 v10, s2
1206; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
1207; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1208; VI-NEXT:    s_endpgm
1209  %val = load <4 x half>, <4 x half> addrspace(1)* %in
1210  %cvt = fpext <4 x half> %val to <4 x double>
1211  store <4 x double> %cvt, <4 x double> addrspace(1)* %out
1212  ret void
1213}
1214
1215define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
1216; SI-LABEL: global_extload_v8f16_to_v8f64:
1217; SI:       ; %bb.0:
1218; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1219; SI-NEXT:    s_waitcnt lgkmcnt(0)
1220; SI-NEXT:    v_mov_b32_e32 v0, s2
1221; SI-NEXT:    v_mov_b32_e32 v1, s3
1222; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1223; SI-NEXT:    s_add_u32 s2, s0, 48
1224; SI-NEXT:    s_addc_u32 s3, s1, 0
1225; SI-NEXT:    v_mov_b32_e32 v7, s3
1226; SI-NEXT:    v_mov_b32_e32 v6, s2
1227; SI-NEXT:    s_add_u32 s2, s0, 32
1228; SI-NEXT:    v_mov_b32_e32 v13, s1
1229; SI-NEXT:    s_addc_u32 s3, s1, 0
1230; SI-NEXT:    v_mov_b32_e32 v12, s0
1231; SI-NEXT:    s_add_u32 s0, s0, 16
1232; SI-NEXT:    v_mov_b32_e32 v15, s3
1233; SI-NEXT:    s_addc_u32 s1, s1, 0
1234; SI-NEXT:    v_mov_b32_e32 v14, s2
1235; SI-NEXT:    s_waitcnt vmcnt(0)
1236; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
1237; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1238; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
1239; SI-NEXT:    v_cvt_f32_f16_e32 v8, v2
1240; SI-NEXT:    v_cvt_f32_f16_e32 v2, v4
1241; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
1242; SI-NEXT:    v_cvt_f32_f16_e32 v10, v1
1243; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
1244; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
1245; SI-NEXT:    v_cvt_f32_f16_e32 v16, v5
1246; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
1247; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1248; SI-NEXT:    v_cvt_f32_f16_e32 v17, v9
1249; SI-NEXT:    v_cvt_f32_f16_e32 v18, v11
1250; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
1251; SI-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
1252; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
1253; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v10
1254; SI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v16
1255; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
1256; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v18
1257; SI-NEXT:    v_mov_b32_e32 v17, s1
1258; SI-NEXT:    v_mov_b32_e32 v16, s0
1259; SI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
1260; SI-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
1261; SI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
1262; SI-NEXT:    s_endpgm
1263;
1264; VI-LABEL: global_extload_v8f16_to_v8f64:
1265; VI:       ; %bb.0:
1266; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1267; VI-NEXT:    s_waitcnt lgkmcnt(0)
1268; VI-NEXT:    v_mov_b32_e32 v0, s2
1269; VI-NEXT:    v_mov_b32_e32 v1, s3
1270; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1271; VI-NEXT:    s_add_u32 s2, s0, 48
1272; VI-NEXT:    s_addc_u32 s3, s1, 0
1273; VI-NEXT:    v_mov_b32_e32 v8, s3
1274; VI-NEXT:    v_mov_b32_e32 v7, s2
1275; VI-NEXT:    s_add_u32 s2, s0, 32
1276; VI-NEXT:    v_mov_b32_e32 v13, s1
1277; VI-NEXT:    s_addc_u32 s3, s1, 0
1278; VI-NEXT:    v_mov_b32_e32 v12, s0
1279; VI-NEXT:    s_add_u32 s0, s0, 16
1280; VI-NEXT:    v_mov_b32_e32 v15, s3
1281; VI-NEXT:    s_addc_u32 s1, s1, 0
1282; VI-NEXT:    v_mov_b32_e32 v14, s2
1283; VI-NEXT:    s_waitcnt vmcnt(0)
1284; VI-NEXT:    v_cvt_f32_f16_e32 v9, v0
1285; VI-NEXT:    v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1286; VI-NEXT:    v_cvt_f32_f16_e32 v0, v3
1287; VI-NEXT:    v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1288; VI-NEXT:    v_cvt_f32_f16_e32 v10, v1
1289; VI-NEXT:    v_cvt_f32_f16_e32 v11, v2
1290; VI-NEXT:    v_cvt_f64_f32_e32 v[3:4], v0
1291; VI-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
1292; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1293; VI-NEXT:    v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1294; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
1295; VI-NEXT:    flat_store_dwordx4 v[7:8], v[3:6]
1296; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v11
1297; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
1298; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
1299; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v17
1300; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v16
1301; VI-NEXT:    v_mov_b32_e32 v17, s1
1302; VI-NEXT:    v_mov_b32_e32 v16, s0
1303; VI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
1304; VI-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
1305; VI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
1306; VI-NEXT:    s_endpgm
1307  %val = load <8 x half>, <8 x half> addrspace(1)* %in
1308  %cvt = fpext <8 x half> %val to <8 x double>
1309  store <8 x double> %cvt, <8 x double> addrspace(1)* %out
1310  ret void
1311}
1312
define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
; SI-LABEL: global_extload_v16f16_to_v16f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; SI-NEXT:    s_add_u32 s2, s2, 16
; SI-NEXT:    s_addc_u32 s3, s3, 0
; SI-NEXT:    v_mov_b32_e32 v5, s3
; SI-NEXT:    v_mov_b32_e32 v4, s2
; SI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; SI-NEXT:    s_add_u32 s2, s0, 48
; SI-NEXT:    s_addc_u32 s3, s1, 0
; SI-NEXT:    v_mov_b32_e32 v15, s3
; SI-NEXT:    v_mov_b32_e32 v14, s2
; SI-NEXT:    s_add_u32 s2, s0, 32
; SI-NEXT:    s_addc_u32 s3, s1, 0
; SI-NEXT:    v_mov_b32_e32 v17, s3
; SI-NEXT:    v_mov_b32_e32 v16, s2
; SI-NEXT:    s_add_u32 s2, s0, 16
; SI-NEXT:    s_addc_u32 s3, s1, 0
; SI-NEXT:    v_mov_b32_e32 v19, s3
; SI-NEXT:    v_mov_b32_e32 v18, s2
; SI-NEXT:    s_add_u32 s2, s0, 0x70
; SI-NEXT:    s_addc_u32 s3, s1, 0
; SI-NEXT:    v_mov_b32_e32 v13, s1
; SI-NEXT:    v_mov_b32_e32 v12, s0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v10, v8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v3
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v21, v5
; SI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
; SI-NEXT:    v_mov_b32_e32 v15, s3
; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
; SI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v3
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
; SI-NEXT:    v_mov_b32_e32 v14, s2
; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v9, v0
; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; SI-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
; SI-NEXT:    v_cvt_f32_f16_e32 v8, v10
; SI-NEXT:    s_add_u32 s2, s0, 0x60
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v10, v11
; SI-NEXT:    s_addc_u32 s3, s1, 0
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
; SI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
; SI-NEXT:    v_mov_b32_e32 v17, s3
; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v7
; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
; SI-NEXT:    v_cvt_f32_f16_e32 v7, v20
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v12, v5
; SI-NEXT:    v_mov_b32_e32 v16, s2
; SI-NEXT:    s_add_u32 s2, s0, 0x50
; SI-NEXT:    s_addc_u32 s3, s1, 0
; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v6
; SI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
; SI-NEXT:    s_add_u32 s0, s0, 64
; SI-NEXT:    flat_store_dwordx4 v[14:15], v[0:3]
; SI-NEXT:    s_addc_u32 s1, s1, 0
; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v21
; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v7
; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v12
; SI-NEXT:    v_mov_b32_e32 v19, s3
; SI-NEXT:    v_mov_b32_e32 v13, s1
; SI-NEXT:    v_mov_b32_e32 v18, s2
; SI-NEXT:    v_mov_b32_e32 v12, s0
; SI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
; SI-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
; SI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: global_extload_v16f16_to_v16f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
; VI-NEXT:    s_add_u32 s2, s2, 16
; VI-NEXT:    s_addc_u32 s3, s3, 0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    s_add_u32 s2, s0, 48
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v14, s3
; VI-NEXT:    v_mov_b32_e32 v13, s2
; VI-NEXT:    s_add_u32 s2, s0, 32
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v16, s3
; VI-NEXT:    v_mov_b32_e32 v15, s2
; VI-NEXT:    s_add_u32 s2, s0, 16
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v18, s3
; VI-NEXT:    v_mov_b32_e32 v17, s2
; VI-NEXT:    s_add_u32 s2, s0, 0x70
; VI-NEXT:    v_mov_b32_e32 v12, s1
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v11, s0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_f16_e32 v8, v7
; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f64_f32_e32 v[7:8], v8
; VI-NEXT:    v_cvt_f64_f32_e32 v[9:10], v9
; VI-NEXT:    flat_store_dwordx4 v[13:14], v[7:10]
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_f16_e32 v10, v0
; VI-NEXT:    v_mov_b32_e32 v14, s3
; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
; VI-NEXT:    v_mov_b32_e32 v13, s2
; VI-NEXT:    s_add_u32 s2, s0, 0x60
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    flat_store_dwordx4 v[15:16], v[6:9]
; VI-NEXT:    v_mov_b32_e32 v16, s3
; VI-NEXT:    v_cvt_f32_f16_e32 v6, v5
; VI-NEXT:    v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_e32 v8, v4
; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
; VI-NEXT:    v_mov_b32_e32 v15, s2
; VI-NEXT:    s_add_u32 s2, s0, 0x50
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    flat_store_dwordx4 v[17:18], v[4:7]
; VI-NEXT:    v_cvt_f32_f16_sdwa v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v8
; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v9
; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_e32 v0, v3
; VI-NEXT:    s_add_u32 s0, s0, 64
; VI-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
; VI-NEXT:    v_cvt_f32_f16_sdwa v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f64_f32_e32 v[5:6], v8
; VI-NEXT:    v_cvt_f32_f16_e32 v8, v2
; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_e32 v7, v1
; VI-NEXT:    v_cvt_f64_f32_e32 v[3:4], v0
; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v10
; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
; VI-NEXT:    flat_store_dwordx4 v[13:14], v[3:6]
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v12
; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
; VI-NEXT:    v_mov_b32_e32 v20, s3
; VI-NEXT:    v_mov_b32_e32 v13, s1
; VI-NEXT:    v_mov_b32_e32 v19, s2
; VI-NEXT:    v_mov_b32_e32 v12, s0
; VI-NEXT:    flat_store_dwordx4 v[15:16], v[8:11]
; VI-NEXT:    flat_store_dwordx4 v[19:20], v[4:7]
; VI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
; VI-NEXT:    s_endpgm
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x double>
  store <16 x double> %cvt, <16 x double> addrspace(1)* %out
  ret void
}

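; fptrunc f32 -> f16 is a single v_cvt_f16_f32 on both targets.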
define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
; GCN-LABEL: global_truncstore_f32_to_f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_dword v0, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    flat_store_short v[0:1], v2
; GCN-NEXT:    s_endpgm
  %val = load float, float addrspace(1)* %in
  %cvt = fptrunc float %val to half
  store half %cvt, half addrspace(1)* %out
  ret void
}

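; SI packs the two f16 results with a shift and an or; VI converts the high
; element straight into the high word with SDWA.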
define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v2f32_to_v2f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v2, v3, v2
; SI-NEXT:    flat_store_dword v[0:1], v2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: global_truncstore_v2f32_to_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v3, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_or_b32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %val = load <2 x float>, <2 x float> addrspace(1)* %in
  %cvt = fptrunc <2 x float> %val to <2 x half>
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out
  ret void
}

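; The low two elements pack into a dword store; the third element is stored
; separately with a short store at offset 4.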
define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v3f32_to_v3f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
; SI-NEXT:    s_add_u32 s2, s0, 4
; SI-NEXT:    s_addc_u32 s3, s1, 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v4, v0
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    flat_store_short v[0:1], v2
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_or_b32_e32 v2, v4, v3
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    flat_store_dword v[0:1], v2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: global_truncstore_v3f32_to_v3f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
; VI-NEXT:    s_add_u32 s2, s0, 4
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v4, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_or_b32_e32 v3, v4, v3
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v3
; VI-NEXT:    s_endpgm
  %val = load <3 x float>, <3 x float> addrspace(1)* %in
  %cvt = fptrunc <3 x float> %val to <3 x half>
  store <3 x half> %cvt, <3 x half> addrspace(1)* %out
  ret void
}

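; Four f16 results pack into two dwords for a single dwordx2 store.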
define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v4f32_to_v4f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; SI-NEXT:    v_mov_b32_e32 v4, s0
; SI-NEXT:    v_mov_b32_e32 v5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v2, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v6
; SI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: global_truncstore_v4f32_to_v4f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; VI-NEXT:    v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v5, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_or_b32_e32 v3, v2, v3
; VI-NEXT:    v_or_b32_e32 v2, v5, v4
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %cvt = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %cvt, <4 x half> addrspace(1)* %out
  ret void
}

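; Eight f16 results pack into four dwords for a single dwordx4 store.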
define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v8f32_to_v8f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    s_add_u32 s2, s2, 16
; SI-NEXT:    s_addc_u32 s3, s3, 0
; SI-NEXT:    v_mov_b32_e32 v5, s3
; SI-NEXT:    v_mov_b32_e32 v4, s2
; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; SI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; SI-NEXT:    v_mov_b32_e32 v8, s0
; SI-NEXT:    v_mov_b32_e32 v9, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT:    v_or_b32_e32 v1, v2, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v10
; SI-NEXT:    v_or_b32_e32 v3, v6, v7
; SI-NEXT:    v_or_b32_e32 v2, v4, v5
; SI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: global_truncstore_v8f32_to_v8f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    s_add_u32 s2, s2, 16
; VI-NEXT:    s_addc_u32 s3, s3, 0
; VI-NEXT:    v_mov_b32_e32 v5, s3
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v8, s0
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; VI-NEXT:    v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
; VI-NEXT:    v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v4, v4
; VI-NEXT:    v_or_b32_e32 v1, v2, v3
; VI-NEXT:    v_or_b32_e32 v0, v0, v10
; VI-NEXT:    v_or_b32_e32 v3, v6, v7
; VI-NEXT:    v_or_b32_e32 v2, v4, v5
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT:    s_endpgm
  %val = load <8 x float>, <8 x float> addrspace(1)* %in
  %cvt = fptrunc <8 x float> %val to <8 x half>
  store <8 x half> %cvt, <8 x half> addrspace(1)* %out
  ret void
}

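; Sixteen f16 results from four dwordx4 loads pack into two dwordx4 stores.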
define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v16f32_to_v16f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_add_u32 s4, s2, 32
; SI-NEXT:    s_addc_u32 s5, s3, 0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    s_add_u32 s4, s2, 48
; SI-NEXT:    s_addc_u32 s5, s3, 0
; SI-NEXT:    v_mov_b32_e32 v9, s3
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_mov_b32_e32 v8, s2
; SI-NEXT:    s_add_u32 s2, s2, 16
; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; SI-NEXT:    v_mov_b32_e32 v5, s5
; SI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; SI-NEXT:    s_addc_u32 s3, s3, 0
; SI-NEXT:    v_mov_b32_e32 v13, s3
; SI-NEXT:    v_mov_b32_e32 v12, s2
; SI-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
; SI-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
; SI-NEXT:    s_add_u32 s2, s0, 16
; SI-NEXT:    s_addc_u32 s3, s1, 0
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
; SI-NEXT:    v_cvt_f16_f32_e32 v16, v5
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
; SI-NEXT:    v_cvt_f16_f32_e32 v17, v4
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_mov_b32_e32 v5, s3
; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v2, v3
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v16
; SI-NEXT:    v_mov_b32_e32 v4, s2
; SI-NEXT:    v_or_b32_e32 v0, v0, v18
; SI-NEXT:    v_or_b32_e32 v3, v6, v2
; SI-NEXT:    v_or_b32_e32 v2, v17, v7
; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v15
; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v13
; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT:    v_mov_b32_e32 v5, s1
; SI-NEXT:    v_or_b32_e32 v1, v10, v6
; SI-NEXT:    v_or_b32_e32 v0, v8, v7
; SI-NEXT:    v_or_b32_e32 v3, v14, v9
; SI-NEXT:    v_or_b32_e32 v2, v12, v11
; SI-NEXT:    v_mov_b32_e32 v4, s0
; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: global_truncstore_v16f32_to_v16f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_add_u32 s4, s2, 32
; VI-NEXT:    s_addc_u32 s5, s3, 0
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_add_u32 s4, s2, 48
; VI-NEXT:    s_addc_u32 s5, s3, 0
; VI-NEXT:    v_mov_b32_e32 v9, s3
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_mov_b32_e32 v8, s2
; VI-NEXT:    s_add_u32 s2, s2, 16
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    s_addc_u32 s3, s3, 0
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v13, s3
; VI-NEXT:    v_mov_b32_e32 v12, s2
; VI-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
; VI-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
; VI-NEXT:    s_add_u32 s2, s0, 16
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; VI-NEXT:    v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
; VI-NEXT:    v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v18, v4
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v10, v10
; VI-NEXT:    v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v8, v8
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f16_f32_sdwa v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v14, v14
; VI-NEXT:    v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v12, v12
; VI-NEXT:    v_mov_b32_e32 v5, s3
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    v_or_b32_e32 v1, v2, v3
; VI-NEXT:    v_or_b32_e32 v0, v0, v16
; VI-NEXT:    v_or_b32_e32 v3, v6, v7
; VI-NEXT:    v_or_b32_e32 v2, v18, v17
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_or_b32_e32 v1, v10, v11
; VI-NEXT:    v_or_b32_e32 v0, v8, v9
; VI-NEXT:    v_or_b32_e32 v3, v14, v15
; VI-NEXT:    v_or_b32_e32 v2, v12, v13
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %val = load <16 x float>, <16 x float> addrspace(1)* %in
  %cvt = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %cvt, <16 x half> addrspace(1)* %out
  ret void
}

; FIXME: Unsafe math should fold conversions away
define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
; SI-LABEL: fadd_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[4:5], 0x2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; SI-NEXT:    s_lshr_b32 s0, s0, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    v_add_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    flat_store_short v[0:1], v2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fadd_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_f16_e32 v2, s2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %add = fadd half %a, %b
  store half %add, half addrspace(1)* %out, align 4
  ret void
}

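; SI promotes both elements to f32 and adds there; VI adds in f16, using one
; SDWA add for the high halves.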
define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
; SI-LABEL: fadd_v2f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[4:5], 0x2
; SI-NEXT:    s_load_dword s1, s[4:5], 0x3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s2, s0, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; SI-NEXT:    s_lshr_b32 s0, s1, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    v_add_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_add_f32_e32 v1, v2, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v2, v0, v1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    flat_store_dword v[0:1], v2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fadd_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0xc
; VI-NEXT:    s_load_dword s3, s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s4, s2, 16
; VI-NEXT:    s_lshr_b32 s5, s3, 16
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_add_f16_e32 v1, s3, v1
; VI-NEXT:    v_or_b32_e32 v2, v1, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %add = fadd <2 x half> %a, %b
  store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
  ret void
}

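; Operands come from memory here; VI still handles each dword with one
; v_add_f16 and one SDWA add.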
define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
; SI-LABEL: fadd_v4f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    s_add_u32 s2, s2, 8
; SI-NEXT:    s_addc_u32 s3, s3, 0
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; SI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; SI-NEXT:    v_mov_b32_e32 v4, s0
; SI-NEXT:    v_mov_b32_e32 v5, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v8, v2
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v9, v3
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_add_f32_e32 v7, v7, v9
; SI-NEXT:    v_add_f32_e32 v6, v6, v8
; SI-NEXT:    v_add_f32_e32 v1, v1, v3
; SI-NEXT:    v_add_f32_e32 v0, v0, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v7
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v6
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v1, v2, v1
; SI-NEXT:    v_or_b32_e32 v0, v3, v0
; SI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fadd_v4f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_add_u32 s4, s2, 8
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    s_addc_u32 s5, s3, 0
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f16_sdwa v6, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_f16_e32 v1, v1, v3
; VI-NEXT:    v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_f16_e32 v0, v0, v2
; VI-NEXT:    v_or_b32_e32 v1, v1, v6
; VI-NEXT:    v_or_b32_e32 v0, v0, v3
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT:    s_endpgm
  %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
  %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
  %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
  %result = fadd <4 x half> %a, %b
  store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
  ret void
}

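; Same per-dword pattern widened to eight elements passed as kernel arguments.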
define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
; SI-LABEL: fadd_v8f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x8
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s10, s0, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v4, s0
; SI-NEXT:    s_lshr_b32 s0, s4, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v8, s0
; SI-NEXT:    s_lshr_b32 s0, s5, 16
; SI-NEXT:    s_lshr_b32 s11, s1, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s10
; SI-NEXT:    s_lshr_b32 s10, s2, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v9, s0
; SI-NEXT:    s_lshr_b32 s0, s6, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s11
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s10
; SI-NEXT:    s_lshr_b32 s10, s3, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v10, s0
; SI-NEXT:    s_lshr_b32 s0, s7, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s10
; SI-NEXT:    v_cvt_f32_f16_e32 v5, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v11, s0
; SI-NEXT:    v_cvt_f32_f16_e32 v12, s4
; SI-NEXT:    v_cvt_f32_f16_e32 v13, s5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, s2
; SI-NEXT:    v_cvt_f32_f16_e32 v7, s3
; SI-NEXT:    v_cvt_f32_f16_e32 v14, s7
; SI-NEXT:    v_cvt_f32_f16_e32 v15, s6
; SI-NEXT:    v_add_f32_e32 v1, v1, v9
; SI-NEXT:    v_add_f32_e32 v0, v0, v8
; SI-NEXT:    v_add_f32_e32 v3, v3, v11
; SI-NEXT:    v_add_f32_e32 v2, v2, v10
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_add_f32_e32 v5, v5, v13
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_add_f32_e32 v4, v4, v12
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_add_f32_e32 v7, v7, v14
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_add_f32_e32 v6, v6, v15
; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v1, v5, v1
; SI-NEXT:    v_or_b32_e32 v0, v4, v0
; SI-NEXT:    v_mov_b32_e32 v4, s8
; SI-NEXT:    v_or_b32_e32 v3, v7, v3
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_mov_b32_e32 v5, s9
; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fadd_v8f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s6, s3, 16
; VI-NEXT:    s_lshr_b32 s7, s11, 16
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_f16_e32 v1, s11, v1
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    s_lshr_b32 s6, s10, 16
; VI-NEXT:    v_or_b32_e32 v3, v1, v0
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_mov_b32_e32 v1, s6
; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_add_f16_e32 v1, s10, v1
; VI-NEXT:    s_lshr_b32 s2, s1, 16
; VI-NEXT:    s_lshr_b32 s3, s9, 16
; VI-NEXT:    v_or_b32_e32 v2, v1, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_f16_e32 v1, s9, v1
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    s_lshr_b32 s2, s8, 16
; VI-NEXT:    v_or_b32_e32 v1, v1, v0
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_add_f16_e32 v4, s8, v4
; VI-NEXT:    v_or_b32_e32 v0, v4, v0
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %add = fadd <8 x half> %a, %b
  store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
  ret void
}

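; Bitcasts between half and i16 should be free: just a ushort load and a short
; store, with no conversion.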
define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
; GCN-LABEL: test_bitcast_from_half:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    flat_load_ushort v2, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    flat_store_short v[0:1], v2
; GCN-NEXT:    s_endpgm
  %val = load half, half addrspace(1)* %in
  %val_int = bitcast half %val to i16
  store i16 %val_int, i16 addrspace(1)* %out
  ret void
}

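; Same in the other direction; no conversion instructions are emitted.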
define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
; GCN-LABEL: test_bitcast_to_half:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_ushort v2, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    flat_store_short v[0:1], v2
; GCN-NEXT:    s_endpgm
  %val = load i16, i16 addrspace(1)* %in
  %val_fp = bitcast i16 %val to half
  store half %val_fp, half addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }
