; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s

; half args should be promoted to float for CI and lower.

define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
; CI-LABEL: load_f16_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    flat_store_short v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: load_f16_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  store half %arg, half addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
; CI-LABEL: load_v2f16_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: load_v2f16_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  store <2 x half> %arg, <2 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
; CI-LABEL: load_v3f16_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_add_u32 s4, s0, 4
; CI-NEXT:    s_addc_u32 s5, s1, 0
; CI-NEXT:    v_mov_b32_e32 v2, s4
; CI-NEXT:    v_mov_b32_e32 v4, s3
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v3, s5
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v5, s2
; CI-NEXT:    flat_store_short v[2:3], v4
; CI-NEXT:    flat_store_dword v[0:1], v5
; CI-NEXT:    s_endpgm
;
; VI-LABEL: load_v3f16_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_add_u32 s4, s0, 4
; VI-NEXT:    s_addc_u32 s5, s1, 0
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_mov_b32_e32 v4, s3
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v5, s2
; VI-NEXT:    flat_store_short v[2:3], v4
; VI-NEXT:    flat_store_dword v[0:1], v5
; VI-NEXT:    s_endpgm
  store <3 x half> %arg, <3 x half> addrspace(1)* %out
  ret void
}


; FIXME: Why not one load?
define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
; CI-LABEL: load_v4f16_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v3, s3
; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: load_v4f16_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
  store <4 x half> %arg, <4 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
; CI-LABEL: load_v8f16_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v4, s6
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v5, s7
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    v_mov_b32_e32 v3, s3
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: load_v8f16_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v4, s6
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v5, s7
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  store <8 x half> %arg, <8 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
; CI-LABEL: extload_v2f16_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s3, s2, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; CI-NEXT:    v_mov_b32_e32 v3, s1
; CI-NEXT:    v_mov_b32_e32 v2, s0
; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: extload_v2f16_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %fpext = fpext <2 x half> %in to <2 x float>
  store <2 x float> %fpext, <2 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
; CI-LABEL: extload_f16_to_f32_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v2, s2
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: extload_f16_to_f32_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %ext = fpext half %arg to float
  store float %ext, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
; CI-LABEL: extload_v2f16_to_v2f32_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s3, s2, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; CI-NEXT:    v_mov_b32_e32 v3, s1
; CI-NEXT:    v_mov_b32_e32 v2, s0
; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: extload_v2f16_to_v2f32_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %ext = fpext <2 x half> %arg to <2 x float>
  store <2 x float> %ext, <2 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
; CI-LABEL: extload_v3f16_to_v3f32_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s4, s0, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v2, s1
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s4
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v4, s3
; CI-NEXT:    v_mov_b32_e32 v3, s2
; CI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: extload_v3f16_to_v3f32_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s4, s0, 16
; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; VI-NEXT:    v_cvt_f32_f16_e32 v1, s4
; VI-NEXT:    v_cvt_f32_f16_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v4, s3
; VI-NEXT:    v_mov_b32_e32 v3, s2
; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT:    s_endpgm
  %ext = fpext <3 x half> %arg to <3 x float>
  store <3 x float> %ext, <3 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
; CI-LABEL: extload_v4f16_to_v4f32_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s4, s1, 16
; CI-NEXT:    s_lshr_b32 s5, s0, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v2, s1
; CI-NEXT:    v_cvt_f32_f16_e32 v3, s4
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s5
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v5, s3
; CI-NEXT:    v_mov_b32_e32 v4, s2
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: extload_v4f16_to_v4f32_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s4, s1, 16
; VI-NEXT:    s_lshr_b32 s5, s0, 16
; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; VI-NEXT:    v_cvt_f32_f16_e32 v3, s4
; VI-NEXT:    v_cvt_f32_f16_e32 v1, s5
; VI-NEXT:    v_cvt_f32_f16_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v5, s3
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %ext = fpext <4 x half> %arg to <4 x float>
  store <4 x float> %ext, <4 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
; CI-LABEL: extload_v8f16_to_v8f32_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s6, s1, 16
; CI-NEXT:    s_lshr_b32 s7, s0, 16
; CI-NEXT:    s_lshr_b32 s8, s3, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v3, s6
; CI-NEXT:    s_lshr_b32 s6, s2, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v7, s8
; CI-NEXT:    v_cvt_f32_f16_e32 v5, s6
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; CI-NEXT:    v_cvt_f32_f16_e32 v6, s3
; CI-NEXT:    v_cvt_f32_f16_e32 v4, s2
; CI-NEXT:    s_add_u32 s0, s4, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v2, s1
; CI-NEXT:    s_addc_u32 s1, s5, 0
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s7
; CI-NEXT:    v_mov_b32_e32 v9, s1
; CI-NEXT:    v_mov_b32_e32 v8, s0
; CI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; CI-NEXT:    s_nop 0
; CI-NEXT:    v_mov_b32_e32 v4, s4
; CI-NEXT:    v_mov_b32_e32 v5, s5
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: extload_v8f16_to_v8f32_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s6, s1, 16
; VI-NEXT:    s_lshr_b32 s7, s0, 16
; VI-NEXT:    s_lshr_b32 s8, s3, 16
; VI-NEXT:    v_cvt_f32_f16_e32 v3, s6
; VI-NEXT:    s_lshr_b32 s6, s2, 16
; VI-NEXT:    v_cvt_f32_f16_e32 v7, s8
; VI-NEXT:    v_cvt_f32_f16_e32 v5, s6
; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; VI-NEXT:    v_cvt_f32_f16_e32 v6, s3
; VI-NEXT:    v_cvt_f32_f16_e32 v4, s2
; VI-NEXT:    s_add_u32 s0, s4, 16
; VI-NEXT:    v_cvt_f32_f16_e32 v2, s1
; VI-NEXT:    s_addc_u32 s1, s5, 0
; VI-NEXT:    v_cvt_f32_f16_e32 v1, s7
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    v_mov_b32_e32 v8, s0
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %ext = fpext <8 x half> %arg to <8 x float>
  store <8 x float> %ext, <8 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
; CI-LABEL: extload_f16_to_f64_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s0, s[4:5], 0x2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v3, s1
; CI-NEXT:    v_mov_b32_e32 v2, s0
; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: extload_f16_to_f64_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s0, s[4:5], 0x8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %ext = fpext half %arg to double
  store double %ext, double addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
; CI-LABEL: extload_v2f16_to_v2f64_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s0, s[4:5], 0x2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s1, s0, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s1
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v5, s1
; CI-NEXT:    v_mov_b32_e32 v4, s0
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: extload_v2f16_to_v2f64_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s0, s[4:5], 0x8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_cvt_f32_f16_e32 v0, s1
; VI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %ext = fpext <2 x half> %arg to <2 x double>
  store <2 x double> %ext, <2 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
; CI-LABEL: extload_v3f16_to_v3f64_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s1
; CI-NEXT:    s_lshr_b32 s4, s0, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; CI-NEXT:    v_cvt_f32_f16_e32 v2, s4
; CI-NEXT:    s_add_u32 s0, s2, 16
; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v0
; CI-NEXT:    s_addc_u32 s1, s3, 0
; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT:    v_mov_b32_e32 v7, s1
; CI-NEXT:    v_mov_b32_e32 v6, s0
; CI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
; CI-NEXT:    v_mov_b32_e32 v5, s3
; CI-NEXT:    v_mov_b32_e32 v4, s2
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: extload_v3f16_to_v3f64_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v1, s1
; VI-NEXT:    s_lshr_b32 s4, s0, 16
; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; VI-NEXT:    v_cvt_f32_f16_e32 v2, s4
; VI-NEXT:    s_add_u32 s0, s2, 16
; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
; VI-NEXT:    s_addc_u32 s1, s3, 0
; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; VI-NEXT:    v_mov_b32_e32 v7, s1
; VI-NEXT:    v_mov_b32_e32 v6, s0
; VI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v5, s3
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %ext = fpext <3 x half> %arg to <3 x double>
  store <3 x double> %ext, <3 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
; CI-LABEL: extload_v4f16_to_v4f64_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s4, s1, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v4, s4
; CI-NEXT:    v_cvt_f32_f16_e32 v5, s1
; CI-NEXT:    s_lshr_b32 s5, s0, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; CI-NEXT:    v_cvt_f32_f16_e32 v2, s5
; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
; CI-NEXT:    s_add_u32 s0, s2, 16
; CI-NEXT:    s_addc_u32 s1, s3, 0
; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT:    v_mov_b32_e32 v9, s1
; CI-NEXT:    v_mov_b32_e32 v8, s0
; CI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; CI-NEXT:    s_nop 0
; CI-NEXT:    v_mov_b32_e32 v5, s3
; CI-NEXT:    v_mov_b32_e32 v4, s2
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: extload_v4f16_to_v4f64_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s5, s1, 16
; VI-NEXT:    v_cvt_f32_f16_e32 v4, s5
; VI-NEXT:    v_cvt_f32_f16_e32 v5, s1
; VI-NEXT:    s_lshr_b32 s4, s0, 16
; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; VI-NEXT:    v_cvt_f32_f16_e32 v2, s4
; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
; VI-NEXT:    s_add_u32 s0, s2, 16
; VI-NEXT:    s_addc_u32 s1, s3, 0
; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    v_mov_b32_e32 v8, s0
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v5, s3
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %ext = fpext <4 x half> %arg to <4 x double>
  store <4 x double> %ext, <4 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
; CI-LABEL: extload_v8f16_to_v8f64_arg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s6, s3, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s6
; CI-NEXT:    v_cvt_f32_f16_e32 v12, s3
; CI-NEXT:    s_lshr_b32 s7, s2, 16
; CI-NEXT:    s_lshr_b32 s8, s1, 16
; CI-NEXT:    s_lshr_b32 s6, s0, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s7
; CI-NEXT:    v_cvt_f32_f16_e32 v8, s2
; CI-NEXT:    v_cvt_f32_f16_e32 v9, s0
; CI-NEXT:    s_add_u32 s0, s4, 48
; CI-NEXT:    v_cvt_f32_f16_e32 v5, s1
; CI-NEXT:    v_cvt_f64_f32_e32 v[14:15], v0
; CI-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
; CI-NEXT:    s_addc_u32 s1, s5, 0
; CI-NEXT:    v_cvt_f32_f16_e32 v4, s8
; CI-NEXT:    v_mov_b32_e32 v17, s1
; CI-NEXT:    v_mov_b32_e32 v16, s0
; CI-NEXT:    s_add_u32 s0, s4, 32
; CI-NEXT:    v_cvt_f32_f16_e32 v2, s6
; CI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v1
; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
; CI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
; CI-NEXT:    s_addc_u32 s1, s5, 0
; CI-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
; CI-NEXT:    v_mov_b32_e32 v13, s1
; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
; CI-NEXT:    v_mov_b32_e32 v12, s0
; CI-NEXT:    s_add_u32 s0, s4, 16
; CI-NEXT:    s_addc_u32 s1, s5, 0
; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; CI-NEXT:    s_nop 0
; CI-NEXT:    v_mov_b32_e32 v9, s1
; CI-NEXT:    v_mov_b32_e32 v8, s0
; CI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; CI-NEXT:    s_nop 0
; CI-NEXT:    v_mov_b32_e32 v4, s4
; CI-NEXT:    v_mov_b32_e32 v5, s5
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: extload_v8f16_to_v8f64_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s6, s0, 16
; VI-NEXT:    s_lshr_b32 s8, s2, 16
; VI-NEXT:    s_lshr_b32 s9, s3, 16
; VI-NEXT:    v_cvt_f32_f16_e32 v0, s6
; VI-NEXT:    v_cvt_f32_f16_e32 v4, s8
; VI-NEXT:    v_cvt_f32_f16_e32 v5, s9
; VI-NEXT:    v_cvt_f32_f16_e32 v12, s3
; VI-NEXT:    s_lshr_b32 s7, s1, 16
; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; VI-NEXT:    v_cvt_f32_f16_e32 v8, s2
; VI-NEXT:    s_add_u32 s0, s4, 48
; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v4
; VI-NEXT:    v_cvt_f64_f32_e32 v[14:15], v5
; VI-NEXT:    v_cvt_f32_f16_e32 v4, s1
; VI-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
; VI-NEXT:    s_addc_u32 s1, s5, 0
; VI-NEXT:    v_cvt_f32_f16_e32 v1, s7
; VI-NEXT:    v_mov_b32_e32 v17, s1
; VI-NEXT:    v_mov_b32_e32 v16, s0
; VI-NEXT:    s_add_u32 s0, s4, 32
; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
; VI-NEXT:    s_addc_u32 s1, s5, 0
; VI-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v1
; VI-NEXT:    v_mov_b32_e32 v13, s1
; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
; VI-NEXT:    v_mov_b32_e32 v12, s0
; VI-NEXT:    s_add_u32 s0, s4, 16
; VI-NEXT:    s_addc_u32 s1, s5, 0
; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    v_mov_b32_e32 v8, s0
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %ext = fpext <8 x half> %arg to <8 x double>
  store <8 x double> %ext, <8 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
; GCN-LABEL: global_load_store_f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_ushort v2, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    flat_store_short v[0:1], v2
; GCN-NEXT:    s_endpgm
  %val = load half, half addrspace(1)* %in
  store half %val, half addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; GCN-LABEL: global_load_store_v2f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_dword v2, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  store <2 x half> %val, <2 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
; GCN-LABEL: global_load_store_v4f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    v_mov_b32_e32 v3, s3
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT:    s_endpgm
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  store <4 x half> %val, <4 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
; GCN-LABEL: global_load_store_v8f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v4, s0
; GCN-NEXT:    v_mov_b32_e32 v5, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  store <8 x half> %val, <8 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
; GCN-LABEL: global_extload_f16_to_f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_ushort v0, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v2f16_to_v2f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s2
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    flat_load_dword v1, v[0:1]
; CI-NEXT:    v_mov_b32_e32 v2, s0
; CI-NEXT:    v_mov_b32_e32 v3, s1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v1
; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: global_extload_v2f16_to_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dword v1, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x float>
  store <2 x float> %cvt, <2 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v3f16_to_v3f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s2
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
; CI-NEXT:    v_mov_b32_e32 v3, s0
; CI-NEXT:    v_mov_b32_e32 v4, s1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v1
; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: global_extload_v3f16_to_v3f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    v_mov_b32_e32 v4, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT:    s_endpgm
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x float>
  store <3 x float> %cvt, <3 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v4f16_to_v4f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s2
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    flat_load_dwordx2 v[3:4], v[0:1]
; CI-NEXT:    v_mov_b32_e32 v5, s1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v4
; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v3
; CI-NEXT:    v_cvt_f32_f16_e32 v3, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v4
; CI-NEXT:    v_mov_b32_e32 v4, s0
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: global_extload_v4f16_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v0, v4
; VI-NEXT:    v_cvt_f32_f16_e32 v2, v5
; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v8f16_to_v8f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s2
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT:    s_add_u32 s2, s0, 16
; CI-NEXT:    s_addc_u32 s3, s1, 0
; CI-NEXT:    v_mov_b32_e32 v13, s1
; CI-NEXT:    v_mov_b32_e32 v12, s0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v10, v3
; CI-NEXT:    v_cvt_f32_f16_e32 v8, v2
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_cvt_f32_f16_e32 v6, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v11, v3
; CI-NEXT:    v_cvt_f32_f16_e32 v9, v2
; CI-NEXT:    v_cvt_f32_f16_e32 v7, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v5, v0
; CI-NEXT:    v_mov_b32_e32 v0, s2
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
; CI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: global_extload_v8f16_to_v8f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    s_add_u32 s2, s0, 16
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v13, s1
; VI-NEXT:    v_mov_b32_e32 v12, s0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v10, v3
; VI-NEXT:    v_cvt_f32_f16_e32 v8, v2
; VI-NEXT:    v_cvt_f32_f16_sdwa v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_e32 v6, v1
; VI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; VI-NEXT:    v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
; VI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
; VI-NEXT:    s_endpgm
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x float>
  store <8 x float> %cvt, <8 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v16f16_to_v16f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_add_u32 s4, s2, 16
; CI-NEXT:    v_mov_b32_e32 v5, s3
; CI-NEXT:    s_addc_u32 s5, s3, 0
; CI-NEXT:    v_mov_b32_e32 v0, s4
; CI-NEXT:    v_mov_b32_e32 v4, s2
; CI-NEXT:    v_mov_b32_e32 v1, s5
; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT:    s_add_u32 s2, s0, 16
; CI-NEXT:    s_addc_u32 s3, s1, 0
; CI-NEXT:    v_mov_b32_e32 v14, s3
; CI-NEXT:    v_mov_b32_e32 v13, s2
; CI-NEXT:    s_add_u32 s2, s0, 48
; CI-NEXT:    s_addc_u32 s3, s1, 0
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_cvt_f32_f16_e32 v8, v1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v11, v7
; CI-NEXT:    v_cvt_f32_f16_e32 v9, v6
; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
; CI-NEXT:    v_cvt_f32_f16_e32 v12, v7
; CI-NEXT:    v_cvt_f32_f16_e32 v10, v6
; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
; CI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
; CI-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
; CI-NEXT:    v_cvt_f32_f16_e32 v6, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v12, v3
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v10, v2
; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v5
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v4
; CI-NEXT:    v_mov_b32_e32 v5, s1
; CI-NEXT:    v_cvt_f32_f16_e32 v9, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v13, v3
; CI-NEXT:    v_cvt_f32_f16_e32 v3, v16
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v17
; CI-NEXT:    v_mov_b32_e32 v4, s0
; CI-NEXT:    s_add_u32 s0, s0, 32
; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
; CI-NEXT:    s_addc_u32 s1, s1, 0
; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; CI-NEXT:    v_mov_b32_e32 v15, s3
; CI-NEXT:    v_mov_b32_e32 v17, s1
; CI-NEXT:    v_mov_b32_e32 v14, s2
; CI-NEXT:    v_mov_b32_e32 v16, s0
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
; CI-NEXT:    flat_store_dwordx4 v[16:17], v[6:9]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: global_extload_v16f16_to_v16f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    s_add_u32 s2, s2, 16
; VI-NEXT:    s_addc_u32 s3, s3, 0
; VI-NEXT:    v_mov_b32_e32 v5, s3
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT:    s_add_u32 s2, s0, 16
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v19, s3
; VI-NEXT:    v_mov_b32_e32 v18, s2
; VI-NEXT:    s_add_u32 s2, s0, 48
; VI-NEXT:    v_mov_b32_e32 v17, s1
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v16, s0
; VI-NEXT:    s_add_u32 s0, s0, 32
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    v_mov_b32_e32 v21, s3
; VI-NEXT:    v_mov_b32_e32 v20, s2
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_f16_e32 v14, v3
; VI-NEXT:    v_cvt_f32_f16_e32 v12, v2
; VI-NEXT:    v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_e32 v10, v1
; VI-NEXT:    v_cvt_f32_f16_e32 v8, v0
; VI-NEXT:    v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_f16_e32 v2, v5
; VI-NEXT:    v_cvt_f32_f16_e32 v14, v7
; VI-NEXT:    v_cvt_f32_f16_e32 v12, v6
; VI-NEXT:    v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_e32 v0, v4
; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
; VI-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x float>
  store <16 x float> %cvt, <16 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
; GCN-LABEL: global_extload_f16_to_f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_ushort v0, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s0
; GCN-NEXT:    v_mov_b32_e32 v3, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT:    s_endpgm
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to double
  store double %cvt, double addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v2f16_to_v2f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s2
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    flat_load_dword v0, v[0:1]
; CI-NEXT:    v_mov_b32_e32 v4, s0
; CI-NEXT:    v_mov_b32_e32 v5, s1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v1
; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: global_extload_v2f16_to_v2f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x double>
  store <2 x double> %cvt, <2 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v3f16_to_v3f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s2
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT:    s_add_u32 s2, s0, 16
; CI-NEXT:    s_addc_u32 s3, s1, 0
; CI-NEXT:    v_mov_b32_e32 v7, s3
; CI-NEXT:    v_mov_b32_e32 v6, s2
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
; CI-NEXT:    v_mov_b32_e32 v5, s1
; CI-NEXT:    v_mov_b32_e32 v4, s0
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: global_extload_v3f16_to_v3f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_add_u32 s2, s0, 16
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v3, v1
; VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v3
; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
; VI-NEXT:    v_mov_b32_e32 v9, s3
; VI-NEXT:    v_mov_b32_e32 v8, s2
; VI-NEXT:    flat_store_dwordx2 v[8:9], v[6:7]
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x double>
  store <3 x double> %cvt, <3 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v4f16_to_v4f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s2
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT:    s_add_u32 s2, s0, 16
; CI-NEXT:    s_addc_u32 s3, s1, 0
; CI-NEXT:    v_mov_b32_e32 v9, s1
; CI-NEXT:    v_mov_b32_e32 v8, s0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v3, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; CI-NEXT:    v_cvt_f32_f16_e32 v10, v0
; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v2
; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v10
; CI-NEXT:    v_mov_b32_e32 v11, s3
; CI-NEXT:    v_mov_b32_e32 v10, s2
; CI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: global_extload_v4f16_to_v4f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_add_u32 s2, s0, 16
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    v_mov_b32_e32 v8, s0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v3, v1
; VI-NEXT:    v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
; VI-NEXT:    v_cvt_f32_f16_sdwa v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v10
; VI-NEXT:    v_mov_b32_e32 v11, s3
; VI-NEXT:    v_mov_b32_e32 v10, s2
; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT:    s_endpgm
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x double>
  store <4 x double> %cvt, <4 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v8f16_to_v8f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s2
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT:    s_add_u32 s2, s0, 48
; CI-NEXT:    s_addc_u32 s3, s1, 0
; CI-NEXT:    v_mov_b32_e32 v7, s3
; CI-NEXT:    v_mov_b32_e32 v6, s2
; CI-NEXT:    s_add_u32 s2, s0, 32
; CI-NEXT:    v_mov_b32_e32 v13, s1
; CI-NEXT:    s_addc_u32 s3, s1, 0
; CI-NEXT:    v_mov_b32_e32 v12, s0
; CI-NEXT:    s_add_u32 s0, s0, 16
; CI-NEXT:    v_mov_b32_e32 v15, s3
; CI-NEXT:    s_addc_u32 s1, s1, 0
; CI-NEXT:    v_mov_b32_e32 v14, s2
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; CI-NEXT:    v_cvt_f32_f16_e32 v8, v2
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v4
; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v10, v1
; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v16, v5
; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT:    v_cvt_f32_f16_e32 v17, v9
; CI-NEXT:    v_cvt_f32_f16_e32 v18, v11
; CI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
; CI-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v10
; CI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v16
; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v18
; CI-NEXT:    v_mov_b32_e32 v17, s1
; CI-NEXT:    v_mov_b32_e32 v16, s0
; CI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
; CI-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
; CI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: global_extload_v8f16_to_v8f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    s_add_u32 s2, s0, 48
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v8, s3
; VI-NEXT:    v_mov_b32_e32 v7, s2
; VI-NEXT:    s_add_u32 s2, s0, 32
; VI-NEXT:    v_mov_b32_e32 v13, s1
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v12, s0
; VI-NEXT:    s_add_u32 s0, s0, 16
; VI-NEXT:    v_mov_b32_e32 v15, s3
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    v_mov_b32_e32 v14, s2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v9, v0
; VI-NEXT:    v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_e32 v0, v3
; VI-NEXT:    v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_e32 v10, v1
; VI-NEXT:    v_cvt_f32_f16_e32 v11, v2
; VI-NEXT:    v_cvt_f64_f32_e32 v[3:4], v0
; VI-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
; VI-NEXT:    flat_store_dwordx4 v[7:8], v[3:6]
; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v11
; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v17
; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v16
; VI-NEXT:    v_mov_b32_e32 v17, s1
; VI-NEXT:    v_mov_b32_e32 v16, s0
; VI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
; VI-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
; VI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
; VI-NEXT:    s_endpgm
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x double>
  store <8 x double> %cvt, <8 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v16f16_to_v16f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s2
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT:    s_add_u32 s2, s2, 16
; CI-NEXT:    s_addc_u32 s3, s3, 0
; CI-NEXT:    v_mov_b32_e32 v5, s3
; CI-NEXT:    v_mov_b32_e32 v4, s2
; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT:    s_add_u32 s2, s0, 48
; CI-NEXT:    s_addc_u32 s3, s1, 0
; CI-NEXT:    v_mov_b32_e32 v15, s3
; CI-NEXT:    v_mov_b32_e32 v14, s2
; CI-NEXT:    s_add_u32 s2, s0, 32
; CI-NEXT:    s_addc_u32 s3, s1, 0
; CI-NEXT:    v_mov_b32_e32 v17, s3
; CI-NEXT:    v_mov_b32_e32 v16, s2
; CI-NEXT:    s_add_u32 s2, s0, 16
; CI-NEXT:    s_addc_u32 s3, s1, 0
; CI-NEXT:    v_mov_b32_e32 v19, s3
; CI-NEXT:    v_mov_b32_e32 v18, s2
; CI-NEXT:    s_add_u32 s2, s0, 0x70
; CI-NEXT:    s_addc_u32 s3, s1, 0
; CI-NEXT:    v_mov_b32_e32 v13, s1
; CI-NEXT:    v_mov_b32_e32 v12, s0
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; CI-NEXT:    v_cvt_f32_f16_e32 v10, v8
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
; CI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v3
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; CI-NEXT:    v_cvt_f32_f16_e32 v21, v5
; CI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
; CI-NEXT:    v_mov_b32_e32 v15, s3
; CI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
; CI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v3
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; CI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
; CI-NEXT:    v_mov_b32_e32 v14, s2
; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v9, v0
; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
; CI-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; CI-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
; CI-NEXT:    v_cvt_f32_f16_e32 v8, v10
; CI-NEXT:    s_add_u32 s2, s0, 0x60
; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; CI-NEXT:    v_cvt_f32_f16_e32 v10, v11
; CI-NEXT:    s_addc_u32 s3, s1, 0
; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
; CI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
; CI-NEXT:    v_mov_b32_e32 v17, s3
; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v7
; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
; CI-NEXT:    v_cvt_f32_f16_e32 v7, v20
; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; CI-NEXT:    v_cvt_f32_f16_e32 v12, v5
; CI-NEXT:    v_mov_b32_e32 v16, s2
; CI-NEXT:    s_add_u32 s2, s0, 0x50
; CI-NEXT:    s_addc_u32 s3, s1, 0
; CI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v6
; CI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
; CI-NEXT:    s_add_u32 s0, s0, 64
; CI-NEXT:    flat_store_dwordx4 v[14:15], v[0:3]
; CI-NEXT:    s_addc_u32 s1, s1, 0
; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v21
; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v7
; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v12
; CI-NEXT:    v_mov_b32_e32 v19, s3
; CI-NEXT:    v_mov_b32_e32 v13, s1
; CI-NEXT:    v_mov_b32_e32 v18, s2
; CI-NEXT:    v_mov_b32_e32 v12, s0
; CI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
; CI-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
; CI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: global_extload_v16f16_to_v16f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
; VI-NEXT:    s_add_u32 s2, s2, 16
; VI-NEXT:    s_addc_u32 s3, s3, 0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    s_add_u32 s2, s0, 48
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v14, s3
; VI-NEXT:    v_mov_b32_e32 v13, s2
; VI-NEXT:    s_add_u32 s2, s0, 32
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v16, s3
; VI-NEXT:    v_mov_b32_e32 v15, s2
; VI-NEXT:    s_add_u32 s2, s0, 16
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v18, s3
; VI-NEXT:    v_mov_b32_e32 v17, s2
; VI-NEXT:    s_add_u32 s2, s0, 0x50
; VI-NEXT:    v_mov_b32_e32 v12, s1
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v11, s0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_f16_e32 v8, v7
; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f64_f32_e32 v[7:8], v8
; VI-NEXT:    v_cvt_f64_f32_e32 v[9:10], v9
; VI-NEXT:    flat_store_dwordx4 v[13:14], v[7:10]
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_f16_e32 v10, v2
; VI-NEXT:    v_mov_b32_e32 v14, s3
; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
; VI-NEXT:    v_mov_b32_e32 v13, s2
; VI-NEXT:    s_add_u32 s2, s0, 64
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    flat_store_dwordx4 v[15:16], v[6:9]
; VI-NEXT:    v_mov_b32_e32 v16, s3
; VI-NEXT:    v_cvt_f32_f16_e32 v6, v5
1456; VI-NEXT:    v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1457; VI-NEXT:    v_cvt_f32_f16_e32 v8, v4
1458; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1459; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
1460; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
1461; VI-NEXT:    v_mov_b32_e32 v15, s2
1462; VI-NEXT:    s_add_u32 s2, s0, 0x70
1463; VI-NEXT:    s_addc_u32 s3, s1, 0
1464; VI-NEXT:    flat_store_dwordx4 v[17:18], v[4:7]
1465; VI-NEXT:    v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1466; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v8
1467; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v9
1468; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1469; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1470; VI-NEXT:    v_cvt_f32_f16_e32 v2, v1
1471; VI-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
1472; VI-NEXT:    v_cvt_f32_f16_sdwa v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1473; VI-NEXT:    v_cvt_f32_f16_e32 v7, v3
1474; VI-NEXT:    v_cvt_f64_f32_e32 v[3:4], v9
1475; VI-NEXT:    v_cvt_f32_f16_e32 v9, v0
1476; VI-NEXT:    v_cvt_f64_f32_e32 v[1:2], v2
1477; VI-NEXT:    v_cvt_f64_f32_e32 v[5:6], v10
1478; VI-NEXT:    v_cvt_f64_f32_e32 v[11:12], v11
1479; VI-NEXT:    v_cvt_f64_f32_e32 v[9:10], v9
1480; VI-NEXT:    s_add_u32 s0, s0, 0x60
1481; VI-NEXT:    flat_store_dwordx4 v[13:14], v[1:4]
1482; VI-NEXT:    s_addc_u32 s1, s1, 0
1483; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v7
1484; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
1485; VI-NEXT:    v_cvt_f64_f32_e32 v[7:8], v8
1486; VI-NEXT:    v_mov_b32_e32 v20, s3
1487; VI-NEXT:    v_mov_b32_e32 v14, s1
1488; VI-NEXT:    v_mov_b32_e32 v19, s2
1489; VI-NEXT:    v_mov_b32_e32 v13, s0
1490; VI-NEXT:    flat_store_dwordx4 v[15:16], v[9:12]
1491; VI-NEXT:    flat_store_dwordx4 v[19:20], v[0:3]
1492; VI-NEXT:    flat_store_dwordx4 v[13:14], v[5:8]
1493; VI-NEXT:    s_endpgm
1494  %val = load <16 x half>, <16 x half> addrspace(1)* %in
1495  %cvt = fpext <16 x half> %val to <16 x double>
1496  store <16 x double> %cvt, <16 x double> addrspace(1)* %out
1497  ret void
1498}
1499
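; Scalar f32 -> f16 truncstore: the value is converted with v_cvt_f16_f32 and written with flat_store_short; CI and VI share the GCN checks.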
1500define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
1501; GCN-LABEL: global_truncstore_f32_to_f16:
1502; GCN:       ; %bb.0:
1503; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1504; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1505; GCN-NEXT:    v_mov_b32_e32 v0, s2
1506; GCN-NEXT:    v_mov_b32_e32 v1, s3
1507; GCN-NEXT:    flat_load_dword v0, v[0:1]
1508; GCN-NEXT:    v_mov_b32_e32 v1, s1
1509; GCN-NEXT:    s_waitcnt vmcnt(0)
1510; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v0
1511; GCN-NEXT:    v_mov_b32_e32 v0, s0
1512; GCN-NEXT:    flat_store_short v[0:1], v2
1513; GCN-NEXT:    s_endpgm
1514  %val = load float, float addrspace(1)* %in
1515  %cvt = fptrunc float %val to half
1516  store half %cvt, half addrspace(1)* %out
1517  ret void
1518}
1519
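; CI packs the two converted halves with v_lshlrev_b32 + v_or_b32; VI writes the high half directly using v_cvt_f16_f32_sdwa dst_sel:WORD_1.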
1520define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
1521; CI-LABEL: global_truncstore_v2f32_to_v2f16:
1522; CI:       ; %bb.0:
1523; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1524; CI-NEXT:    s_waitcnt lgkmcnt(0)
1525; CI-NEXT:    v_mov_b32_e32 v0, s2
1526; CI-NEXT:    v_mov_b32_e32 v1, s3
1527; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1528; CI-NEXT:    s_waitcnt vmcnt(0)
1529; CI-NEXT:    v_cvt_f16_f32_e32 v2, v1
1530; CI-NEXT:    v_cvt_f16_f32_e32 v3, v0
1531; CI-NEXT:    v_mov_b32_e32 v0, s0
1532; CI-NEXT:    v_mov_b32_e32 v1, s1
1533; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1534; CI-NEXT:    v_or_b32_e32 v2, v3, v2
1535; CI-NEXT:    flat_store_dword v[0:1], v2
1536; CI-NEXT:    s_endpgm
1537;
1538; VI-LABEL: global_truncstore_v2f32_to_v2f16:
1539; VI:       ; %bb.0:
1540; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1541; VI-NEXT:    s_waitcnt lgkmcnt(0)
1542; VI-NEXT:    v_mov_b32_e32 v0, s2
1543; VI-NEXT:    v_mov_b32_e32 v1, s3
1544; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1545; VI-NEXT:    s_waitcnt vmcnt(0)
1546; VI-NEXT:    v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1547; VI-NEXT:    v_cvt_f16_f32_e32 v3, v0
1548; VI-NEXT:    v_mov_b32_e32 v0, s0
1549; VI-NEXT:    v_mov_b32_e32 v1, s1
1550; VI-NEXT:    v_or_b32_e32 v2, v3, v2
1551; VI-NEXT:    flat_store_dword v[0:1], v2
1552; VI-NEXT:    s_endpgm
1553  %val = load <2 x float>, <2 x float> addrspace(1)* %in
1554  %cvt = fptrunc <2 x float> %val to <2 x half>
1555  store <2 x half> %cvt, <2 x half> addrspace(1)* %out
1556  ret void
1557}
1558
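; The <3 x half> result is stored as a dword for the first two elements plus a short at offset 4 for the third.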
1559define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
1560; CI-LABEL: global_truncstore_v3f32_to_v3f16:
1561; CI:       ; %bb.0:
1562; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1563; CI-NEXT:    s_waitcnt lgkmcnt(0)
1564; CI-NEXT:    v_mov_b32_e32 v0, s2
1565; CI-NEXT:    v_mov_b32_e32 v1, s3
1566; CI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
1567; CI-NEXT:    s_add_u32 s2, s0, 4
1568; CI-NEXT:    s_addc_u32 s3, s1, 0
1569; CI-NEXT:    s_waitcnt vmcnt(0)
1570; CI-NEXT:    v_cvt_f16_f32_e32 v3, v1
1571; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1572; CI-NEXT:    v_cvt_f16_f32_e32 v4, v0
1573; CI-NEXT:    v_mov_b32_e32 v0, s2
1574; CI-NEXT:    v_mov_b32_e32 v1, s3
1575; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1576; CI-NEXT:    flat_store_short v[0:1], v2
1577; CI-NEXT:    v_mov_b32_e32 v0, s0
1578; CI-NEXT:    v_or_b32_e32 v2, v4, v3
1579; CI-NEXT:    v_mov_b32_e32 v1, s1
1580; CI-NEXT:    flat_store_dword v[0:1], v2
1581; CI-NEXT:    s_endpgm
1582;
1583; VI-LABEL: global_truncstore_v3f32_to_v3f16:
1584; VI:       ; %bb.0:
1585; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1586; VI-NEXT:    s_waitcnt lgkmcnt(0)
1587; VI-NEXT:    v_mov_b32_e32 v0, s2
1588; VI-NEXT:    v_mov_b32_e32 v1, s3
1589; VI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
1590; VI-NEXT:    s_add_u32 s2, s0, 4
1591; VI-NEXT:    s_addc_u32 s3, s1, 0
1592; VI-NEXT:    s_waitcnt vmcnt(0)
1593; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1594; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1595; VI-NEXT:    v_cvt_f16_f32_e32 v4, v0
1596; VI-NEXT:    v_mov_b32_e32 v0, s2
1597; VI-NEXT:    v_mov_b32_e32 v1, s3
1598; VI-NEXT:    flat_store_short v[0:1], v2
1599; VI-NEXT:    v_mov_b32_e32 v0, s0
1600; VI-NEXT:    v_or_b32_e32 v3, v4, v3
1601; VI-NEXT:    v_mov_b32_e32 v1, s1
1602; VI-NEXT:    flat_store_dword v[0:1], v3
1603; VI-NEXT:    s_endpgm
1604  %val = load <3 x float>, <3 x float> addrspace(1)* %in
1605  %cvt = fptrunc <3 x float> %val to <3 x half>
1606  store <3 x half> %cvt, <3 x half> addrspace(1)* %out
1607  ret void
1608}
1609
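; The four converted halves are packed into two dwords and stored with a single flat_store_dwordx2.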
1610define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
1611; CI-LABEL: global_truncstore_v4f32_to_v4f16:
1612; CI:       ; %bb.0:
1613; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1614; CI-NEXT:    s_waitcnt lgkmcnt(0)
1615; CI-NEXT:    v_mov_b32_e32 v0, s2
1616; CI-NEXT:    v_mov_b32_e32 v1, s3
1617; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1618; CI-NEXT:    v_mov_b32_e32 v4, s0
1619; CI-NEXT:    v_mov_b32_e32 v5, s1
1620; CI-NEXT:    s_waitcnt vmcnt(0)
1621; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1622; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1623; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1624; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1625; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1626; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
1627; CI-NEXT:    v_or_b32_e32 v1, v2, v3
1628; CI-NEXT:    v_or_b32_e32 v0, v0, v6
1629; CI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
1630; CI-NEXT:    s_endpgm
1631;
1632; VI-LABEL: global_truncstore_v4f32_to_v4f16:
1633; VI:       ; %bb.0:
1634; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1635; VI-NEXT:    s_waitcnt lgkmcnt(0)
1636; VI-NEXT:    v_mov_b32_e32 v0, s2
1637; VI-NEXT:    v_mov_b32_e32 v1, s3
1638; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1639; VI-NEXT:    s_waitcnt vmcnt(0)
1640; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1641; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1642; VI-NEXT:    v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1643; VI-NEXT:    v_cvt_f16_f32_e32 v5, v0
1644; VI-NEXT:    v_mov_b32_e32 v0, s0
1645; VI-NEXT:    v_mov_b32_e32 v1, s1
1646; VI-NEXT:    v_or_b32_e32 v3, v2, v3
1647; VI-NEXT:    v_or_b32_e32 v2, v5, v4
1648; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1649; VI-NEXT:    s_endpgm
1650  %val = load <4 x float>, <4 x float> addrspace(1)* %in
1651  %cvt = fptrunc <4 x float> %val to <4 x half>
1652  store <4 x half> %cvt, <4 x half> addrspace(1)* %out
1653  ret void
1654}
1655
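; The <8 x float> source needs two dwordx4 loads; the packed <8 x half> result fits in one dwordx4 store.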
1656define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
1657; CI-LABEL: global_truncstore_v8f32_to_v8f16:
1658; CI:       ; %bb.0:
1659; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1660; CI-NEXT:    s_waitcnt lgkmcnt(0)
1661; CI-NEXT:    v_mov_b32_e32 v0, s2
1662; CI-NEXT:    v_mov_b32_e32 v1, s3
1663; CI-NEXT:    s_add_u32 s2, s2, 16
1664; CI-NEXT:    s_addc_u32 s3, s3, 0
1665; CI-NEXT:    v_mov_b32_e32 v5, s3
1666; CI-NEXT:    v_mov_b32_e32 v4, s2
1667; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1668; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1669; CI-NEXT:    v_mov_b32_e32 v8, s0
1670; CI-NEXT:    v_mov_b32_e32 v9, s1
1671; CI-NEXT:    s_waitcnt vmcnt(1)
1672; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1673; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1674; CI-NEXT:    s_waitcnt vmcnt(0)
1675; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1676; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1677; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1678; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1679; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1680; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1681; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1682; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
1683; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
1684; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
1685; CI-NEXT:    v_or_b32_e32 v1, v2, v3
1686; CI-NEXT:    v_or_b32_e32 v0, v0, v10
1687; CI-NEXT:    v_or_b32_e32 v3, v6, v7
1688; CI-NEXT:    v_or_b32_e32 v2, v4, v5
1689; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1690; CI-NEXT:    s_endpgm
1691;
1692; VI-LABEL: global_truncstore_v8f32_to_v8f16:
1693; VI:       ; %bb.0:
1694; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1695; VI-NEXT:    s_waitcnt lgkmcnt(0)
1696; VI-NEXT:    v_mov_b32_e32 v0, s2
1697; VI-NEXT:    v_mov_b32_e32 v1, s3
1698; VI-NEXT:    s_add_u32 s2, s2, 16
1699; VI-NEXT:    s_addc_u32 s3, s3, 0
1700; VI-NEXT:    v_mov_b32_e32 v5, s3
1701; VI-NEXT:    v_mov_b32_e32 v4, s2
1702; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1703; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1704; VI-NEXT:    v_mov_b32_e32 v8, s0
1705; VI-NEXT:    v_mov_b32_e32 v9, s1
1706; VI-NEXT:    s_waitcnt vmcnt(1)
1707; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1708; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1709; VI-NEXT:    v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1710; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1711; VI-NEXT:    s_waitcnt vmcnt(0)
1712; VI-NEXT:    v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1713; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1714; VI-NEXT:    v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1715; VI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1716; VI-NEXT:    v_or_b32_e32 v1, v2, v3
1717; VI-NEXT:    v_or_b32_e32 v0, v0, v10
1718; VI-NEXT:    v_or_b32_e32 v3, v6, v7
1719; VI-NEXT:    v_or_b32_e32 v2, v4, v5
1720; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1721; VI-NEXT:    s_endpgm
1722  %val = load <8 x float>, <8 x float> addrspace(1)* %in
1723  %cvt = fptrunc <8 x float> %val to <8 x half>
1724  store <8 x half> %cvt, <8 x half> addrspace(1)* %out
1725  ret void
1726}
1727
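; The <16 x float> source needs four dwordx4 loads; the packed <16 x half> result is written with two dwordx4 stores.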
1728define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
1729; CI-LABEL: global_truncstore_v16f32_to_v16f16:
1730; CI:       ; %bb.0:
1731; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1732; CI-NEXT:    s_waitcnt lgkmcnt(0)
1733; CI-NEXT:    s_add_u32 s4, s2, 32
1734; CI-NEXT:    s_addc_u32 s5, s3, 0
1735; CI-NEXT:    v_mov_b32_e32 v0, s4
1736; CI-NEXT:    v_mov_b32_e32 v1, s5
1737; CI-NEXT:    s_add_u32 s4, s2, 48
1738; CI-NEXT:    s_addc_u32 s5, s3, 0
1739; CI-NEXT:    v_mov_b32_e32 v9, s3
1740; CI-NEXT:    v_mov_b32_e32 v4, s4
1741; CI-NEXT:    v_mov_b32_e32 v8, s2
1742; CI-NEXT:    s_add_u32 s2, s2, 16
1743; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1744; CI-NEXT:    v_mov_b32_e32 v5, s5
1745; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1746; CI-NEXT:    s_addc_u32 s3, s3, 0
1747; CI-NEXT:    v_mov_b32_e32 v13, s3
1748; CI-NEXT:    v_mov_b32_e32 v12, s2
1749; CI-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
1750; CI-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
1751; CI-NEXT:    s_add_u32 s2, s0, 16
1752; CI-NEXT:    s_addc_u32 s3, s1, 0
1753; CI-NEXT:    s_waitcnt vmcnt(3)
1754; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1755; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1756; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1757; CI-NEXT:    s_waitcnt vmcnt(2)
1758; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1759; CI-NEXT:    v_cvt_f16_f32_e32 v16, v5
1760; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1761; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1762; CI-NEXT:    v_cvt_f16_f32_e32 v17, v4
1763; CI-NEXT:    s_waitcnt vmcnt(1)
1764; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
1765; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
1766; CI-NEXT:    s_waitcnt vmcnt(0)
1767; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
1768; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
1769; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
1770; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
1771; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
1772; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
1773; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1774; CI-NEXT:    v_mov_b32_e32 v5, s3
1775; CI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
1776; CI-NEXT:    v_or_b32_e32 v1, v2, v3
1777; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
1778; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v16
1779; CI-NEXT:    v_mov_b32_e32 v4, s2
1780; CI-NEXT:    v_or_b32_e32 v0, v0, v18
1781; CI-NEXT:    v_or_b32_e32 v3, v6, v2
1782; CI-NEXT:    v_or_b32_e32 v2, v17, v7
1783; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
1784; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
1785; CI-NEXT:    v_lshlrev_b32_e32 v9, 16, v15
1786; CI-NEXT:    v_lshlrev_b32_e32 v11, 16, v13
1787; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1788; CI-NEXT:    v_mov_b32_e32 v5, s1
1789; CI-NEXT:    v_or_b32_e32 v1, v10, v6
1790; CI-NEXT:    v_or_b32_e32 v0, v8, v7
1791; CI-NEXT:    v_or_b32_e32 v3, v14, v9
1792; CI-NEXT:    v_or_b32_e32 v2, v12, v11
1793; CI-NEXT:    v_mov_b32_e32 v4, s0
1794; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1795; CI-NEXT:    s_endpgm
1796;
1797; VI-LABEL: global_truncstore_v16f32_to_v16f16:
1798; VI:       ; %bb.0:
1799; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1800; VI-NEXT:    s_waitcnt lgkmcnt(0)
1801; VI-NEXT:    s_add_u32 s4, s2, 32
1802; VI-NEXT:    s_addc_u32 s5, s3, 0
1803; VI-NEXT:    v_mov_b32_e32 v0, s4
1804; VI-NEXT:    v_mov_b32_e32 v1, s5
1805; VI-NEXT:    s_add_u32 s4, s2, 48
1806; VI-NEXT:    s_addc_u32 s5, s3, 0
1807; VI-NEXT:    v_mov_b32_e32 v9, s3
1808; VI-NEXT:    v_mov_b32_e32 v4, s4
1809; VI-NEXT:    v_mov_b32_e32 v8, s2
1810; VI-NEXT:    s_add_u32 s2, s2, 16
1811; VI-NEXT:    v_mov_b32_e32 v5, s5
1812; VI-NEXT:    s_addc_u32 s3, s3, 0
1813; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1814; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1815; VI-NEXT:    v_mov_b32_e32 v13, s3
1816; VI-NEXT:    v_mov_b32_e32 v12, s2
1817; VI-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
1818; VI-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
1819; VI-NEXT:    s_add_u32 s2, s0, 16
1820; VI-NEXT:    s_addc_u32 s3, s1, 0
1821; VI-NEXT:    s_waitcnt vmcnt(3)
1822; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1823; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1824; VI-NEXT:    v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1825; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1826; VI-NEXT:    s_waitcnt vmcnt(2)
1827; VI-NEXT:    v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1828; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1829; VI-NEXT:    v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1830; VI-NEXT:    v_cvt_f16_f32_e32 v18, v4
1831; VI-NEXT:    s_waitcnt vmcnt(1)
1832; VI-NEXT:    v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1833; VI-NEXT:    v_cvt_f16_f32_e32 v10, v10
1834; VI-NEXT:    v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1835; VI-NEXT:    v_cvt_f16_f32_e32 v8, v8
1836; VI-NEXT:    s_waitcnt vmcnt(0)
1837; VI-NEXT:    v_cvt_f16_f32_sdwa v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1838; VI-NEXT:    v_cvt_f16_f32_e32 v14, v14
1839; VI-NEXT:    v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
1840; VI-NEXT:    v_cvt_f16_f32_e32 v12, v12
1841; VI-NEXT:    v_mov_b32_e32 v5, s3
1842; VI-NEXT:    v_mov_b32_e32 v4, s2
1843; VI-NEXT:    v_or_b32_e32 v1, v2, v3
1844; VI-NEXT:    v_or_b32_e32 v0, v0, v16
1845; VI-NEXT:    v_or_b32_e32 v3, v6, v7
1846; VI-NEXT:    v_or_b32_e32 v2, v18, v17
1847; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1848; VI-NEXT:    v_mov_b32_e32 v5, s1
1849; VI-NEXT:    v_or_b32_e32 v1, v10, v11
1850; VI-NEXT:    v_or_b32_e32 v0, v8, v9
1851; VI-NEXT:    v_or_b32_e32 v3, v14, v15
1852; VI-NEXT:    v_or_b32_e32 v2, v12, v13
1853; VI-NEXT:    v_mov_b32_e32 v4, s0
1854; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1855; VI-NEXT:    s_endpgm
1856  %val = load <16 x float>, <16 x float> addrspace(1)* %in
1857  %cvt = fptrunc <16 x float> %val to <16 x half>
1858  store <16 x half> %cvt, <16 x half> addrspace(1)* %out
1859  ret void
1860}
1861
1862; FIXME: Under unsafe math, the f16 <-> f32 conversions around these adds should fold away.
1863define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
1864; CI-LABEL: fadd_f16:
1865; CI:       ; %bb.0:
1866; CI-NEXT:    s_load_dword s0, s[4:5], 0x2
1867; CI-NEXT:    s_waitcnt lgkmcnt(0)
1868; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
1869; CI-NEXT:    s_lshr_b32 s0, s0, 16
1870; CI-NEXT:    v_cvt_f32_f16_e32 v1, s0
1871; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1872; CI-NEXT:    v_add_f32_e32 v0, v0, v1
1873; CI-NEXT:    v_cvt_f16_f32_e32 v2, v0
1874; CI-NEXT:    s_waitcnt lgkmcnt(0)
1875; CI-NEXT:    v_mov_b32_e32 v0, s0
1876; CI-NEXT:    v_mov_b32_e32 v1, s1
1877; CI-NEXT:    flat_store_short v[0:1], v2
1878; CI-NEXT:    s_endpgm
1879;
1880; VI-LABEL: fadd_f16:
1881; VI:       ; %bb.0:
1882; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
1883; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1884; VI-NEXT:    s_waitcnt lgkmcnt(0)
1885; VI-NEXT:    s_lshr_b32 s3, s2, 16
1886; VI-NEXT:    v_mov_b32_e32 v0, s3
1887; VI-NEXT:    v_add_f16_e32 v2, s2, v0
1888; VI-NEXT:    v_mov_b32_e32 v0, s0
1889; VI-NEXT:    v_mov_b32_e32 v1, s1
1890; VI-NEXT:    flat_store_short v[0:1], v2
1891; VI-NEXT:    s_endpgm
1892  %add = fadd half %a, %b
1893  store half %add, half addrspace(1)* %out, align 4
1894  ret void
1895}
1896
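; CI scalarizes the <2 x half> add through f32; VI adds the low half with v_add_f16_e32 and the high half with v_add_f16_sdwa before repacking.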
1897define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
1898; CI-LABEL: fadd_v2f16:
1899; CI:       ; %bb.0:
1900; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
1901; CI-NEXT:    s_waitcnt lgkmcnt(0)
1902; CI-NEXT:    s_lshr_b32 s2, s0, 16
1903; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
1904; CI-NEXT:    s_lshr_b32 s0, s1, 16
1905; CI-NEXT:    v_cvt_f32_f16_e32 v1, s1
1906; CI-NEXT:    v_cvt_f32_f16_e32 v2, s2
1907; CI-NEXT:    v_cvt_f32_f16_e32 v3, s0
1908; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1909; CI-NEXT:    v_add_f32_e32 v0, v0, v1
1910; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1911; CI-NEXT:    v_add_f32_e32 v1, v2, v3
1912; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1913; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1914; CI-NEXT:    v_or_b32_e32 v2, v0, v1
1915; CI-NEXT:    s_waitcnt lgkmcnt(0)
1916; CI-NEXT:    v_mov_b32_e32 v0, s0
1917; CI-NEXT:    v_mov_b32_e32 v1, s1
1918; CI-NEXT:    flat_store_dword v[0:1], v2
1919; CI-NEXT:    s_endpgm
1920;
1921; VI-LABEL: fadd_v2f16:
1922; VI:       ; %bb.0:
1923; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1924; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1925; VI-NEXT:    s_waitcnt lgkmcnt(0)
1926; VI-NEXT:    s_lshr_b32 s4, s1, 16
1927; VI-NEXT:    s_lshr_b32 s5, s0, 16
1928; VI-NEXT:    v_mov_b32_e32 v0, s1
1929; VI-NEXT:    v_mov_b32_e32 v1, s4
1930; VI-NEXT:    v_mov_b32_e32 v2, s5
1931; VI-NEXT:    v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1932; VI-NEXT:    v_add_f16_e32 v0, s0, v0
1933; VI-NEXT:    v_or_b32_e32 v2, v0, v1
1934; VI-NEXT:    v_mov_b32_e32 v0, s2
1935; VI-NEXT:    v_mov_b32_e32 v1, s3
1936; VI-NEXT:    flat_store_dword v[0:1], v2
1937; VI-NEXT:    s_endpgm
1938  %add = fadd <2 x half> %a, %b
1939  store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
1940  ret void
1941}
1942
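; Both <4 x half> operands are adjacent in memory, so one dwordx4 load fetches them; VI adds each dword with a v_add_f16_e32 / v_add_f16_sdwa pair.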
1943define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
1944; CI-LABEL: fadd_v4f16:
1945; CI:       ; %bb.0:
1946; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1947; CI-NEXT:    s_waitcnt lgkmcnt(0)
1948; CI-NEXT:    v_mov_b32_e32 v0, s2
1949; CI-NEXT:    v_mov_b32_e32 v1, s3
1950; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1951; CI-NEXT:    v_mov_b32_e32 v4, s0
1952; CI-NEXT:    v_mov_b32_e32 v5, s1
1953; CI-NEXT:    s_waitcnt vmcnt(0)
1954; CI-NEXT:    v_cvt_f32_f16_e32 v6, v0
1955; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1956; CI-NEXT:    v_cvt_f32_f16_e32 v7, v1
1957; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1958; CI-NEXT:    v_cvt_f32_f16_e32 v8, v2
1959; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1960; CI-NEXT:    v_cvt_f32_f16_e32 v9, v3
1961; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1962; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1963; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1964; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1965; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1966; CI-NEXT:    v_add_f32_e32 v7, v7, v9
1967; CI-NEXT:    v_add_f32_e32 v6, v6, v8
1968; CI-NEXT:    v_add_f32_e32 v1, v1, v3
1969; CI-NEXT:    v_add_f32_e32 v0, v0, v2
1970; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1971; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1972; CI-NEXT:    v_cvt_f16_f32_e32 v2, v7
1973; CI-NEXT:    v_cvt_f16_f32_e32 v3, v6
1974; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1975; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1976; CI-NEXT:    v_or_b32_e32 v1, v2, v1
1977; CI-NEXT:    v_or_b32_e32 v0, v3, v0
1978; CI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
1979; CI-NEXT:    s_endpgm
1980;
1981; VI-LABEL: fadd_v4f16:
1982; VI:       ; %bb.0:
1983; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1984; VI-NEXT:    s_waitcnt lgkmcnt(0)
1985; VI-NEXT:    v_mov_b32_e32 v0, s2
1986; VI-NEXT:    v_mov_b32_e32 v1, s3
1987; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1988; VI-NEXT:    v_mov_b32_e32 v4, s0
1989; VI-NEXT:    v_mov_b32_e32 v5, s1
1990; VI-NEXT:    s_waitcnt vmcnt(0)
1991; VI-NEXT:    v_add_f16_sdwa v6, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1992; VI-NEXT:    v_add_f16_e32 v1, v1, v3
1993; VI-NEXT:    v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1994; VI-NEXT:    v_add_f16_e32 v0, v0, v2
1995; VI-NEXT:    v_or_b32_e32 v1, v1, v6
1996; VI-NEXT:    v_or_b32_e32 v0, v0, v3
1997; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
1998; VI-NEXT:    s_endpgm
1999  %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
2000  %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
2001  %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
2002  %result = fadd <4 x half> %a, %b
2003  store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
2004  ret void
2005}
2006
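; The <8 x half> operands arrive as kernel arguments; VI adds them dword by dword with v_add_f16_e32 / v_add_f16_sdwa, while CI promotes every lane to f32 first.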
2007define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
2008; CI-LABEL: fadd_v8f16:
2009; CI:       ; %bb.0:
2010; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
2011; CI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2012; CI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x8
2013; CI-NEXT:    s_waitcnt lgkmcnt(0)
2014; CI-NEXT:    s_lshr_b32 s10, s0, 16
2015; CI-NEXT:    v_cvt_f32_f16_e32 v4, s0
2016; CI-NEXT:    s_lshr_b32 s0, s4, 16
2017; CI-NEXT:    v_cvt_f32_f16_e32 v8, s0
2018; CI-NEXT:    s_lshr_b32 s0, s5, 16
2019; CI-NEXT:    s_lshr_b32 s11, s1, 16
2020; CI-NEXT:    v_cvt_f32_f16_e32 v0, s10
2021; CI-NEXT:    s_lshr_b32 s10, s2, 16
2022; CI-NEXT:    v_cvt_f32_f16_e32 v9, s0
2023; CI-NEXT:    s_lshr_b32 s0, s6, 16
2024; CI-NEXT:    v_cvt_f32_f16_e32 v1, s11
2025; CI-NEXT:    v_cvt_f32_f16_e32 v2, s10
2026; CI-NEXT:    s_lshr_b32 s10, s3, 16
2027; CI-NEXT:    v_cvt_f32_f16_e32 v10, s0
2028; CI-NEXT:    s_lshr_b32 s0, s7, 16
2029; CI-NEXT:    v_cvt_f32_f16_e32 v3, s10
2030; CI-NEXT:    v_cvt_f32_f16_e32 v5, s1
2031; CI-NEXT:    v_cvt_f32_f16_e32 v11, s0
2032; CI-NEXT:    v_cvt_f32_f16_e32 v12, s4
2033; CI-NEXT:    v_cvt_f32_f16_e32 v13, s5
2034; CI-NEXT:    v_cvt_f32_f16_e32 v6, s2
2035; CI-NEXT:    v_cvt_f32_f16_e32 v7, s3
2036; CI-NEXT:    v_cvt_f32_f16_e32 v14, s7
2037; CI-NEXT:    v_cvt_f32_f16_e32 v15, s6
2038; CI-NEXT:    v_add_f32_e32 v1, v1, v9
2039; CI-NEXT:    v_add_f32_e32 v0, v0, v8
2040; CI-NEXT:    v_add_f32_e32 v3, v3, v11
2041; CI-NEXT:    v_add_f32_e32 v2, v2, v10
2042; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2043; CI-NEXT:    v_add_f32_e32 v5, v5, v13
2044; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2045; CI-NEXT:    v_add_f32_e32 v4, v4, v12
2046; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2047; CI-NEXT:    v_add_f32_e32 v7, v7, v14
2048; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2049; CI-NEXT:    v_add_f32_e32 v6, v6, v15
2050; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
2051; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
2052; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2053; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
2054; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2055; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2056; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2057; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2058; CI-NEXT:    v_or_b32_e32 v1, v5, v1
2059; CI-NEXT:    v_or_b32_e32 v0, v4, v0
2060; CI-NEXT:    v_mov_b32_e32 v4, s8
2061; CI-NEXT:    v_or_b32_e32 v3, v7, v3
2062; CI-NEXT:    v_or_b32_e32 v2, v6, v2
2063; CI-NEXT:    v_mov_b32_e32 v5, s9
2064; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2065; CI-NEXT:    s_endpgm
2066;
2067; VI-LABEL: fadd_v8f16:
2068; VI:       ; %bb.0:
2069; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
2070; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
2071; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2072; VI-NEXT:    s_waitcnt lgkmcnt(0)
2073; VI-NEXT:    s_lshr_b32 s6, s3, 16
2074; VI-NEXT:    s_lshr_b32 s7, s11, 16
2075; VI-NEXT:    v_mov_b32_e32 v0, s6
2076; VI-NEXT:    v_mov_b32_e32 v1, s7
2077; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2078; VI-NEXT:    v_mov_b32_e32 v1, s3
2079; VI-NEXT:    v_add_f16_e32 v1, s11, v1
2080; VI-NEXT:    s_lshr_b32 s3, s2, 16
2081; VI-NEXT:    s_lshr_b32 s6, s10, 16
2082; VI-NEXT:    v_or_b32_e32 v3, v1, v0
2083; VI-NEXT:    v_mov_b32_e32 v0, s3
2084; VI-NEXT:    v_mov_b32_e32 v1, s6
2085; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2086; VI-NEXT:    v_mov_b32_e32 v1, s2
2087; VI-NEXT:    v_add_f16_e32 v1, s10, v1
2088; VI-NEXT:    s_lshr_b32 s2, s1, 16
2089; VI-NEXT:    s_lshr_b32 s3, s9, 16
2090; VI-NEXT:    v_or_b32_e32 v2, v1, v0
2091; VI-NEXT:    v_mov_b32_e32 v0, s2
2092; VI-NEXT:    v_mov_b32_e32 v1, s3
2093; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2094; VI-NEXT:    v_mov_b32_e32 v1, s1
2095; VI-NEXT:    v_add_f16_e32 v1, s9, v1
2096; VI-NEXT:    s_lshr_b32 s1, s0, 16
2097; VI-NEXT:    s_lshr_b32 s2, s8, 16
2098; VI-NEXT:    v_or_b32_e32 v1, v1, v0
2099; VI-NEXT:    v_mov_b32_e32 v0, s1
2100; VI-NEXT:    v_mov_b32_e32 v4, s2
2101; VI-NEXT:    v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2102; VI-NEXT:    v_mov_b32_e32 v4, s0
2103; VI-NEXT:    v_add_f16_e32 v4, s8, v4
2104; VI-NEXT:    v_or_b32_e32 v0, v4, v0
2105; VI-NEXT:    v_mov_b32_e32 v4, s4
2106; VI-NEXT:    v_mov_b32_e32 v5, s5
2107; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2108; VI-NEXT:    s_endpgm
2109  %add = fadd <8 x half> %a, %b
2110  store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
2111  ret void
2112}
2113
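; The bitcast tests check that half <-> i16 bitcasts are no-ops: the value is moved with flat_load_ushort / flat_store_short and no conversion is emitted.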
2114define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
2115; GCN-LABEL: test_bitcast_from_half:
2116; GCN:       ; %bb.0:
2117; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2118; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2119; GCN-NEXT:    v_mov_b32_e32 v0, s0
2120; GCN-NEXT:    v_mov_b32_e32 v1, s1
2121; GCN-NEXT:    flat_load_ushort v2, v[0:1]
2122; GCN-NEXT:    v_mov_b32_e32 v0, s2
2123; GCN-NEXT:    v_mov_b32_e32 v1, s3
2124; GCN-NEXT:    s_waitcnt vmcnt(0)
2125; GCN-NEXT:    flat_store_short v[0:1], v2
2126; GCN-NEXT:    s_endpgm
2127  %val = load half, half addrspace(1)* %in
2128  %val_int = bitcast half %val to i16
2129  store i16 %val_int, i16 addrspace(1)* %out
2130  ret void
2131}
2132
2133define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
2134; GCN-LABEL: test_bitcast_to_half:
2135; GCN:       ; %bb.0:
2136; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2137; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2138; GCN-NEXT:    v_mov_b32_e32 v0, s2
2139; GCN-NEXT:    v_mov_b32_e32 v1, s3
2140; GCN-NEXT:    flat_load_ushort v2, v[0:1]
2141; GCN-NEXT:    v_mov_b32_e32 v0, s0
2142; GCN-NEXT:    v_mov_b32_e32 v1, s1
2143; GCN-NEXT:    s_waitcnt vmcnt(0)
2144; GCN-NEXT:    flat_store_short v[0:1], v2
2145; GCN-NEXT:    s_endpgm
2146  %val = load i16, i16 addrspace(1)* %in
2147  %val_fp = bitcast i16 %val to half
2148  store half %val_fp, half addrspace(1)* %out
2149  ret void
2150}
2151
2152attributes #0 = { nounwind }
2153