1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
4; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
5; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11
6
7; FIXME: Need to handle non-uniform case for function below (load without gep).
; Per-workitem case: each lane volatile-loads one <2 x i16> from %in0[tid]
; and %in1[tid] and subtracts them. The checks expect a single v_pk_sub_i16
; on gfx900, gfx1010 and gfx1100, while tonga splits the operation into
; v_sub_u16 (low half), v_sub_u16_sdwa (high half) and a v_or_b32 to repack.
8define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
9; GFX9-LABEL: v_test_sub_v2i16:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
12; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
13; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
14; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
16; GFX9-NEXT:    s_waitcnt vmcnt(0)
17; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
18; GFX9-NEXT:    s_waitcnt vmcnt(0)
19; GFX9-NEXT:    s_mov_b32 s7, 0xf000
20; GFX9-NEXT:    s_mov_b32 s6, -1
21; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
22; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
23; GFX9-NEXT:    s_endpgm
24;
25; VI-LABEL: v_test_sub_v2i16:
26; VI:       ; %bb.0:
27; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
28; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
29; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
30; VI-NEXT:    s_waitcnt lgkmcnt(0)
31; VI-NEXT:    v_mov_b32_e32 v1, s7
32; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
33; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
34; VI-NEXT:    v_mov_b32_e32 v3, s1
35; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
36; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
37; VI-NEXT:    flat_load_dword v0, v[0:1] glc
38; VI-NEXT:    s_waitcnt vmcnt(0)
39; VI-NEXT:    flat_load_dword v1, v[2:3] glc
40; VI-NEXT:    s_waitcnt vmcnt(0)
41; VI-NEXT:    s_mov_b32 s7, 0xf000
42; VI-NEXT:    s_mov_b32 s6, -1
43; VI-NEXT:    v_sub_u16_e32 v2, v0, v1
44; VI-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
45; VI-NEXT:    v_or_b32_e32 v0, v2, v0
46; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
47; VI-NEXT:    s_endpgm
48;
49; GFX10-LABEL: v_test_sub_v2i16:
50; GFX10:       ; %bb.0:
51; GFX10-NEXT:    s_clause 0x1
52; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
53; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
54; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
55; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
56; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
57; GFX10-NEXT:    s_waitcnt vmcnt(0)
58; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
59; GFX10-NEXT:    s_waitcnt vmcnt(0)
60; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
61; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
62; GFX10-NEXT:    s_mov_b32 s6, -1
63; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
64; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
65; GFX10-NEXT:    s_endpgm
66;
67; GFX11-LABEL: v_test_sub_v2i16:
68; GFX11:       ; %bb.0:
69; GFX11-NEXT:    s_clause 0x1
70; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
71; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
72; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
73; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
74; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
75; GFX11-NEXT:    s_waitcnt vmcnt(0)
76; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1] glc dlc
77; GFX11-NEXT:    s_waitcnt vmcnt(0)
78; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
79; GFX11-NEXT:    s_mov_b32 s6, -1
80; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
81; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
82; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
83; GFX11-NEXT:    s_endpgm
; NOTE(review): %gep.out is computed but never used; the store below writes
; to %out itself, and the expected buffer_store uses no per-lane offset.
; The result is named %add even though the operation is a sub. Changing
; either would require regenerating the assertions above.
84  %tid = call i32 @llvm.amdgcn.workitem.id.x()
85  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
86  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
87  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
88  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
89  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
90  %add = sub <2 x i16> %a, %b
91  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
92  ret void
93}
94
; Uniform case: both operands are plain (non-volatile) loads through
; addrspace(4) pointers with no per-workitem index, and the checks expect
; them to be fetched with scalar s_load instructions. gfx1010 and gfx1100
; feed both SGPRs straight into v_pk_sub_i16, gfx900 first copies one
; operand into a VGPR, and tonga performs the whole subtraction with scalar
; shift/sub/and/or instructions before moving the result to a VGPR.
95define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
96; GFX9-LABEL: s_test_sub_v2i16:
97; GFX9:       ; %bb.0:
98; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
99; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
100; GFX9-NEXT:    s_mov_b32 s3, 0xf000
101; GFX9-NEXT:    s_mov_b32 s2, -1
102; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
103; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
104; GFX9-NEXT:    s_load_dword s11, s[6:7], 0x0
105; GFX9-NEXT:    s_mov_b32 s0, s4
106; GFX9-NEXT:    s_mov_b32 s1, s5
107; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
108; GFX9-NEXT:    v_mov_b32_e32 v0, s10
109; GFX9-NEXT:    v_pk_sub_i16 v0, s11, v0
110; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
111; GFX9-NEXT:    s_endpgm
112;
113; VI-LABEL: s_test_sub_v2i16:
114; VI:       ; %bb.0:
115; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
116; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
117; VI-NEXT:    s_mov_b32 s3, 0xf000
118; VI-NEXT:    s_mov_b32 s2, -1
119; VI-NEXT:    s_waitcnt lgkmcnt(0)
120; VI-NEXT:    s_load_dword s6, s[6:7], 0x0
121; VI-NEXT:    s_load_dword s7, s[0:1], 0x0
122; VI-NEXT:    s_mov_b32 s0, s4
123; VI-NEXT:    s_mov_b32 s1, s5
124; VI-NEXT:    s_waitcnt lgkmcnt(0)
125; VI-NEXT:    s_lshr_b32 s4, s6, 16
126; VI-NEXT:    s_lshr_b32 s5, s7, 16
127; VI-NEXT:    s_sub_i32 s6, s6, s7
128; VI-NEXT:    s_sub_i32 s4, s4, s5
129; VI-NEXT:    s_and_b32 s5, s6, 0xffff
130; VI-NEXT:    s_lshl_b32 s4, s4, 16
131; VI-NEXT:    s_or_b32 s4, s5, s4
132; VI-NEXT:    v_mov_b32_e32 v0, s4
133; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
134; VI-NEXT:    s_endpgm
135;
136; GFX10-LABEL: s_test_sub_v2i16:
137; GFX10:       ; %bb.0:
138; GFX10-NEXT:    s_clause 0x1
139; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
140; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
141; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
142; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
143; GFX10-NEXT:    s_load_dword s1, s[2:3], 0x0
144; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
145; GFX10-NEXT:    s_mov_b32 s6, -1
146; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
147; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1
148; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
149; GFX10-NEXT:    s_endpgm
150;
151; GFX11-LABEL: s_test_sub_v2i16:
152; GFX11:       ; %bb.0:
153; GFX11-NEXT:    s_clause 0x1
154; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
155; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
156; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX11-NEXT:    s_load_b32 s2, s[6:7], 0x0
158; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
159; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
160; GFX11-NEXT:    s_mov_b32 s6, -1
161; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
162; GFX11-NEXT:    v_pk_sub_i16 v0, s2, s0
163; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
164; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
165; GFX11-NEXT:    s_endpgm
; The result is named %add although the operation is a sub.
166  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
167  %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
168  %add = sub <2 x i16> %a, %b
169  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
170  ret void
171}
172
; x - x must fold to zero: on every target the checks expect only the %out
; pointer to be loaded, a constant 0 to be moved into v0 and stored, and no
; load through %in0 or subtract instruction at all. The gfx900 and tonga
; runs produce identical output here and therefore share the GCN prefix.
173define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
174; GCN-LABEL: s_test_sub_self_v2i16:
175; GCN:       ; %bb.0:
176; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
177; GCN-NEXT:    s_mov_b32 s3, 0xf000
178; GCN-NEXT:    s_mov_b32 s2, -1
179; GCN-NEXT:    v_mov_b32_e32 v0, 0
180; GCN-NEXT:    s_waitcnt lgkmcnt(0)
181; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
182; GCN-NEXT:    s_endpgm
183;
184; GFX10-LABEL: s_test_sub_self_v2i16:
185; GFX10:       ; %bb.0:
186; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
187; GFX10-NEXT:    v_mov_b32_e32 v0, 0
188; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
189; GFX10-NEXT:    s_mov_b32 s2, -1
190; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
191; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
192; GFX10-NEXT:    s_endpgm
193;
194; GFX11-LABEL: s_test_sub_self_v2i16:
195; GFX11:       ; %bb.0:
196; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
197; GFX11-NEXT:    v_mov_b32_e32 v0, 0
198; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
199; GFX11-NEXT:    s_mov_b32 s2, -1
200; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
202; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
203; GFX11-NEXT:    s_endpgm
; The load is non-volatile, so it may be dropped once its only use folds away.
204  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
205  %add = sub <2 x i16> %a, %a
206  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
207  ret void
208}
209
210; FIXME: VI should not scalarize arg access.
; Both <2 x i16> operands are passed by value as kernel arguments, so there
; are no pointer loads in the body. The checks expect %a and %b to arrive
; together in one 64-bit scalar load at offset 0x2c. gfx1010 and gfx1100
; subtract the two SGPRs directly with v_pk_sub_i16, gfx900 copies %b into a
; VGPR first, and tonga splits the halves and uses scalar s_sub_i32.
211define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
212; GFX9-LABEL: s_test_sub_v2i16_kernarg:
213; GFX9:       ; %bb.0:
214; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
215; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
216; GFX9-NEXT:    s_mov_b32 s7, 0xf000
217; GFX9-NEXT:    s_mov_b32 s6, -1
218; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
219; GFX9-NEXT:    v_mov_b32_e32 v0, s3
220; GFX9-NEXT:    v_pk_sub_i16 v0, s2, v0
221; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
222; GFX9-NEXT:    s_endpgm
223;
224; VI-LABEL: s_test_sub_v2i16_kernarg:
225; VI:       ; %bb.0:
226; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
227; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
228; VI-NEXT:    s_mov_b32 s3, 0xf000
229; VI-NEXT:    s_mov_b32 s2, -1
230; VI-NEXT:    s_waitcnt lgkmcnt(0)
231; VI-NEXT:    s_lshr_b32 s6, s4, 16
232; VI-NEXT:    s_lshr_b32 s7, s5, 16
233; VI-NEXT:    s_sub_i32 s6, s6, s7
234; VI-NEXT:    s_sub_i32 s4, s4, s5
235; VI-NEXT:    s_lshl_b32 s5, s6, 16
236; VI-NEXT:    s_and_b32 s4, s4, 0xffff
237; VI-NEXT:    s_or_b32 s4, s4, s5
238; VI-NEXT:    v_mov_b32_e32 v0, s4
239; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
240; VI-NEXT:    s_endpgm
241;
242; GFX10-LABEL: s_test_sub_v2i16_kernarg:
243; GFX10:       ; %bb.0:
244; GFX10-NEXT:    s_clause 0x1
245; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
246; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
247; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
248; GFX10-NEXT:    s_mov_b32 s6, -1
249; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
250; GFX10-NEXT:    v_pk_sub_i16 v0, s2, s3
251; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
252; GFX10-NEXT:    s_endpgm
253;
254; GFX11-LABEL: s_test_sub_v2i16_kernarg:
255; GFX11:       ; %bb.0:
256; GFX11-NEXT:    s_clause 0x1
257; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
258; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
259; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
260; GFX11-NEXT:    v_pk_sub_i16 v0, s2, s3
261; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
262; GFX11-NEXT:    s_mov_b32 s2, -1
263; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
264; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
265; GFX11-NEXT:    s_endpgm
; The result is named %add although the operation is a sub.
266  %add = sub <2 x i16> %a, %b
267  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
268  ret void
269}
270
; Subtract the constant <123, 456> from a per-workitem volatile load. Packed
; into 32 bits the constant is 0x01c8007b (456 = 0x1c8 in the high half,
; 123 = 0x7b in the low half). gfx900 materializes it in an SGPR, gfx1010
; and gfx1100 use it as a literal operand of v_pk_sub_i16, and tonga instead
; adds the negated halves (0xff85 is -123 and 0xfe38 is -456 in 16 bits).
271define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
272; GFX9-LABEL: v_test_sub_v2i16_constant:
273; GFX9:       ; %bb.0:
274; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
275; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
276; GFX9-NEXT:    s_mov_b32 s4, 0x1c8007b
277; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
278; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
279; GFX9-NEXT:    s_waitcnt vmcnt(0)
280; GFX9-NEXT:    s_mov_b32 s3, 0xf000
281; GFX9-NEXT:    s_mov_b32 s2, -1
282; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
283; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
284; GFX9-NEXT:    s_endpgm
285;
286; VI-LABEL: v_test_sub_v2i16_constant:
287; VI:       ; %bb.0:
288; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
289; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
290; VI-NEXT:    s_waitcnt lgkmcnt(0)
291; VI-NEXT:    v_mov_b32_e32 v1, s3
292; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
293; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
294; VI-NEXT:    flat_load_dword v0, v[0:1] glc
295; VI-NEXT:    s_waitcnt vmcnt(0)
296; VI-NEXT:    v_mov_b32_e32 v1, 0xfffffe38
297; VI-NEXT:    s_mov_b32 s3, 0xf000
298; VI-NEXT:    s_mov_b32 s2, -1
299; VI-NEXT:    v_add_u16_e32 v2, 0xff85, v0
300; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
301; VI-NEXT:    v_or_b32_e32 v0, v2, v0
302; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
303; VI-NEXT:    s_endpgm
304;
305; GFX10-LABEL: v_test_sub_v2i16_constant:
306; GFX10:       ; %bb.0:
307; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
308; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
309; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
310; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
311; GFX10-NEXT:    s_waitcnt vmcnt(0)
312; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
313; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
314; GFX10-NEXT:    s_mov_b32 s2, -1
315; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x1c8007b
316; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
317; GFX10-NEXT:    s_endpgm
318;
319; GFX11-LABEL: v_test_sub_v2i16_constant:
320; GFX11:       ; %bb.0:
321; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
322; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
323; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
324; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
325; GFX11-NEXT:    s_waitcnt vmcnt(0)
326; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
327; GFX11-NEXT:    s_mov_b32 s2, -1
328; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0x1c8007b
329; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
330; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
331; GFX11-NEXT:    s_endpgm
; NOTE(review): %gep.out is computed but never used; the store below writes
; to %out itself. The result is named %add although the operation is a sub.
332  %tid = call i32 @llvm.amdgcn.workitem.id.x()
333  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
334  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
335  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
336  %add = sub <2 x i16> %a, <i16 123, i16 456>
337  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
338  ret void
339}
340
341; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtract the negative constant <-845, -991> from a per-workitem volatile
; load. Packed into 32 bits the constant is 0xfc21fcb3 (-991 = 0xfc21 in the
; high half, -845 = 0xfcb3 in the low half). gfx900 materializes it in an
; SGPR, gfx1010 and gfx1100 use it as a literal operand of v_pk_sub_i16, and
; tonga turns the subtraction into adds of the positive values
; (0x34d = 845 for the low half, 0x3df = 991 for the high half).
342define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
343; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
344; GFX9:       ; %bb.0:
345; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
346; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
347; GFX9-NEXT:    s_mov_b32 s4, 0xfc21fcb3
348; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
349; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
350; GFX9-NEXT:    s_waitcnt vmcnt(0)
351; GFX9-NEXT:    s_mov_b32 s3, 0xf000
352; GFX9-NEXT:    s_mov_b32 s2, -1
353; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
354; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
355; GFX9-NEXT:    s_endpgm
356;
357; VI-LABEL: v_test_sub_v2i16_neg_constant:
358; VI:       ; %bb.0:
359; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
360; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
361; VI-NEXT:    s_waitcnt lgkmcnt(0)
362; VI-NEXT:    v_mov_b32_e32 v1, s3
363; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
364; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
365; VI-NEXT:    flat_load_dword v0, v[0:1] glc
366; VI-NEXT:    s_waitcnt vmcnt(0)
367; VI-NEXT:    v_mov_b32_e32 v1, 0x3df
368; VI-NEXT:    s_mov_b32 s3, 0xf000
369; VI-NEXT:    s_mov_b32 s2, -1
370; VI-NEXT:    v_add_u16_e32 v2, 0x34d, v0
371; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
372; VI-NEXT:    v_or_b32_e32 v0, v2, v0
373; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
374; VI-NEXT:    s_endpgm
375;
376; GFX10-LABEL: v_test_sub_v2i16_neg_constant:
377; GFX10:       ; %bb.0:
378; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
379; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
380; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
381; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
382; GFX10-NEXT:    s_waitcnt vmcnt(0)
383; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
384; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
385; GFX10-NEXT:    s_mov_b32 s2, -1
386; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0xfc21fcb3
387; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
388; GFX10-NEXT:    s_endpgm
389;
390; GFX11-LABEL: v_test_sub_v2i16_neg_constant:
391; GFX11:       ; %bb.0:
392; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
393; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
394; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
395; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
396; GFX11-NEXT:    s_waitcnt vmcnt(0)
397; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
398; GFX11-NEXT:    s_mov_b32 s2, -1
399; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0xfc21fcb3
400; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
401; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
402; GFX11-NEXT:    s_endpgm
; NOTE(review): %gep.out is computed but never used; the store below writes
; to %out itself. The result is named %add although the operation is a sub.
403  %tid = call i32 @llvm.amdgcn.workitem.id.x()
404  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
405  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
406  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
407  %add = sub <2 x i16> %a, <i16 -845, i16 -991>
408  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
409  ret void
410}
411
; Subtract the splat constant <-1, -1>. On gfx900, gfx1010 and gfx1100 the
; checks expect -1 to be used directly as an operand of v_pk_sub_i16 with
; op_sel_hi:[1,0], i.e. without a separate instruction to materialize it.
; tonga rewrites the subtraction of -1 as an add of 1 on each half.
412define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
413; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
414; GFX9:       ; %bb.0:
415; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
416; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
417; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
418; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
419; GFX9-NEXT:    s_waitcnt vmcnt(0)
420; GFX9-NEXT:    s_mov_b32 s3, 0xf000
421; GFX9-NEXT:    s_mov_b32 s2, -1
422; GFX9-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
423; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
424; GFX9-NEXT:    s_endpgm
425;
426; VI-LABEL: v_test_sub_v2i16_inline_neg1:
427; VI:       ; %bb.0:
428; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
429; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
430; VI-NEXT:    s_waitcnt lgkmcnt(0)
431; VI-NEXT:    v_mov_b32_e32 v1, s3
432; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
433; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
434; VI-NEXT:    flat_load_dword v0, v[0:1] glc
435; VI-NEXT:    s_waitcnt vmcnt(0)
436; VI-NEXT:    v_mov_b32_e32 v1, 1
437; VI-NEXT:    s_mov_b32 s3, 0xf000
438; VI-NEXT:    s_mov_b32 s2, -1
439; VI-NEXT:    v_add_u16_e32 v2, 1, v0
440; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
441; VI-NEXT:    v_or_b32_e32 v0, v2, v0
442; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
443; VI-NEXT:    s_endpgm
444;
445; GFX10-LABEL: v_test_sub_v2i16_inline_neg1:
446; GFX10:       ; %bb.0:
447; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
448; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
449; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
450; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
451; GFX10-NEXT:    s_waitcnt vmcnt(0)
452; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
453; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
454; GFX10-NEXT:    s_mov_b32 s2, -1
455; GFX10-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
456; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
457; GFX10-NEXT:    s_endpgm
458;
459; GFX11-LABEL: v_test_sub_v2i16_inline_neg1:
460; GFX11:       ; %bb.0:
461; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
462; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
463; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
464; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
465; GFX11-NEXT:    s_waitcnt vmcnt(0)
466; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
467; GFX11-NEXT:    s_mov_b32 s2, -1
468; GFX11-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
469; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
470; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
471; GFX11-NEXT:    s_endpgm
; NOTE(review): %gep.out is computed but never used; the store below writes
; to %out itself. The result is named %add although the operation is a sub.
472  %tid = call i32 @llvm.amdgcn.workitem.id.x()
473  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
474  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
475  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
476  %add = sub <2 x i16> %a, <i16 -1, i16 -1>
477  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
478  ret void
479}
480
; Subtract <32, 0>: only the low half changes. On gfx900, gfx1010 and
; gfx1100 the checks expect the constant to appear as the plain operand 32
; of v_pk_sub_i16 with no op_sel modifiers. tonga subtracts 32 from the low
; half with v_subrev_u16 and keeps the untouched high half by masking the
; loaded value with 0xffff0000 and OR-ing it back in.
481define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
482; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
483; GFX9:       ; %bb.0:
484; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
485; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
486; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
487; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
488; GFX9-NEXT:    s_waitcnt vmcnt(0)
489; GFX9-NEXT:    s_mov_b32 s3, 0xf000
490; GFX9-NEXT:    s_mov_b32 s2, -1
491; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 32
492; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
493; GFX9-NEXT:    s_endpgm
494;
495; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
496; VI:       ; %bb.0:
497; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
498; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
499; VI-NEXT:    s_waitcnt lgkmcnt(0)
500; VI-NEXT:    v_mov_b32_e32 v1, s3
501; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
502; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
503; VI-NEXT:    flat_load_dword v0, v[0:1] glc
504; VI-NEXT:    s_waitcnt vmcnt(0)
505; VI-NEXT:    s_mov_b32 s3, 0xf000
506; VI-NEXT:    s_mov_b32 s2, -1
507; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
508; VI-NEXT:    v_subrev_u16_e32 v0, 32, v0
509; VI-NEXT:    v_or_b32_e32 v0, v0, v1
510; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
511; VI-NEXT:    s_endpgm
512;
513; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
514; GFX10:       ; %bb.0:
515; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
516; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
517; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
518; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
519; GFX10-NEXT:    s_waitcnt vmcnt(0)
520; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
521; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
522; GFX10-NEXT:    s_mov_b32 s2, -1
523; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 32
524; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
525; GFX10-NEXT:    s_endpgm
526;
527; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
528; GFX11:       ; %bb.0:
529; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
530; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
531; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
532; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
533; GFX11-NEXT:    s_waitcnt vmcnt(0)
534; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
535; GFX11-NEXT:    s_mov_b32 s2, -1
536; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 32
537; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
538; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
539; GFX11-NEXT:    s_endpgm
; NOTE(review): %gep.out is computed but never used; the store below writes
; to %out itself. The result is named %add although the operation is a sub.
540  %tid = call i32 @llvm.amdgcn.workitem.id.x()
541  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
542  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
543  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
544  %add = sub <2 x i16> %a, <i16 32, i16 0>
545  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
546  ret void
547}
548
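549; The high element (16256 = 0x3f80) is the upper half of the f32 1.0 bit pattern 0x3f800000, so the packed constant <0, 16256> equals the fp constant 1.0.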
; Subtract <0, 16256>. 16256 is 0x3f80, so the packed 32-bit constant is
; 0x3f800000, the bit pattern of f32 1.0. gfx900 is expected to move 1.0
; into an SGPR and use that as the operand. gfx1010 and gfx1100 are expected
; to use the 16-bit literal 0x3f80 with op_sel:[0,1] op_sel_hi:[1,0].
; tonga only touches the high half: it adds 0xc080 (-16256 in 16 bits) via
; SDWA and ORs the result with the unchanged low word.
550define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
551; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
552; GFX9:       ; %bb.0:
553; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
554; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
555; GFX9-NEXT:    s_mov_b32 s4, 1.0
556; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
557; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
558; GFX9-NEXT:    s_waitcnt vmcnt(0)
559; GFX9-NEXT:    s_mov_b32 s3, 0xf000
560; GFX9-NEXT:    s_mov_b32 s2, -1
561; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
562; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
563; GFX9-NEXT:    s_endpgm
564;
565; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
566; VI:       ; %bb.0:
567; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
568; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
569; VI-NEXT:    s_waitcnt lgkmcnt(0)
570; VI-NEXT:    v_mov_b32_e32 v1, s3
571; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
572; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
573; VI-NEXT:    flat_load_dword v0, v[0:1] glc
574; VI-NEXT:    s_waitcnt vmcnt(0)
575; VI-NEXT:    v_mov_b32_e32 v1, 0xffffc080
576; VI-NEXT:    s_mov_b32 s3, 0xf000
577; VI-NEXT:    s_mov_b32 s2, -1
578; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
579; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
580; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
581; VI-NEXT:    s_endpgm
582;
583; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split:
584; GFX10:       ; %bb.0:
585; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
586; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
587; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
588; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
589; GFX10-NEXT:    s_waitcnt vmcnt(0)
590; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
591; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
592; GFX10-NEXT:    s_mov_b32 s2, -1
593; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0]
594; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
595; GFX10-NEXT:    s_endpgm
596;
597; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split:
598; GFX11:       ; %bb.0:
599; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
600; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
601; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
602; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
603; GFX11-NEXT:    s_waitcnt vmcnt(0)
604; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
605; GFX11-NEXT:    s_mov_b32 s2, -1
606; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0]
607; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
608; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
609; GFX11-NEXT:    s_endpgm
; NOTE(review): %gep.out is computed but never used; the store below writes
; to %out itself. The result is named %add although the operation is a sub.
610  %tid = call i32 @llvm.amdgcn.workitem.id.x()
611  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
612  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
613  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
614  %add = sub <2 x i16> %a, <i16 0, i16 16256>
615  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
616  ret void
617}
618
619; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtract two per-workitem <2 x i16> values and zero-extend the result to
; <2 x i32> before storing 64 bits. gfx900, gfx1010 and gfx1100 are expected
; to do one v_pk_sub_i16 and then unpack it with a 16-bit right shift (high
; element) and an AND with 0xffff (low element). tonga gets the extension
; for free: v_sub_u16 and v_sub_u16_sdwa with dst_sel:DWORD each write a
; full 32-bit register, so no separate unpack instructions are expected.
620define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
621; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
622; GFX9:       ; %bb.0:
623; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
624; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
625; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
626; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
627; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
628; GFX9-NEXT:    s_waitcnt vmcnt(0)
629; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
630; GFX9-NEXT:    s_waitcnt vmcnt(0)
631; GFX9-NEXT:    s_mov_b32 s7, 0xf000
632; GFX9-NEXT:    s_mov_b32 s6, -1
633; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
634; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
635; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
636; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
637; GFX9-NEXT:    s_endpgm
638;
639; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
640; VI:       ; %bb.0:
641; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
642; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
643; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
644; VI-NEXT:    s_waitcnt lgkmcnt(0)
645; VI-NEXT:    v_mov_b32_e32 v1, s7
646; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
647; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
648; VI-NEXT:    v_mov_b32_e32 v3, s1
649; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
650; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
651; VI-NEXT:    flat_load_dword v1, v[0:1] glc
652; VI-NEXT:    s_waitcnt vmcnt(0)
653; VI-NEXT:    flat_load_dword v2, v[2:3] glc
654; VI-NEXT:    s_waitcnt vmcnt(0)
655; VI-NEXT:    s_mov_b32 s7, 0xf000
656; VI-NEXT:    s_mov_b32 s6, -1
657; VI-NEXT:    v_sub_u16_e32 v0, v1, v2
658; VI-NEXT:    v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
659; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
660; VI-NEXT:    s_endpgm
661;
662; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32:
663; GFX10:       ; %bb.0:
664; GFX10-NEXT:    s_clause 0x1
665; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
666; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
667; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
668; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
669; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
670; GFX10-NEXT:    s_waitcnt vmcnt(0)
671; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
672; GFX10-NEXT:    s_waitcnt vmcnt(0)
673; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
674; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
675; GFX10-NEXT:    s_mov_b32 s6, -1
676; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
677; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
678; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
679; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
680; GFX10-NEXT:    s_endpgm
681;
682; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i32:
683; GFX11:       ; %bb.0:
684; GFX11-NEXT:    s_clause 0x1
685; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
686; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
687; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
688; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
689; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
690; GFX11-NEXT:    s_waitcnt vmcnt(0)
691; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1] glc dlc
692; GFX11-NEXT:    s_waitcnt vmcnt(0)
693; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
694; GFX11-NEXT:    s_mov_b32 s6, -1
695; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
696; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
697; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
698; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
699; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
700; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
701; GFX11-NEXT:    s_endpgm
; NOTE(review): %gep.out is computed but never used; the store below writes
; to %out itself. The result is named %add although the operation is a sub.
702  %tid = call i32 @llvm.amdgcn.workitem.id.x()
703  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
704  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
705  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
706  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
707  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
708  %add = sub <2 x i16> %a, %b
709  %ext = zext <2 x i16> %add to <2 x i32>
710  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
711  ret void
712}
713
714; FIXME: Need to handle non-uniform case for function below (load without gep).
; Test kernel: loads one <2 x i16> from %in0 and one from %in1 (each address is
; formed with a GEP indexed by the result of @llvm.amdgcn.workitem.id.x, and
; both loads are volatile), subtracts the second from the first, zero-extends
; the difference to <2 x i64> and stores it.
; The per-target check lines that follow are autogenerated (see the NOTE on the
; first line of this file); if the IR changes, regenerate them instead of
; editing them by hand.
715define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
716; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
717; GFX9:       ; %bb.0:
718; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
719; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
720; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
721; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
722; GFX9-NEXT:    v_mov_b32_e32 v1, 0
723; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
724; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
725; GFX9-NEXT:    s_waitcnt vmcnt(0)
726; GFX9-NEXT:    global_load_dword v3, v0, s[2:3] glc
727; GFX9-NEXT:    s_waitcnt vmcnt(0)
728; GFX9-NEXT:    s_mov_b32 s7, 0xf000
729; GFX9-NEXT:    s_mov_b32 s6, -1
730; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v3
731; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
732; GFX9-NEXT:    v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
733; GFX9-NEXT:    v_mov_b32_e32 v3, v1
734; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
735; GFX9-NEXT:    s_endpgm
736;
737; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
738; VI:       ; %bb.0:
739; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
740; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
741; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
742; VI-NEXT:    s_waitcnt lgkmcnt(0)
743; VI-NEXT:    v_mov_b32_e32 v1, s7
744; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
745; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
746; VI-NEXT:    v_mov_b32_e32 v3, s1
747; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
748; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
749; VI-NEXT:    flat_load_dword v4, v[0:1] glc
750; VI-NEXT:    s_waitcnt vmcnt(0)
751; VI-NEXT:    flat_load_dword v2, v[2:3] glc
752; VI-NEXT:    s_waitcnt vmcnt(0)
753; VI-NEXT:    v_mov_b32_e32 v1, 0
754; VI-NEXT:    s_mov_b32 s7, 0xf000
755; VI-NEXT:    s_mov_b32 s6, -1
756; VI-NEXT:    v_mov_b32_e32 v3, v1
757; VI-NEXT:    v_sub_u16_e32 v0, v4, v2
758; VI-NEXT:    v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
759; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
760; VI-NEXT:    s_endpgm
761;
762; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64:
763; GFX10:       ; %bb.0:
764; GFX10-NEXT:    s_clause 0x1
765; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
766; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
767; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
768; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
769; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
770; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
771; GFX10-NEXT:    s_waitcnt vmcnt(0)
772; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
773; GFX10-NEXT:    s_waitcnt vmcnt(0)
774; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
775; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
776; GFX10-NEXT:    s_mov_b32 s6, -1
777; GFX10-NEXT:    v_pk_sub_i16 v2, v1, v2
778; GFX10-NEXT:    v_mov_b32_e32 v1, 0
779; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v2
780; GFX10-NEXT:    v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
781; GFX10-NEXT:    v_mov_b32_e32 v3, v1
782; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
783; GFX10-NEXT:    s_endpgm
784;
785; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i64:
786; GFX11:       ; %bb.0:
787; GFX11-NEXT:    s_clause 0x1
788; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
789; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
790; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
791; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
792; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
793; GFX11-NEXT:    s_waitcnt vmcnt(0)
794; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1] glc dlc
795; GFX11-NEXT:    s_waitcnt vmcnt(0)
796; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
797; GFX11-NEXT:    s_mov_b32 s6, -1
798; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
799; GFX11-NEXT:    v_mov_b32_e32 v1, 0
800; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
801; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
802; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
803; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
804; GFX11-NEXT:    v_lshl_or_b32 v2, 0, 16, v2
805; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
806; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
807; GFX11-NEXT:    s_endpgm
808  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): %gep.out is computed here but never used; the store at the
  ; end of this function writes to %out directly. Confirm whether that is
  ; intentional before changing it, because the check lines above were
  ; generated from the IR as written.
809  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
810  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
811  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
812  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
813  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  ; The value is named %add, but it is the result of a sub (%a - %b).
814  %add = sub <2 x i16> %a, %b
  ; Zero-extend each 16-bit lane to 64 bits before the store.
815  %ext = zext <2 x i16> %add to <2 x i64>
816  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
817  ret void
818}
819
820; FIXME: Need to handle non-uniform case for function below (load without gep).
; Test kernel: loads one <2 x i16> from %in0 and one from %in1 (each address is
; formed with a GEP indexed by the result of @llvm.amdgcn.workitem.id.x, and
; both loads are volatile), subtracts the second from the first, sign-extends
; the difference to <2 x i32> and stores it.
; The per-target check lines that follow are autogenerated (see the NOTE on the
; first line of this file); if the IR changes, regenerate them instead of
; editing them by hand.
821define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
822; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
823; GFX9:       ; %bb.0:
824; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
825; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
826; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
827; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
828; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
829; GFX9-NEXT:    s_waitcnt vmcnt(0)
830; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
831; GFX9-NEXT:    s_waitcnt vmcnt(0)
832; GFX9-NEXT:    s_mov_b32 s7, 0xf000
833; GFX9-NEXT:    s_mov_b32 s6, -1
834; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
835; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
836; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
837; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
838; GFX9-NEXT:    s_endpgm
839;
840; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
841; VI:       ; %bb.0:
842; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
843; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
844; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
845; VI-NEXT:    s_waitcnt lgkmcnt(0)
846; VI-NEXT:    v_mov_b32_e32 v1, s7
847; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
848; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
849; VI-NEXT:    v_mov_b32_e32 v3, s1
850; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
851; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
852; VI-NEXT:    flat_load_dword v0, v[0:1] glc
853; VI-NEXT:    s_waitcnt vmcnt(0)
854; VI-NEXT:    flat_load_dword v1, v[2:3] glc
855; VI-NEXT:    s_waitcnt vmcnt(0)
856; VI-NEXT:    s_mov_b32 s7, 0xf000
857; VI-NEXT:    s_mov_b32 s6, -1
858; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
859; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
860; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
861; VI-NEXT:    v_bfe_i32 v1, v2, 0, 16
862; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
863; VI-NEXT:    s_endpgm
864;
865; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32:
866; GFX10:       ; %bb.0:
867; GFX10-NEXT:    s_clause 0x1
868; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
869; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
870; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
871; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
872; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
873; GFX10-NEXT:    s_waitcnt vmcnt(0)
874; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
875; GFX10-NEXT:    s_waitcnt vmcnt(0)
876; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
877; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
878; GFX10-NEXT:    s_mov_b32 s6, -1
879; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
880; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
881; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
882; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
883; GFX10-NEXT:    s_endpgm
884;
885; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i32:
886; GFX11:       ; %bb.0:
887; GFX11-NEXT:    s_clause 0x1
888; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
889; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
890; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
891; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
892; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
893; GFX11-NEXT:    s_waitcnt vmcnt(0)
894; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1] glc dlc
895; GFX11-NEXT:    s_waitcnt vmcnt(0)
896; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
897; GFX11-NEXT:    s_mov_b32 s6, -1
898; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
899; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
900; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
901; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
902; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
903; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
904; GFX11-NEXT:    s_endpgm
905  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): %gep.out is computed here but never used; the store at the
  ; end of this function writes to %out directly. Confirm whether that is
  ; intentional before changing it, because the check lines above were
  ; generated from the IR as written.
906  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
907  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
908  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
909  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
910  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  ; The value is named %add, but it is the result of a sub (%a - %b).
911  %add = sub <2 x i16> %a, %b
  ; Sign-extend each 16-bit lane to 32 bits before the store.
912  %ext = sext <2 x i16> %add to <2 x i32>
913  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
914  ret void
915}
916
917; FIXME: Need to handle non-uniform case for function below (load without gep).
; Test kernel: loads one <2 x i16> from %in0 and one from %in1 (each address is
; formed with a GEP indexed by the result of @llvm.amdgcn.workitem.id.x),
; subtracts the second from the first, sign-extends the difference to
; <2 x i64> and stores it.
; Unlike the volatile loads in the preceding v_test_sub_v2i16_sext_to_v2i32
; kernel, the two loads in this function are plain (non-volatile) loads.
; The per-target check lines that follow are autogenerated (see the NOTE on the
; first line of this file); if the IR changes, regenerate them instead of
; editing them by hand.
918define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
919; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
920; GFX9:       ; %bb.0:
921; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
922; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
923; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
924; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
925; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
926; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
927; GFX9-NEXT:    s_mov_b32 s7, 0xf000
928; GFX9-NEXT:    s_mov_b32 s6, -1
929; GFX9-NEXT:    s_waitcnt vmcnt(0)
930; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v2
931; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
932; GFX9-NEXT:    v_bfe_i32 v0, v1, 0, 16
933; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 16
934; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
935; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
936; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
937; GFX9-NEXT:    s_endpgm
938;
939; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
940; VI:       ; %bb.0:
941; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
942; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
943; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
944; VI-NEXT:    s_waitcnt lgkmcnt(0)
945; VI-NEXT:    v_mov_b32_e32 v1, s7
946; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
947; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
948; VI-NEXT:    v_mov_b32_e32 v3, s1
949; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
950; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
951; VI-NEXT:    flat_load_dword v0, v[0:1]
952; VI-NEXT:    flat_load_dword v1, v[2:3]
953; VI-NEXT:    s_mov_b32 s7, 0xf000
954; VI-NEXT:    s_mov_b32 s6, -1
955; VI-NEXT:    s_waitcnt vmcnt(0)
956; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
957; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
958; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
959; VI-NEXT:    v_bfe_i32 v2, v2, 0, 16
960; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
961; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
962; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
963; VI-NEXT:    s_endpgm
964;
965; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64:
966; GFX10:       ; %bb.0:
967; GFX10-NEXT:    s_clause 0x1
968; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
969; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
970; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
971; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
972; GFX10-NEXT:    s_clause 0x1
973; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
974; GFX10-NEXT:    global_load_dword v2, v0, s[2:3]
975; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
976; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
977; GFX10-NEXT:    s_mov_b32 s6, -1
978; GFX10-NEXT:    s_waitcnt vmcnt(0)
979; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
980; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
981; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
982; GFX10-NEXT:    v_bfe_i32 v2, v1, 0, 16
983; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
984; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
985; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
986; GFX10-NEXT:    s_endpgm
987;
988; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i64:
989; GFX11:       ; %bb.0:
990; GFX11-NEXT:    s_clause 0x1
991; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
992; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
993; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
994; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
995; GFX11-NEXT:    s_clause 0x1
996; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
997; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
998; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
999; GFX11-NEXT:    s_mov_b32 s6, -1
1000; GFX11-NEXT:    s_waitcnt vmcnt(0)
1001; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
1002; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1003; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1004; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
1005; GFX11-NEXT:    v_bfe_i32 v2, v1, 0, 16
1006; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1007; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1008; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
1009; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
1010; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1011; GFX11-NEXT:    s_endpgm
1012  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): %gep.out is computed here but never used; the store at the
  ; end of this function writes to %out directly. Confirm whether that is
  ; intentional before changing it, because the check lines above were
  ; generated from the IR as written.
1013  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
1014  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
1015  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  ; Plain (non-volatile) loads.
1016  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
1017  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  ; The value is named %add, but it is the result of a sub (%a - %b).
1018  %add = sub <2 x i16> %a, %b
  ; Sign-extend each 16-bit lane to 64 bits before the store.
1019  %ext = sext <2 x i16> %add to <2 x i64>
1020  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
1021  ret void
1022}
1023
; Declaration of the intrinsic each kernel above calls to obtain %tid, the
; index used in the per-lane GEPs. It carries attribute group #0.
1024declare i32 @llvm.amdgcn.workitem.id.x() #0
1025
; #0 is attached to the intrinsic declaration; #1 is attached to the kernel
; definitions.
1026attributes #0 = { nounwind readnone }
1027attributes #1 = { nounwind }
1028