1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
4; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
5
6; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts two <2 x i16> vectors, each read with a volatile load from %in0 and
; %in1 at the element indexed by the workitem id, and stores the difference.
; Checked lowering: one v_pk_sub_i16 on gfx900 and gfx1010; on tonga two 16-bit
; subtracts (the high half through SDWA) merged with v_or_b32.
; NOTE(review): the result is named %add although the instruction is a sub, and
; %gep.out is computed but never used -- the store targets %out directly.
7define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
8; GFX9-LABEL: v_test_sub_v2i16:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
11; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
12; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
13; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
15; GFX9-NEXT:    s_waitcnt vmcnt(0)
16; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
17; GFX9-NEXT:    s_waitcnt vmcnt(0)
18; GFX9-NEXT:    s_mov_b32 s7, 0xf000
19; GFX9-NEXT:    s_mov_b32 s6, -1
20; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
21; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
22; GFX9-NEXT:    s_endpgm
23;
24; VI-LABEL: v_test_sub_v2i16:
25; VI:       ; %bb.0:
26; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
27; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
28; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
29; VI-NEXT:    s_waitcnt lgkmcnt(0)
30; VI-NEXT:    v_mov_b32_e32 v1, s7
31; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
32; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
33; VI-NEXT:    v_mov_b32_e32 v3, s1
34; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
35; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
36; VI-NEXT:    flat_load_dword v0, v[0:1] glc
37; VI-NEXT:    s_waitcnt vmcnt(0)
38; VI-NEXT:    flat_load_dword v1, v[2:3] glc
39; VI-NEXT:    s_waitcnt vmcnt(0)
40; VI-NEXT:    s_mov_b32 s7, 0xf000
41; VI-NEXT:    s_mov_b32 s6, -1
42; VI-NEXT:    v_sub_u16_e32 v2, v0, v1
43; VI-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
44; VI-NEXT:    v_or_b32_e32 v0, v2, v0
45; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
46; VI-NEXT:    s_endpgm
47;
48; GFX10-LABEL: v_test_sub_v2i16:
49; GFX10:       ; %bb.0:
50; GFX10-NEXT:    s_clause 0x1
51; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
52; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
53; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
54; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
55; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
56; GFX10-NEXT:    s_waitcnt vmcnt(0)
57; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
58; GFX10-NEXT:    s_waitcnt vmcnt(0)
59; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
60; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
61; GFX10-NEXT:    s_mov_b32 s6, -1
62; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
63; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
64; GFX10-NEXT:    s_endpgm
65  %tid = call i32 @llvm.amdgcn.workitem.id.x()
66  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
67  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
68  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
69  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
70  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
71  %add = sub <2 x i16> %a, %b
72  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
73  ret void
74}
75
; Subtracts two <2 x i16> vectors loaded through the addrspace(4) pointers %in0
; and %in1 (no workitem-id indexing) and stores the difference to %out.
; Checked lowering: both operands arrive via s_load_dword. gfx900 copies one
; operand into a VGPR before v_pk_sub_i16, gfx1010 feeds both SGPRs to
; v_pk_sub_i16 directly, and tonga does the whole subtraction with scalar ops
; (s_sub_i32 on the low and shifted-down high halves, then mask/shift/or).
; NOTE(review): the result is named %add although the instruction is a sub.
76define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
77; GFX9-LABEL: s_test_sub_v2i16:
78; GFX9:       ; %bb.0:
79; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
80; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
81; GFX9-NEXT:    s_mov_b32 s3, 0xf000
82; GFX9-NEXT:    s_mov_b32 s2, -1
83; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
84; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
85; GFX9-NEXT:    s_load_dword s11, s[6:7], 0x0
86; GFX9-NEXT:    s_mov_b32 s0, s4
87; GFX9-NEXT:    s_mov_b32 s1, s5
88; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
89; GFX9-NEXT:    v_mov_b32_e32 v0, s10
90; GFX9-NEXT:    v_pk_sub_i16 v0, s11, v0
91; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
92; GFX9-NEXT:    s_endpgm
93;
94; VI-LABEL: s_test_sub_v2i16:
95; VI:       ; %bb.0:
96; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
97; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
98; VI-NEXT:    s_mov_b32 s3, 0xf000
99; VI-NEXT:    s_mov_b32 s2, -1
100; VI-NEXT:    s_waitcnt lgkmcnt(0)
101; VI-NEXT:    s_load_dword s6, s[6:7], 0x0
102; VI-NEXT:    s_load_dword s7, s[0:1], 0x0
103; VI-NEXT:    s_mov_b32 s0, s4
104; VI-NEXT:    s_mov_b32 s1, s5
105; VI-NEXT:    s_waitcnt lgkmcnt(0)
106; VI-NEXT:    s_lshr_b32 s4, s6, 16
107; VI-NEXT:    s_lshr_b32 s5, s7, 16
108; VI-NEXT:    s_sub_i32 s6, s6, s7
109; VI-NEXT:    s_sub_i32 s4, s4, s5
110; VI-NEXT:    s_and_b32 s5, s6, 0xffff
111; VI-NEXT:    s_lshl_b32 s4, s4, 16
112; VI-NEXT:    s_or_b32 s4, s5, s4
113; VI-NEXT:    v_mov_b32_e32 v0, s4
114; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
115; VI-NEXT:    s_endpgm
116;
117; GFX10-LABEL: s_test_sub_v2i16:
118; GFX10:       ; %bb.0:
119; GFX10-NEXT:    s_clause 0x1
120; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
121; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
122; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
123; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
124; GFX10-NEXT:    s_load_dword s1, s[2:3], 0x0
125; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
126; GFX10-NEXT:    s_mov_b32 s6, -1
127; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1
129; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
130; GFX10-NEXT:    s_endpgm
131  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
132  %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
133  %add = sub <2 x i16> %a, %b
134  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
135  ret void
136}
137
; Subtracts a <2 x i16> value loaded from %in0 from itself and stores the
; result to %out.
; Checked lowering (all three targets): no subtract instruction is expected;
; only the %out pointer is fetched from the kernarg segment and a constant 0
; (v_mov_b32 v0, 0) is stored.
; NOTE(review): the result is named %add although the instruction is a sub.
138define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
139; GCN-LABEL: s_test_sub_self_v2i16:
140; GCN:       ; %bb.0:
141; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
142; GCN-NEXT:    s_mov_b32 s3, 0xf000
143; GCN-NEXT:    s_mov_b32 s2, -1
144; GCN-NEXT:    v_mov_b32_e32 v0, 0
145; GCN-NEXT:    s_waitcnt lgkmcnt(0)
146; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
147; GCN-NEXT:    s_endpgm
148;
149; GFX10-LABEL: s_test_sub_self_v2i16:
150; GFX10:       ; %bb.0:
151; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
152; GFX10-NEXT:    v_mov_b32_e32 v0, 0
153; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
154; GFX10-NEXT:    s_mov_b32 s2, -1
155; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
156; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
157; GFX10-NEXT:    s_endpgm
158  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
159  %add = sub <2 x i16> %a, %a
160  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
161  ret void
162}
163
164; FIXME: VI should not scalarize arg access.
; Subtracts two <2 x i16> vectors that are passed by value as kernel arguments
; (%a - %b) and stores the difference to %out.
; Checked lowering: each operand is fetched with an s_load_dword (kernarg
; offsets 0x2c and 0x30). gfx900 and gfx1010 use v_pk_sub_i16; tonga splits the
; halves with s_lshr_b32, subtracts with s_sub_i32 and repacks with
; s_lshl_b32 / s_and_b32 / s_or_b32.
; NOTE(review): the result is named %add although the instruction is a sub.
165define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
166; GFX9-LABEL: s_test_sub_v2i16_kernarg:
167; GFX9:       ; %bb.0:
168; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x30
169; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x2c
170; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
171; GFX9-NEXT:    s_mov_b32 s7, 0xf000
172; GFX9-NEXT:    s_mov_b32 s6, -1
173; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
174; GFX9-NEXT:    v_mov_b32_e32 v0, s2
175; GFX9-NEXT:    v_pk_sub_i16 v0, s3, v0
176; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
177; GFX9-NEXT:    s_endpgm
178;
179; VI-LABEL: s_test_sub_v2i16_kernarg:
180; VI:       ; %bb.0:
181; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
182; VI-NEXT:    s_load_dword s5, s[0:1], 0x30
183; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
184; VI-NEXT:    s_mov_b32 s3, 0xf000
185; VI-NEXT:    s_mov_b32 s2, -1
186; VI-NEXT:    s_waitcnt lgkmcnt(0)
187; VI-NEXT:    s_lshr_b32 s6, s4, 16
188; VI-NEXT:    s_lshr_b32 s7, s5, 16
189; VI-NEXT:    s_sub_i32 s6, s6, s7
190; VI-NEXT:    s_sub_i32 s4, s4, s5
191; VI-NEXT:    s_lshl_b32 s5, s6, 16
192; VI-NEXT:    s_and_b32 s4, s4, 0xffff
193; VI-NEXT:    s_or_b32 s4, s4, s5
194; VI-NEXT:    v_mov_b32_e32 v0, s4
195; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
196; VI-NEXT:    s_endpgm
197;
198; GFX10-LABEL: s_test_sub_v2i16_kernarg:
199; GFX10:       ; %bb.0:
200; GFX10-NEXT:    s_clause 0x2
201; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
202; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x30
203; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
204; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
205; GFX10-NEXT:    s_mov_b32 s6, -1
206; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
207; GFX10-NEXT:    v_pk_sub_i16 v0, s2, s3
208; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
209; GFX10-NEXT:    s_endpgm
210  %add = sub <2 x i16> %a, %b
211  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
212  ret void
213}
214
; Subtracts the constant vector <123, 456> from a <2 x i16> value read with a
; volatile load from %in0 at the element indexed by the workitem id.
; Checked lowering: the packed constant is 0x1c8007b (0x1c8 = 456 in the high
; half, 0x7b = 123 in the low half). gfx900 first moves it into an SGPR,
; gfx1010 passes it to v_pk_sub_i16 as a literal, and tonga instead adds the
; negated halves (0xff85 to the low half, 0xfffffe38 to the high half via SDWA).
; NOTE(review): the result is named %add although the instruction is a sub, and
; %gep.out is computed but never used -- the store targets %out directly.
215define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
216; GFX9-LABEL: v_test_sub_v2i16_constant:
217; GFX9:       ; %bb.0:
218; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
219; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
220; GFX9-NEXT:    s_mov_b32 s4, 0x1c8007b
221; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
222; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
223; GFX9-NEXT:    s_waitcnt vmcnt(0)
224; GFX9-NEXT:    s_mov_b32 s3, 0xf000
225; GFX9-NEXT:    s_mov_b32 s2, -1
226; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
227; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
228; GFX9-NEXT:    s_endpgm
229;
230; VI-LABEL: v_test_sub_v2i16_constant:
231; VI:       ; %bb.0:
232; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
233; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
234; VI-NEXT:    s_waitcnt lgkmcnt(0)
235; VI-NEXT:    v_mov_b32_e32 v1, s3
236; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
237; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
238; VI-NEXT:    flat_load_dword v0, v[0:1] glc
239; VI-NEXT:    s_waitcnt vmcnt(0)
240; VI-NEXT:    v_mov_b32_e32 v1, 0xfffffe38
241; VI-NEXT:    s_mov_b32 s3, 0xf000
242; VI-NEXT:    s_mov_b32 s2, -1
243; VI-NEXT:    v_add_u16_e32 v2, 0xff85, v0
244; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
245; VI-NEXT:    v_or_b32_e32 v0, v2, v0
246; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
247; VI-NEXT:    s_endpgm
248;
249; GFX10-LABEL: v_test_sub_v2i16_constant:
250; GFX10:       ; %bb.0:
251; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
252; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
253; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
254; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
255; GFX10-NEXT:    s_waitcnt vmcnt(0)
256; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
257; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
258; GFX10-NEXT:    s_mov_b32 s2, -1
259; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x1c8007b
260; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
261; GFX10-NEXT:    s_endpgm
262  %tid = call i32 @llvm.amdgcn.workitem.id.x()
263  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
264  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
265  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
266  %add = sub <2 x i16> %a, <i16 123, i16 456>
267  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
268  ret void
269}
270
271; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts the negative constant vector <-845, -991> from a <2 x i16> value
; read with a volatile load from %in0 at the element indexed by the workitem id.
; Checked lowering: the packed constant is 0xfc21fcb3 (0xfc21 = -991 in the high
; half, 0xfcb3 = -845 in the low half). gfx900 first moves it into an SGPR,
; gfx1010 passes it to v_pk_sub_i16 as a literal, and tonga instead adds the
; positive magnitudes (0x34d = 845 to the low half, 0x3df = 991 to the high
; half via SDWA).
; NOTE(review): the result is named %add although the instruction is a sub, and
; %gep.out is computed but never used -- the store targets %out directly.
272define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
273; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
274; GFX9:       ; %bb.0:
275; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
276; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
277; GFX9-NEXT:    s_mov_b32 s4, 0xfc21fcb3
278; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
279; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
280; GFX9-NEXT:    s_waitcnt vmcnt(0)
281; GFX9-NEXT:    s_mov_b32 s3, 0xf000
282; GFX9-NEXT:    s_mov_b32 s2, -1
283; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
284; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
285; GFX9-NEXT:    s_endpgm
286;
287; VI-LABEL: v_test_sub_v2i16_neg_constant:
288; VI:       ; %bb.0:
289; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
290; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
291; VI-NEXT:    s_waitcnt lgkmcnt(0)
292; VI-NEXT:    v_mov_b32_e32 v1, s3
293; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
294; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
295; VI-NEXT:    flat_load_dword v0, v[0:1] glc
296; VI-NEXT:    s_waitcnt vmcnt(0)
297; VI-NEXT:    v_mov_b32_e32 v1, 0x3df
298; VI-NEXT:    s_mov_b32 s3, 0xf000
299; VI-NEXT:    s_mov_b32 s2, -1
300; VI-NEXT:    v_add_u16_e32 v2, 0x34d, v0
301; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
302; VI-NEXT:    v_or_b32_e32 v0, v2, v0
303; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
304; VI-NEXT:    s_endpgm
305;
306; GFX10-LABEL: v_test_sub_v2i16_neg_constant:
307; GFX10:       ; %bb.0:
308; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
309; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
310; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
311; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
312; GFX10-NEXT:    s_waitcnt vmcnt(0)
313; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
314; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
315; GFX10-NEXT:    s_mov_b32 s2, -1
316; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0xfc21fcb3
317; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
318; GFX10-NEXT:    s_endpgm
319  %tid = call i32 @llvm.amdgcn.workitem.id.x()
320  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
321  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
322  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
323  %add = sub <2 x i16> %a, <i16 -845, i16 -991>
324  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
325  ret void
326}
327
; Subtracts the splat constant <-1, -1> from a <2 x i16> value read with a
; volatile load from %in0 at the element indexed by the workitem id.
; Checked lowering: gfx900 and gfx1010 pass -1 directly as the second operand
; of v_pk_sub_i16 with op_sel_hi:[1,0]; tonga instead adds 1 to each half (the
; high half via SDWA) and merges the halves with v_or_b32.
; NOTE(review): the result is named %add although the instruction is a sub, and
; %gep.out is computed but never used -- the store targets %out directly.
328define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
329; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
330; GFX9:       ; %bb.0:
331; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
332; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
333; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
334; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
335; GFX9-NEXT:    s_waitcnt vmcnt(0)
336; GFX9-NEXT:    s_mov_b32 s3, 0xf000
337; GFX9-NEXT:    s_mov_b32 s2, -1
338; GFX9-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
339; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
340; GFX9-NEXT:    s_endpgm
341;
342; VI-LABEL: v_test_sub_v2i16_inline_neg1:
343; VI:       ; %bb.0:
344; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
345; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
346; VI-NEXT:    s_waitcnt lgkmcnt(0)
347; VI-NEXT:    v_mov_b32_e32 v1, s3
348; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
349; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
350; VI-NEXT:    flat_load_dword v0, v[0:1] glc
351; VI-NEXT:    s_waitcnt vmcnt(0)
352; VI-NEXT:    v_mov_b32_e32 v1, 1
353; VI-NEXT:    s_mov_b32 s3, 0xf000
354; VI-NEXT:    s_mov_b32 s2, -1
355; VI-NEXT:    v_add_u16_e32 v2, 1, v0
356; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
357; VI-NEXT:    v_or_b32_e32 v0, v2, v0
358; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
359; VI-NEXT:    s_endpgm
360;
361; GFX10-LABEL: v_test_sub_v2i16_inline_neg1:
362; GFX10:       ; %bb.0:
363; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
364; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
365; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
366; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
367; GFX10-NEXT:    s_waitcnt vmcnt(0)
368; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
369; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
370; GFX10-NEXT:    s_mov_b32 s2, -1
371; GFX10-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
372; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
373; GFX10-NEXT:    s_endpgm
374  %tid = call i32 @llvm.amdgcn.workitem.id.x()
375  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
376  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
377  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
378  %add = sub <2 x i16> %a, <i16 -1, i16 -1>
379  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
380  ret void
381}
382
; Subtracts <32, 0> (constant in the low element, zero in the high element)
; from a <2 x i16> value read with a volatile load from %in0 at the element
; indexed by the workitem id.
; Checked lowering: gfx900 and gfx1010 pass 32 directly as the second operand
; of v_pk_sub_i16 (no op_sel modifiers in the checked line); tonga subtracts 32
; from the low half with v_subrev_u16 and keeps the high half unchanged by
; masking with 0xffff0000 and or-ing it back in.
; NOTE(review): the result is named %add although the instruction is a sub, and
; %gep.out is computed but never used -- the store targets %out directly.
383define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
384; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
385; GFX9:       ; %bb.0:
386; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
387; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
388; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
389; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
390; GFX9-NEXT:    s_waitcnt vmcnt(0)
391; GFX9-NEXT:    s_mov_b32 s3, 0xf000
392; GFX9-NEXT:    s_mov_b32 s2, -1
393; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 32
394; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
395; GFX9-NEXT:    s_endpgm
396;
397; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
398; VI:       ; %bb.0:
399; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
400; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
401; VI-NEXT:    s_waitcnt lgkmcnt(0)
402; VI-NEXT:    v_mov_b32_e32 v1, s3
403; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
404; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
405; VI-NEXT:    flat_load_dword v0, v[0:1] glc
406; VI-NEXT:    s_waitcnt vmcnt(0)
407; VI-NEXT:    s_mov_b32 s3, 0xf000
408; VI-NEXT:    s_mov_b32 s2, -1
409; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
410; VI-NEXT:    v_subrev_u16_e32 v0, 32, v0
411; VI-NEXT:    v_or_b32_e32 v0, v0, v1
412; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
413; VI-NEXT:    s_endpgm
414;
415; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
416; GFX10:       ; %bb.0:
417; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
418; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
419; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
420; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
421; GFX10-NEXT:    s_waitcnt vmcnt(0)
422; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
423; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
424; GFX10-NEXT:    s_mov_b32 s2, -1
425; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 32
426; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
427; GFX10-NEXT:    s_endpgm
428  %tid = call i32 @llvm.amdgcn.workitem.id.x()
429  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
430  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
431  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
432  %add = sub <2 x i16> %a, <i16 32, i16 0>
433  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
434  ret void
435}
436
437; The high element (16256 = 0x3f80) over a zero low element packs to 0x3f800000, the bit pattern of the fp constant 1.0.
; Subtracts <0, 16256> from a <2 x i16> value read with a volatile load from
; %in0 at the element indexed by the workitem id. 16256 is 0x3f80, so the
; packed 32-bit constant is 0x3f800000, the bit pattern of float 1.0.
; Checked lowering: gfx900 materializes the constant with s_mov_b32 s4, 1.0;
; gfx1010 uses the literal 0x3f80 with op_sel:[0,1] op_sel_hi:[1,0]; tonga adds
; 0xffffc080 to the high half via SDWA and passes the low half through
; unchanged (v_or_b32_sdwa with src0_sel:WORD_0).
; NOTE(review): the result is named %add although the instruction is a sub, and
; %gep.out is computed but never used -- the store targets %out directly.
438define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
439; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
440; GFX9:       ; %bb.0:
441; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
442; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
443; GFX9-NEXT:    s_mov_b32 s4, 1.0
444; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
445; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
446; GFX9-NEXT:    s_waitcnt vmcnt(0)
447; GFX9-NEXT:    s_mov_b32 s3, 0xf000
448; GFX9-NEXT:    s_mov_b32 s2, -1
449; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
450; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
451; GFX9-NEXT:    s_endpgm
452;
453; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
454; VI:       ; %bb.0:
455; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
456; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
457; VI-NEXT:    s_waitcnt lgkmcnt(0)
458; VI-NEXT:    v_mov_b32_e32 v1, s3
459; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
460; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
461; VI-NEXT:    flat_load_dword v0, v[0:1] glc
462; VI-NEXT:    s_waitcnt vmcnt(0)
463; VI-NEXT:    v_mov_b32_e32 v1, 0xffffc080
464; VI-NEXT:    s_mov_b32 s3, 0xf000
465; VI-NEXT:    s_mov_b32 s2, -1
466; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
467; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
468; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
469; VI-NEXT:    s_endpgm
470;
471; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split:
472; GFX10:       ; %bb.0:
473; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
474; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
475; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
476; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
477; GFX10-NEXT:    s_waitcnt vmcnt(0)
478; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
479; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
480; GFX10-NEXT:    s_mov_b32 s2, -1
481; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0]
482; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
483; GFX10-NEXT:    s_endpgm
484  %tid = call i32 @llvm.amdgcn.workitem.id.x()
485  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
486  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
487  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
488  %add = sub <2 x i16> %a, <i16 0, i16 16256>
489  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
490  ret void
491}
492
493; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts two <2 x i16> vectors (volatile loads from %in0 and %in1 at the
; element indexed by the workitem id), zero-extends the difference to
; <2 x i32> and stores it with a single 64-bit store.
; Checked lowering: gfx900 and gfx1010 use v_pk_sub_i16 and then split the
; halves with v_lshrrev_b32 16 and v_and_b32 0xffff; tonga produces each dword
; directly from a 16-bit subtract (the high half via SDWA with dst_sel:DWORD),
; with no separate extension instruction in the checked output.
; NOTE(review): the result is named %add although the instruction is a sub, and
; %gep.out is computed but never used -- the store targets %out directly.
494define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
495; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
496; GFX9:       ; %bb.0:
497; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
498; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
499; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
500; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
501; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
502; GFX9-NEXT:    s_waitcnt vmcnt(0)
503; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
504; GFX9-NEXT:    s_waitcnt vmcnt(0)
505; GFX9-NEXT:    s_mov_b32 s7, 0xf000
506; GFX9-NEXT:    s_mov_b32 s6, -1
507; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
508; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
509; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
510; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
511; GFX9-NEXT:    s_endpgm
512;
513; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
514; VI:       ; %bb.0:
515; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
516; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
517; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
518; VI-NEXT:    s_waitcnt lgkmcnt(0)
519; VI-NEXT:    v_mov_b32_e32 v1, s7
520; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
521; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
522; VI-NEXT:    v_mov_b32_e32 v3, s1
523; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
524; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
525; VI-NEXT:    flat_load_dword v1, v[0:1] glc
526; VI-NEXT:    s_waitcnt vmcnt(0)
527; VI-NEXT:    flat_load_dword v2, v[2:3] glc
528; VI-NEXT:    s_waitcnt vmcnt(0)
529; VI-NEXT:    s_mov_b32 s7, 0xf000
530; VI-NEXT:    s_mov_b32 s6, -1
531; VI-NEXT:    v_sub_u16_e32 v0, v1, v2
532; VI-NEXT:    v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
533; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
534; VI-NEXT:    s_endpgm
535;
536; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32:
537; GFX10:       ; %bb.0:
538; GFX10-NEXT:    s_clause 0x1
539; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
540; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
541; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
542; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
543; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
544; GFX10-NEXT:    s_waitcnt vmcnt(0)
545; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
546; GFX10-NEXT:    s_waitcnt vmcnt(0)
547; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
548; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
549; GFX10-NEXT:    s_mov_b32 s6, -1
550; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
551; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
552; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
553; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
554; GFX10-NEXT:    s_endpgm
555  %tid = call i32 @llvm.amdgcn.workitem.id.x()
556  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
557  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
558  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
559  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
560  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
561  %add = sub <2 x i16> %a, %b
562  %ext = zext <2 x i16> %add to <2 x i32>
563  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
564  ret void
565}
566
567; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts two <2 x i16> vectors (volatile loads from %in0 and %in1 at the
; element indexed by the workitem id), zero-extends the difference to
; <2 x i64> and stores it with a single 128-bit store.
; Checked lowering: the low dword of each i64 comes from the 16-bit difference
; (v_pk_sub_i16 plus and/shift on gfx900 and gfx1010; v_sub_u16 and an SDWA
; v_sub_u16 on tonga) and the high dword of each i64 is a zero materialized
; with v_mov_b32.
; NOTE(review): the result is named %add although the instruction is a sub, and
; %gep.out is computed but never used -- the store targets %out directly.
568define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
569; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
570; GFX9:       ; %bb.0:
571; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
572; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
573; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
574; GFX9-NEXT:    v_mov_b32_e32 v1, 0
575; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
576; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
577; GFX9-NEXT:    s_waitcnt vmcnt(0)
578; GFX9-NEXT:    global_load_dword v3, v0, s[2:3] glc
579; GFX9-NEXT:    s_waitcnt vmcnt(0)
580; GFX9-NEXT:    s_mov_b32 s7, 0xf000
581; GFX9-NEXT:    s_mov_b32 s6, -1
582; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v3
583; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
584; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
585; GFX9-NEXT:    v_mov_b32_e32 v3, v1
586; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
587; GFX9-NEXT:    s_endpgm
588;
589; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
590; VI:       ; %bb.0:
591; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
592; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
593; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
594; VI-NEXT:    s_waitcnt lgkmcnt(0)
595; VI-NEXT:    v_mov_b32_e32 v1, s7
596; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
597; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
598; VI-NEXT:    v_mov_b32_e32 v3, s1
599; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
600; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
601; VI-NEXT:    flat_load_dword v4, v[0:1] glc
602; VI-NEXT:    s_waitcnt vmcnt(0)
603; VI-NEXT:    flat_load_dword v2, v[2:3] glc
604; VI-NEXT:    s_waitcnt vmcnt(0)
605; VI-NEXT:    v_mov_b32_e32 v1, 0
606; VI-NEXT:    s_mov_b32 s7, 0xf000
607; VI-NEXT:    s_mov_b32 s6, -1
608; VI-NEXT:    v_mov_b32_e32 v3, v1
609; VI-NEXT:    v_sub_u16_e32 v0, v4, v2
610; VI-NEXT:    v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
611; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
612; VI-NEXT:    s_endpgm
613;
614; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64:
615; GFX10:       ; %bb.0:
616; GFX10-NEXT:    s_clause 0x1
617; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
618; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
619; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
620; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
621; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
622; GFX10-NEXT:    s_waitcnt vmcnt(0)
623; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
624; GFX10-NEXT:    s_waitcnt vmcnt(0)
625; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
626; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
627; GFX10-NEXT:    s_mov_b32 s6, -1
628; GFX10-NEXT:    v_pk_sub_i16 v2, v1, v2
629; GFX10-NEXT:    v_mov_b32_e32 v1, 0
630; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v2
631; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
632; GFX10-NEXT:    v_mov_b32_e32 v3, v1
633; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
634; GFX10-NEXT:    s_endpgm
635  %tid = call i32 @llvm.amdgcn.workitem.id.x()
636  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
637  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
638  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
639  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
640  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
641  %add = sub <2 x i16> %a, %b
642  %ext = zext <2 x i16> %add to <2 x i64>
643  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
644  ret void
645}
646
647; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts two <2 x i16> vectors (volatile loads from %in0 and %in1 at the
; element indexed by the workitem id), sign-extends the difference to
; <2 x i32> and stores it with a single 64-bit store.
; Checked lowering: gfx900 and gfx1010 use v_pk_sub_i16, then v_ashrrev_i32 16
; for the high element and v_bfe_i32 0, 16 for the low element; tonga performs
; two 16-bit subtracts (the high half via SDWA) and sign-extends each result
; with v_bfe_i32 0, 16.
; NOTE(review): the result is named %add although the instruction is a sub, and
; %gep.out is computed but never used -- the store targets %out directly.
648define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
649; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
650; GFX9:       ; %bb.0:
651; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
652; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
653; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
654; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
655; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
656; GFX9-NEXT:    s_waitcnt vmcnt(0)
657; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
658; GFX9-NEXT:    s_waitcnt vmcnt(0)
659; GFX9-NEXT:    s_mov_b32 s7, 0xf000
660; GFX9-NEXT:    s_mov_b32 s6, -1
661; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
662; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
663; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
664; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
665; GFX9-NEXT:    s_endpgm
666;
667; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
668; VI:       ; %bb.0:
669; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
670; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
671; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
672; VI-NEXT:    s_waitcnt lgkmcnt(0)
673; VI-NEXT:    v_mov_b32_e32 v1, s7
674; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
675; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
676; VI-NEXT:    v_mov_b32_e32 v3, s1
677; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
678; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
679; VI-NEXT:    flat_load_dword v0, v[0:1] glc
680; VI-NEXT:    s_waitcnt vmcnt(0)
681; VI-NEXT:    flat_load_dword v1, v[2:3] glc
682; VI-NEXT:    s_waitcnt vmcnt(0)
683; VI-NEXT:    s_mov_b32 s7, 0xf000
684; VI-NEXT:    s_mov_b32 s6, -1
685; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
686; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
687; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
688; VI-NEXT:    v_bfe_i32 v1, v2, 0, 16
689; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
690; VI-NEXT:    s_endpgm
691;
692; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32:
693; GFX10:       ; %bb.0:
694; GFX10-NEXT:    s_clause 0x1
695; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
696; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
697; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
698; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
699; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
700; GFX10-NEXT:    s_waitcnt vmcnt(0)
701; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
702; GFX10-NEXT:    s_waitcnt vmcnt(0)
703; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
704; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
705; GFX10-NEXT:    s_mov_b32 s6, -1
706; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
707; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
708; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
709; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
710; GFX10-NEXT:    s_endpgm
711  %tid = call i32 @llvm.amdgcn.workitem.id.x()
712  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
713  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
714  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
715  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
716  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
717  %add = sub <2 x i16> %a, %b
718  %ext = sext <2 x i16> %add to <2 x i32>
719  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
720  ret void
721}
722
723; FIXME: Need to handle non-uniform case for function below (load without gep).
; Checks codegen for a per-work-item <2 x i16> subtraction whose result is
; sign-extended to <2 x i64>. Each work-item loads one <2 x i16> element from
; %in0 and from %in1 at index workitem.id.x, subtracts them, and stores the
; widened result.
; Expected lowering, per the assertions below: the gfx900 and gfx1010 runs use
; a single packed v_pk_sub_i16, while the tonga run (VI prefix) uses v_sub_u16
; for the low half and an SDWA v_sub_u16 for the high half. Every run then
; sign-extends each 16-bit half with v_bfe_i32 and produces the upper dword of
; each i64 with v_ashrrev_i32 by 31.
; The loads here are not volatile, so both loads are expected to be issued
; before a single s_waitcnt vmcnt(0).
724define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
725; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
726; GFX9:       ; %bb.0:
727; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
728; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
729; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
730; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
731; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
732; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
733; GFX9-NEXT:    s_mov_b32 s7, 0xf000
734; GFX9-NEXT:    s_mov_b32 s6, -1
735; GFX9-NEXT:    s_waitcnt vmcnt(0)
736; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v2
737; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
738; GFX9-NEXT:    v_bfe_i32 v0, v1, 0, 16
739; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 16
740; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
741; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
742; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
743; GFX9-NEXT:    s_endpgm
744;
745; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
746; VI:       ; %bb.0:
747; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
748; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
749; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
750; VI-NEXT:    s_waitcnt lgkmcnt(0)
751; VI-NEXT:    v_mov_b32_e32 v1, s7
752; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
753; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
754; VI-NEXT:    v_mov_b32_e32 v3, s1
755; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
756; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
757; VI-NEXT:    flat_load_dword v0, v[0:1]
758; VI-NEXT:    flat_load_dword v1, v[2:3]
759; VI-NEXT:    s_mov_b32 s7, 0xf000
760; VI-NEXT:    s_mov_b32 s6, -1
761; VI-NEXT:    s_waitcnt vmcnt(0)
762; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
763; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
764; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
765; VI-NEXT:    v_bfe_i32 v2, v2, 0, 16
766; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
767; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
768; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
769; VI-NEXT:    s_endpgm
770;
771; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64:
772; GFX10:       ; %bb.0:
773; GFX10-NEXT:    s_clause 0x1
774; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
775; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
776; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
777; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
778; GFX10-NEXT:    s_clause 0x1
779; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
780; GFX10-NEXT:    global_load_dword v2, v0, s[2:3]
781; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
782; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
783; GFX10-NEXT:    s_mov_b32 s6, -1
784; GFX10-NEXT:    s_waitcnt vmcnt(0)
785; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
786; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
787; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
788; GFX10-NEXT:    v_bfe_i32 v2, v1, 0, 16
789; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
790; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
791; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
792; GFX10-NEXT:    s_endpgm
  ; Per-work-item index used to address both inputs.
793  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): %gep.out is computed but never used; the store at the end of
  ; this function writes to %out directly, which is what the expected
  ; buffer_store with "off" and offset 0 reflects. Possibly related to the FIXME
  ; above this function - confirm before changing, since the assertions would
  ; need to be regenerated.
794  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
795  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
796  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
797  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
798  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  ; Despite its name, %add holds the result of a sub.
799  %add = sub <2 x i16> %a, %b
  ; Widen each 16-bit difference to 64 bits with sign extension.
800  %ext = sext <2 x i16> %add to <2 x i64>
801  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
802  ret void
803}
804
; Declaration of the intrinsic the kernels in this file call to obtain their
; work-item id in the x dimension.
805declare i32 @llvm.amdgcn.workitem.id.x() #0
806
; Attribute group #0 is referenced by the intrinsic declaration above; group #1
; is referenced by the kernel definitions in this file.
807attributes #0 = { nounwind readnone }
808attributes #1 = { nounwind }
809