1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
6
; Scalar i8 unsigned saturating subtract through @llvm.usub.sat.i8.
; Per the checks below, the GFX6 run masks both operands with 0xff and uses a
; v_max_u32 + v_sub_i32 pair, the GFX8 and GFX9 runs use a single clamped
; v_sub_u16 SDWA reading BYTE_0 of each source, and the GFX10 run masks both
; operands with 0xff before a clamped v_sub_nc_u16.
7define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
8; GFX6-LABEL: v_usubsat_i8:
9; GFX6:       ; %bb.0:
10; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX6-NEXT:    s_movk_i32 s4, 0xff
12; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
13; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
14; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
15; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
16; GFX6-NEXT:    s_setpc_b64 s[30:31]
17;
18; GFX8-LABEL: v_usubsat_i8:
19; GFX8:       ; %bb.0:
20; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
22; GFX8-NEXT:    s_setpc_b64 s[30:31]
23;
24; GFX9-LABEL: v_usubsat_i8:
25; GFX9:       ; %bb.0:
26; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; GFX9-NEXT:    v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
28; GFX9-NEXT:    s_setpc_b64 s[30:31]
29;
30; GFX10-LABEL: v_usubsat_i8:
31; GFX10:       ; %bb.0:
32; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
34; GFX10-NEXT:    s_movk_i32 s4, 0xff
35; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
36; GFX10-NEXT:    v_and_b32_e32 v0, s4, v0
37; GFX10-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
38; GFX10-NEXT:    s_setpc_b64 s[30:31]
39  %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
40  ret i8 %result
41}
42
; Scalar i16 unsigned saturating subtract through @llvm.usub.sat.i16.
; Per the checks below, the GFX6 run masks both operands with 0xffff and uses
; v_max_u32 + v_sub_i32, while the GFX8, GFX9 and GFX10 runs each emit one
; clamped 16-bit subtract with no masking.
43define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
44; GFX6-LABEL: v_usubsat_i16:
45; GFX6:       ; %bb.0:
46; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47; GFX6-NEXT:    s_mov_b32 s4, 0xffff
48; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
49; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
50; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
51; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
52; GFX6-NEXT:    s_setpc_b64 s[30:31]
53;
54; GFX8-LABEL: v_usubsat_i16:
55; GFX8:       ; %bb.0:
56; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
58; GFX8-NEXT:    s_setpc_b64 s[30:31]
59;
60; GFX9-LABEL: v_usubsat_i16:
61; GFX9:       ; %bb.0:
62; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
63; GFX9-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
64; GFX9-NEXT:    s_setpc_b64 s[30:31]
65;
66; GFX10-LABEL: v_usubsat_i16:
67; GFX10:       ; %bb.0:
68; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
69; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
70; GFX10-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
71; GFX10-NEXT:    s_setpc_b64 s[30:31]
72  %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
73  ret i16 %result
74}
75
; Saturating subtract written as a bit trick instead of an intrinsic call:
; (%x ashr 15) & (%x xor 0x8000). For i16 this equals usub.sat(%x, 0x8000).
; Per the checks below, the GFX8, GFX9 and GFX10 runs fold the pattern into a
; clamped 16-bit subtract of 0x8000, while the GFX6 run keeps the
; shift / xor / and sequence.
76define i16 @usubsat_as_bithack_i16(i16 %x) {
77; GFX6-LABEL: usubsat_as_bithack_i16:
78; GFX6:       ; %bb.0:
79; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
81; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
82; GFX6-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
83; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
84; GFX6-NEXT:    s_setpc_b64 s[30:31]
85;
86; GFX8-LABEL: usubsat_as_bithack_i16:
87; GFX8:       ; %bb.0:
88; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
89; GFX8-NEXT:    s_movk_i32 s4, 0x8000
90; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
91; GFX8-NEXT:    s_setpc_b64 s[30:31]
92;
93; GFX9-LABEL: usubsat_as_bithack_i16:
94; GFX9:       ; %bb.0:
95; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
96; GFX9-NEXT:    s_movk_i32 s4, 0x8000
97; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
98; GFX9-NEXT:    s_setpc_b64 s[30:31]
99;
100; GFX10-LABEL: usubsat_as_bithack_i16:
101; GFX10:       ; %bb.0:
102; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
103; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
104; GFX10-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
105; GFX10-NEXT:    s_setpc_b64 s[30:31]
; %signsplat is all ones when bit 15 of %x is set and zero otherwise.
106  %signsplat = ashr i16 %x, 15
; Flipping bit 15 (32768 == 0x8000) equals %x - 0x8000 whenever bit 15 is set.
107  %flipsign = xor i16 %x, 32768
; The mask keeps the difference when %x >= 0x8000 (unsigned) and yields 0 otherwise.
108  %result = and i16 %signsplat, %flipsign
109  ret i16 %result
110}
111
; Variant of the bit-trick saturating subtract that flips the sign bit with an
; add instead of an xor: (%x ashr 15) & (%x + 0x8000). In i16 arithmetic,
; adding 0x8000 toggles bit 15 exactly like the xor form.
; Per the checks below, the GFX8, GFX9 and GFX10 runs emit a clamped 16-bit
; subtract of 0x8000, while the GFX6 run keeps a shift / add / and sequence.
112define i16 @usubsat_as_bithack2_i16(i16 %x) {
113; GFX6-LABEL: usubsat_as_bithack2_i16:
114; GFX6:       ; %bb.0:
115; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
117; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
118; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xffff8000, v0
119; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
120; GFX6-NEXT:    s_setpc_b64 s[30:31]
121;
122; GFX8-LABEL: usubsat_as_bithack2_i16:
123; GFX8:       ; %bb.0:
124; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125; GFX8-NEXT:    s_movk_i32 s4, 0x8000
126; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
127; GFX8-NEXT:    s_setpc_b64 s[30:31]
128;
129; GFX9-LABEL: usubsat_as_bithack2_i16:
130; GFX9:       ; %bb.0:
131; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
132; GFX9-NEXT:    s_movk_i32 s4, 0x8000
133; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
134; GFX9-NEXT:    s_setpc_b64 s[30:31]
135;
136; GFX10-LABEL: usubsat_as_bithack2_i16:
137; GFX10:       ; %bb.0:
138; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
139; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
140; GFX10-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
141; GFX10-NEXT:    s_setpc_b64 s[30:31]
; %signsplat is all ones when bit 15 of %x is set and zero otherwise.
142  %signsplat = ashr i16 %x, 15
; Wrapping add of 0x8000; the only difference from usubsat_as_bithack_i16.
143  %flipsign = add i16 %x, 32768
144  %result = and i16 %signsplat, %flipsign
145  ret i16 %result
146}
147
; Same add-based bit trick as usubsat_as_bithack2_i16, but with the operands of
; the final 'and' swapped (%flipsign first, %signsplat second).
; Per the checks below, the GFX8, GFX9 and GFX10 runs still emit a clamped
; 16-bit subtract of 0x8000; the GFX6 run differs from the non-commuted test
; only in the operand order of its v_and_b32.
148define i16 @usubsat_as_bithack_commute_i16(i16 %x) {
149; GFX6-LABEL: usubsat_as_bithack_commute_i16:
150; GFX6:       ; %bb.0:
151; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
152; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
153; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
154; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xffff8000, v0
155; GFX6-NEXT:    v_and_b32_e32 v0, v0, v1
156; GFX6-NEXT:    s_setpc_b64 s[30:31]
157;
158; GFX8-LABEL: usubsat_as_bithack_commute_i16:
159; GFX8:       ; %bb.0:
160; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161; GFX8-NEXT:    s_movk_i32 s4, 0x8000
162; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
163; GFX8-NEXT:    s_setpc_b64 s[30:31]
164;
165; GFX9-LABEL: usubsat_as_bithack_commute_i16:
166; GFX9:       ; %bb.0:
167; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168; GFX9-NEXT:    s_movk_i32 s4, 0x8000
169; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
170; GFX9-NEXT:    s_setpc_b64 s[30:31]
171;
172; GFX10-LABEL: usubsat_as_bithack_commute_i16:
173; GFX10:       ; %bb.0:
174; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
175; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
176; GFX10-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
177; GFX10-NEXT:    s_setpc_b64 s[30:31]
178  %signsplat = ashr i16 %x, 15
179  %flipsign = add i16 %x, 32768
; Commuted operand order relative to usubsat_as_bithack2_i16.
180  %result = and i16 %flipsign, %signsplat
181  ret i16 %result
182}
183
; Scalar i32 unsigned saturating subtract through @llvm.usub.sat.i32.
; Per the checks below, the GFX6 run uses v_max_u32 followed by v_sub_i32,
; while the GFX8, GFX9 and GFX10 runs each emit one clamped 32-bit subtract
; (the GFX8 form also names an s[4:5] operand).
184define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
185; GFX6-LABEL: v_usubsat_i32:
186; GFX6:       ; %bb.0:
187; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
188; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
189; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
190; GFX6-NEXT:    s_setpc_b64 s[30:31]
191;
192; GFX8-LABEL: v_usubsat_i32:
193; GFX8:       ; %bb.0:
194; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
195; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
196; GFX8-NEXT:    s_setpc_b64 s[30:31]
197;
198; GFX9-LABEL: v_usubsat_i32:
199; GFX9:       ; %bb.0:
200; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
201; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v1 clamp
202; GFX9-NEXT:    s_setpc_b64 s[30:31]
203;
204; GFX10-LABEL: v_usubsat_i32:
205; GFX10:       ; %bb.0:
206; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
208; GFX10-NEXT:    v_sub_nc_u32_e64 v0, v0, v1 clamp
209; GFX10-NEXT:    s_setpc_b64 s[30:31]
210  %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
211  ret i32 %result
212}
213
; <2 x i16> unsigned saturating subtract through @llvm.usub.sat.v2i16.
; Per the checks below, the GFX6 run handles each element separately
; (mask, v_max_u32, v_sub_i32) and then shifts and ORs the halves together.
; The GFX8 run uses an SDWA clamped subtract for WORD_1 plus a plain clamped
; subtract, combined with v_or_b32. The GFX9 and GFX10 runs emit a single
; clamped v_pk_sub_u16.
214define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
215; GFX6-LABEL: v_usubsat_v2i16:
216; GFX6:       ; %bb.0:
217; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218; GFX6-NEXT:    s_mov_b32 s4, 0xffff
219; GFX6-NEXT:    v_and_b32_e32 v4, s4, v3
220; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
221; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
222; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
223; GFX6-NEXT:    v_max_u32_e32 v1, v1, v4
224; GFX6-NEXT:    v_max_u32_e32 v0, v0, v2
225; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
226; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
227; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
228; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
229; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
230; GFX6-NEXT:    s_setpc_b64 s[30:31]
231;
232; GFX8-LABEL: v_usubsat_v2i16:
233; GFX8:       ; %bb.0:
234; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235; GFX8-NEXT:    v_sub_u16_sdwa v2, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
236; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
237; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
238; GFX8-NEXT:    s_setpc_b64 s[30:31]
239;
240; GFX9-LABEL: v_usubsat_v2i16:
241; GFX9:       ; %bb.0:
242; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
243; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
244; GFX9-NEXT:    s_setpc_b64 s[30:31]
245;
246; GFX10-LABEL: v_usubsat_v2i16:
247; GFX10:       ; %bb.0:
248; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
250; GFX10-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
251; GFX10-NEXT:    s_setpc_b64 s[30:31]
252  %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
253  ret <2 x i16> %result
254}
255
; <3 x i16> (odd element count) unsigned saturating subtract through
; @llvm.usub.sat.v3i16.
; Per the checks below, the GFX6 run expands every element to mask /
; v_max_u32 / v_sub_i32 and repacks with shift, or and v_alignbit. The GFX8
; run uses one SDWA subtract and two plain clamped subtracts. The GFX9 and
; GFX10 runs emit two clamped v_pk_sub_u16 instructions, in opposite register
; order from each other.
256define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
257; GFX6-LABEL: v_usubsat_v3i16:
258; GFX6:       ; %bb.0:
259; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
260; GFX6-NEXT:    s_mov_b32 s4, 0xffff
261; GFX6-NEXT:    v_and_b32_e32 v6, s4, v4
262; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
263; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
264; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
265; GFX6-NEXT:    v_max_u32_e32 v1, v1, v6
266; GFX6-NEXT:    v_max_u32_e32 v0, v0, v3
267; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
268; GFX6-NEXT:    v_and_b32_e32 v5, s4, v5
269; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
270; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
271; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
272; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
273; GFX6-NEXT:    v_max_u32_e32 v1, v2, v5
274; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v1, v5
275; GFX6-NEXT:    v_alignbit_b32 v1, v2, v0, 16
276; GFX6-NEXT:    s_setpc_b64 s[30:31]
277;
278; GFX8-LABEL: v_usubsat_v3i16:
279; GFX8:       ; %bb.0:
280; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
281; GFX8-NEXT:    v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
282; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v2 clamp
283; GFX8-NEXT:    v_sub_u16_e64 v1, v1, v3 clamp
284; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
285; GFX8-NEXT:    s_setpc_b64 s[30:31]
286;
287; GFX9-LABEL: v_usubsat_v3i16:
288; GFX9:       ; %bb.0:
289; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
290; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
291; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
292; GFX9-NEXT:    s_setpc_b64 s[30:31]
293;
294; GFX10-LABEL: v_usubsat_v3i16:
295; GFX10:       ; %bb.0:
296; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
297; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
298; GFX10-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
299; GFX10-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
300; GFX10-NEXT:    s_setpc_b64 s[30:31]
301  %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
302  ret <3 x i16> %result
303}
304
; <4 x i16> unsigned saturating subtract through @llvm.usub.sat.v4i16. Unlike
; the other tests in this file, the result is bitcast to <2 x float> before
; being returned.
; Per the checks below, the GFX6 run expands each element (mask, v_max_u32,
; v_sub_i32) and repacks pairs with shift + or. The GFX8 run uses an SDWA plus
; a plain clamped subtract per 32-bit register. The GFX9 and GFX10 runs emit
; two clamped v_pk_sub_u16 instructions.
305define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
306; GFX6-LABEL: v_usubsat_v4i16:
307; GFX6:       ; %bb.0:
308; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
309; GFX6-NEXT:    s_mov_b32 s4, 0xffff
310; GFX6-NEXT:    v_and_b32_e32 v9, s4, v5
311; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
312; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
313; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
314; GFX6-NEXT:    v_max_u32_e32 v1, v1, v9
315; GFX6-NEXT:    v_max_u32_e32 v0, v0, v4
316; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
317; GFX6-NEXT:    v_and_b32_e32 v8, s4, v7
318; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
319; GFX6-NEXT:    v_and_b32_e32 v6, s4, v6
320; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
321; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
322; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
323; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
324; GFX6-NEXT:    v_max_u32_e32 v1, v2, v6
325; GFX6-NEXT:    v_max_u32_e32 v2, v3, v8
326; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
327; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
328; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
329; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
330; GFX6-NEXT:    s_setpc_b64 s[30:31]
331;
332; GFX8-LABEL: v_usubsat_v4i16:
333; GFX8:       ; %bb.0:
334; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
335; GFX8-NEXT:    v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
336; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v2 clamp
337; GFX8-NEXT:    v_sub_u16_sdwa v2, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
338; GFX8-NEXT:    v_sub_u16_e64 v1, v1, v3 clamp
339; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
340; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
341; GFX8-NEXT:    s_setpc_b64 s[30:31]
342;
343; GFX9-LABEL: v_usubsat_v4i16:
344; GFX9:       ; %bb.0:
345; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
346; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
347; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
348; GFX9-NEXT:    s_setpc_b64 s[30:31]
349;
350; GFX10-LABEL: v_usubsat_v4i16:
351; GFX10:       ; %bb.0:
352; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
353; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
354; GFX10-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
355; GFX10-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
356; GFX10-NEXT:    s_setpc_b64 s[30:31]
357  %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
; Reinterpret the 64-bit result as <2 x float> to match the declared return type.
358  %cast = bitcast <4 x i16> %result to <2 x float>
359  ret <2 x float> %cast
360}
361
; <2 x i32> unsigned saturating subtract through @llvm.usub.sat.v2i32.
; Per the checks below, every run handles the two elements independently. The
; GFX6 run uses a v_max_u32 / v_sub_i32 pair per element; the GFX8, GFX9 and
; GFX10 runs use one clamped 32-bit subtract per element.
362define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
363; GFX6-LABEL: v_usubsat_v2i32:
364; GFX6:       ; %bb.0:
365; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
366; GFX6-NEXT:    v_max_u32_e32 v0, v0, v2
367; GFX6-NEXT:    v_max_u32_e32 v1, v1, v3
368; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
369; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
370; GFX6-NEXT:    s_setpc_b64 s[30:31]
371;
372; GFX8-LABEL: v_usubsat_v2i32:
373; GFX8:       ; %bb.0:
374; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
375; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v2 clamp
376; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v3 clamp
377; GFX8-NEXT:    s_setpc_b64 s[30:31]
378;
379; GFX9-LABEL: v_usubsat_v2i32:
380; GFX9:       ; %bb.0:
381; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
382; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v2 clamp
383; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v3 clamp
384; GFX9-NEXT:    s_setpc_b64 s[30:31]
385;
386; GFX10-LABEL: v_usubsat_v2i32:
387; GFX10:       ; %bb.0:
388; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
389; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
390; GFX10-NEXT:    v_sub_nc_u32_e64 v0, v0, v2 clamp
391; GFX10-NEXT:    v_sub_nc_u32_e64 v1, v1, v3 clamp
392; GFX10-NEXT:    s_setpc_b64 s[30:31]
393  %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
394  ret <2 x i32> %result
395}
396
; <3 x i32> unsigned saturating subtract through @llvm.usub.sat.v3i32.
; Per the checks below, each of the three elements is handled on its own. The
; GFX6 run uses v_max_u32 + v_sub_i32 per element; the GFX8, GFX9 and GFX10
; runs use one clamped 32-bit subtract per element.
397define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
398; GFX6-LABEL: v_usubsat_v3i32:
399; GFX6:       ; %bb.0:
400; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
401; GFX6-NEXT:    v_max_u32_e32 v0, v0, v3
402; GFX6-NEXT:    v_max_u32_e32 v1, v1, v4
403; GFX6-NEXT:    v_max_u32_e32 v2, v2, v5
404; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
405; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
406; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
407; GFX6-NEXT:    s_setpc_b64 s[30:31]
408;
409; GFX8-LABEL: v_usubsat_v3i32:
410; GFX8:       ; %bb.0:
411; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
412; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v3 clamp
413; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v4 clamp
414; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v5 clamp
415; GFX8-NEXT:    s_setpc_b64 s[30:31]
416;
417; GFX9-LABEL: v_usubsat_v3i32:
418; GFX9:       ; %bb.0:
419; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
420; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v3 clamp
421; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v4 clamp
422; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v5 clamp
423; GFX9-NEXT:    s_setpc_b64 s[30:31]
424;
425; GFX10-LABEL: v_usubsat_v3i32:
426; GFX10:       ; %bb.0:
427; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
428; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
429; GFX10-NEXT:    v_sub_nc_u32_e64 v0, v0, v3 clamp
430; GFX10-NEXT:    v_sub_nc_u32_e64 v1, v1, v4 clamp
431; GFX10-NEXT:    v_sub_nc_u32_e64 v2, v2, v5 clamp
432; GFX10-NEXT:    s_setpc_b64 s[30:31]
433  %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
434  ret <3 x i32> %result
435}
436
; <4 x i32> unsigned saturating subtract through @llvm.usub.sat.v4i32.
; Per the checks below, each of the four elements is handled on its own. The
; GFX6 run uses v_max_u32 + v_sub_i32 per element; the GFX8, GFX9 and GFX10
; runs use one clamped 32-bit subtract per element.
437define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
438; GFX6-LABEL: v_usubsat_v4i32:
439; GFX6:       ; %bb.0:
440; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
441; GFX6-NEXT:    v_max_u32_e32 v0, v0, v4
442; GFX6-NEXT:    v_max_u32_e32 v1, v1, v5
443; GFX6-NEXT:    v_max_u32_e32 v2, v2, v6
444; GFX6-NEXT:    v_max_u32_e32 v3, v3, v7
445; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
446; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
447; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
448; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
449; GFX6-NEXT:    s_setpc_b64 s[30:31]
450;
451; GFX8-LABEL: v_usubsat_v4i32:
452; GFX8:       ; %bb.0:
453; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
454; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v4 clamp
455; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v5 clamp
456; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v6 clamp
457; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v7 clamp
458; GFX8-NEXT:    s_setpc_b64 s[30:31]
459;
460; GFX9-LABEL: v_usubsat_v4i32:
461; GFX9:       ; %bb.0:
462; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
463; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v4 clamp
464; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v5 clamp
465; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v6 clamp
466; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v7 clamp
467; GFX9-NEXT:    s_setpc_b64 s[30:31]
468;
469; GFX10-LABEL: v_usubsat_v4i32:
470; GFX10:       ; %bb.0:
471; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
472; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
473; GFX10-NEXT:    v_sub_nc_u32_e64 v0, v0, v4 clamp
474; GFX10-NEXT:    v_sub_nc_u32_e64 v1, v1, v5 clamp
475; GFX10-NEXT:    v_sub_nc_u32_e64 v2, v2, v6 clamp
476; GFX10-NEXT:    v_sub_nc_u32_e64 v3, v3, v7 clamp
477; GFX10-NEXT:    s_setpc_b64 s[30:31]
478  %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
479  ret <4 x i32> %result
480}
481
; <8 x i32> unsigned saturating subtract through @llvm.usub.sat.v8i32.
; Per the checks below, all sixteen operand elements arrive in v0 through v15
; and each of the eight lanes is handled on its own. The GFX6 run uses
; v_max_u32 + v_sub_i32 per lane; the GFX8, GFX9 and GFX10 runs use one clamped
; 32-bit subtract per lane.
482define <8 x i32> @v_usubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
483; GFX6-LABEL: v_usubsat_v8i32:
484; GFX6:       ; %bb.0:
485; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
486; GFX6-NEXT:    v_max_u32_e32 v0, v0, v8
487; GFX6-NEXT:    v_max_u32_e32 v1, v1, v9
488; GFX6-NEXT:    v_max_u32_e32 v2, v2, v10
489; GFX6-NEXT:    v_max_u32_e32 v3, v3, v11
490; GFX6-NEXT:    v_max_u32_e32 v4, v4, v12
491; GFX6-NEXT:    v_max_u32_e32 v5, v5, v13
492; GFX6-NEXT:    v_max_u32_e32 v6, v6, v14
493; GFX6-NEXT:    v_max_u32_e32 v7, v7, v15
494; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
495; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
496; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
497; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v11
498; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v12
499; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v13
500; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v14
501; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v15
502; GFX6-NEXT:    s_setpc_b64 s[30:31]
503;
504; GFX8-LABEL: v_usubsat_v8i32:
505; GFX8:       ; %bb.0:
506; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
507; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v8 clamp
508; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v9 clamp
509; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v10 clamp
510; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v11 clamp
511; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v4, v12 clamp
512; GFX8-NEXT:    v_sub_u32_e64 v5, s[4:5], v5, v13 clamp
513; GFX8-NEXT:    v_sub_u32_e64 v6, s[4:5], v6, v14 clamp
514; GFX8-NEXT:    v_sub_u32_e64 v7, s[4:5], v7, v15 clamp
515; GFX8-NEXT:    s_setpc_b64 s[30:31]
516;
517; GFX9-LABEL: v_usubsat_v8i32:
518; GFX9:       ; %bb.0:
519; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
520; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v8 clamp
521; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v9 clamp
522; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v10 clamp
523; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v11 clamp
524; GFX9-NEXT:    v_sub_u32_e64 v4, v4, v12 clamp
525; GFX9-NEXT:    v_sub_u32_e64 v5, v5, v13 clamp
526; GFX9-NEXT:    v_sub_u32_e64 v6, v6, v14 clamp
527; GFX9-NEXT:    v_sub_u32_e64 v7, v7, v15 clamp
528; GFX9-NEXT:    s_setpc_b64 s[30:31]
529;
530; GFX10-LABEL: v_usubsat_v8i32:
531; GFX10:       ; %bb.0:
532; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
533; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
534; GFX10-NEXT:    v_sub_nc_u32_e64 v0, v0, v8 clamp
535; GFX10-NEXT:    v_sub_nc_u32_e64 v1, v1, v9 clamp
536; GFX10-NEXT:    v_sub_nc_u32_e64 v2, v2, v10 clamp
537; GFX10-NEXT:    v_sub_nc_u32_e64 v3, v3, v11 clamp
538; GFX10-NEXT:    v_sub_nc_u32_e64 v4, v4, v12 clamp
539; GFX10-NEXT:    v_sub_nc_u32_e64 v5, v5, v13 clamp
540; GFX10-NEXT:    v_sub_nc_u32_e64 v6, v6, v14 clamp
541; GFX10-NEXT:    v_sub_nc_u32_e64 v7, v7, v15 clamp
542; GFX10-NEXT:    s_setpc_b64 s[30:31]
543  %result = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs)
544  ret <8 x i32> %result
545}
546
; <16 x i32> unsigned saturating subtract through @llvm.usub.sat.v16i32.
; Per the checks below, lanes 0 through 14 take their right-hand operand from
; v16 through v30, while the operand for lane 15 is fetched with a
; buffer_load_dword addressed by s32, and an s_waitcnt vmcnt(0) precedes the
; lane 15 computation.
; NOTE(review): the loaded dword is presumably the last %rhs element being
; passed in memory because the arguments exceed the available argument
; VGPRs; confirm against the AMDGPU calling convention.
; The GFX6, GFX8 and GFX9 runs reuse v16 as the load destination after lane 0
; has consumed it; the GFX10 run issues the load first, into v31.
547define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
548; GFX6-LABEL: v_usubsat_v16i32:
549; GFX6:       ; %bb.0:
550; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
551; GFX6-NEXT:    v_max_u32_e32 v0, v0, v16
552; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v16
553; GFX6-NEXT:    buffer_load_dword v16, off, s[0:3], s32
554; GFX6-NEXT:    v_max_u32_e32 v1, v1, v17
555; GFX6-NEXT:    v_max_u32_e32 v2, v2, v18
556; GFX6-NEXT:    v_max_u32_e32 v3, v3, v19
557; GFX6-NEXT:    v_max_u32_e32 v4, v4, v20
558; GFX6-NEXT:    v_max_u32_e32 v5, v5, v21
559; GFX6-NEXT:    v_max_u32_e32 v6, v6, v22
560; GFX6-NEXT:    v_max_u32_e32 v7, v7, v23
561; GFX6-NEXT:    v_max_u32_e32 v8, v8, v24
562; GFX6-NEXT:    v_max_u32_e32 v9, v9, v25
563; GFX6-NEXT:    v_max_u32_e32 v10, v10, v26
564; GFX6-NEXT:    v_max_u32_e32 v11, v11, v27
565; GFX6-NEXT:    v_max_u32_e32 v12, v12, v28
566; GFX6-NEXT:    v_max_u32_e32 v13, v13, v29
567; GFX6-NEXT:    v_max_u32_e32 v14, v14, v30
568; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v17
569; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v18
570; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v19
571; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v20
572; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v21
573; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v22
574; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v23
575; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v24
576; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v25
577; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v26
578; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, v11, v27
579; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v28
580; GFX6-NEXT:    v_sub_i32_e32 v13, vcc, v13, v29
581; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v30
582; GFX6-NEXT:    s_waitcnt vmcnt(0)
583; GFX6-NEXT:    v_max_u32_e32 v15, v15, v16
584; GFX6-NEXT:    v_sub_i32_e32 v15, vcc, v15, v16
585; GFX6-NEXT:    s_setpc_b64 s[30:31]
586;
587; GFX8-LABEL: v_usubsat_v16i32:
588; GFX8:       ; %bb.0:
589; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
590; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v16 clamp
591; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
592; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v17 clamp
593; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v18 clamp
594; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v19 clamp
595; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v4, v20 clamp
596; GFX8-NEXT:    v_sub_u32_e64 v5, s[4:5], v5, v21 clamp
597; GFX8-NEXT:    v_sub_u32_e64 v6, s[4:5], v6, v22 clamp
598; GFX8-NEXT:    v_sub_u32_e64 v7, s[4:5], v7, v23 clamp
599; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v8, v24 clamp
600; GFX8-NEXT:    v_sub_u32_e64 v9, s[4:5], v9, v25 clamp
601; GFX8-NEXT:    v_sub_u32_e64 v10, s[4:5], v10, v26 clamp
602; GFX8-NEXT:    v_sub_u32_e64 v11, s[4:5], v11, v27 clamp
603; GFX8-NEXT:    v_sub_u32_e64 v12, s[4:5], v12, v28 clamp
604; GFX8-NEXT:    v_sub_u32_e64 v13, s[4:5], v13, v29 clamp
605; GFX8-NEXT:    v_sub_u32_e64 v14, s[4:5], v14, v30 clamp
606; GFX8-NEXT:    s_waitcnt vmcnt(0)
607; GFX8-NEXT:    v_sub_u32_e64 v15, s[4:5], v15, v16 clamp
608; GFX8-NEXT:    s_setpc_b64 s[30:31]
609;
610; GFX9-LABEL: v_usubsat_v16i32:
611; GFX9:       ; %bb.0:
612; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
613; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v16 clamp
614; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
615; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v17 clamp
616; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v18 clamp
617; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v19 clamp
618; GFX9-NEXT:    v_sub_u32_e64 v4, v4, v20 clamp
619; GFX9-NEXT:    v_sub_u32_e64 v5, v5, v21 clamp
620; GFX9-NEXT:    v_sub_u32_e64 v6, v6, v22 clamp
621; GFX9-NEXT:    v_sub_u32_e64 v7, v7, v23 clamp
622; GFX9-NEXT:    v_sub_u32_e64 v8, v8, v24 clamp
623; GFX9-NEXT:    v_sub_u32_e64 v9, v9, v25 clamp
624; GFX9-NEXT:    v_sub_u32_e64 v10, v10, v26 clamp
625; GFX9-NEXT:    v_sub_u32_e64 v11, v11, v27 clamp
626; GFX9-NEXT:    v_sub_u32_e64 v12, v12, v28 clamp
627; GFX9-NEXT:    v_sub_u32_e64 v13, v13, v29 clamp
628; GFX9-NEXT:    v_sub_u32_e64 v14, v14, v30 clamp
629; GFX9-NEXT:    s_waitcnt vmcnt(0)
630; GFX9-NEXT:    v_sub_u32_e64 v15, v15, v16 clamp
631; GFX9-NEXT:    s_setpc_b64 s[30:31]
632;
633; GFX10-LABEL: v_usubsat_v16i32:
634; GFX10:       ; %bb.0:
635; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
636; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
637; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
638; GFX10-NEXT:    v_sub_nc_u32_e64 v0, v0, v16 clamp
639; GFX10-NEXT:    v_sub_nc_u32_e64 v1, v1, v17 clamp
640; GFX10-NEXT:    v_sub_nc_u32_e64 v2, v2, v18 clamp
641; GFX10-NEXT:    v_sub_nc_u32_e64 v3, v3, v19 clamp
642; GFX10-NEXT:    v_sub_nc_u32_e64 v4, v4, v20 clamp
643; GFX10-NEXT:    v_sub_nc_u32_e64 v5, v5, v21 clamp
644; GFX10-NEXT:    v_sub_nc_u32_e64 v6, v6, v22 clamp
645; GFX10-NEXT:    v_sub_nc_u32_e64 v7, v7, v23 clamp
646; GFX10-NEXT:    v_sub_nc_u32_e64 v8, v8, v24 clamp
647; GFX10-NEXT:    v_sub_nc_u32_e64 v9, v9, v25 clamp
648; GFX10-NEXT:    v_sub_nc_u32_e64 v10, v10, v26 clamp
649; GFX10-NEXT:    v_sub_nc_u32_e64 v11, v11, v27 clamp
650; GFX10-NEXT:    v_sub_nc_u32_e64 v12, v12, v28 clamp
651; GFX10-NEXT:    v_sub_nc_u32_e64 v13, v13, v29 clamp
652; GFX10-NEXT:    v_sub_nc_u32_e64 v14, v14, v30 clamp
653; GFX10-NEXT:    s_waitcnt vmcnt(0)
654; GFX10-NEXT:    v_sub_nc_u32_e64 v15, v15, v31 clamp
655; GFX10-NEXT:    s_setpc_b64 s[30:31]
656  %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
657  ret <16 x i32> %result
658}
659
660
; Scalar i64 unsigned saturating subtract through @llvm.usub.sat.i64.
; Per the checks below, no run uses a clamped instruction here. Every target
; emits a 32-bit subtract followed by a subtract-with-borrow, compares the
; 64-bit difference against the original left-hand operand with v_cmp_gt_u64,
; and uses two v_cndmask_b32 to select 0 for each half when the difference is
; greater (the subtraction wrapped). The GFX10 run uses vcc_lo where the
; other runs use vcc.
661define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
662; GFX6-LABEL: v_usubsat_i64:
663; GFX6:       ; %bb.0:
664; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
665; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v0, v2
666; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
667; GFX6-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
668; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
669; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
670; GFX6-NEXT:    s_setpc_b64 s[30:31]
671;
672; GFX8-LABEL: v_usubsat_i64:
673; GFX8:       ; %bb.0:
674; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
675; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v0, v2
676; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
677; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
678; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
679; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
680; GFX8-NEXT:    s_setpc_b64 s[30:31]
681;
682; GFX9-LABEL: v_usubsat_i64:
683; GFX9:       ; %bb.0:
684; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
685; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
686; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
687; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
688; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
689; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
690; GFX9-NEXT:    s_setpc_b64 s[30:31]
691;
692; GFX10-LABEL: v_usubsat_i64:
693; GFX10:       ; %bb.0:
694; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
695; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
696; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v2
697; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
698; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
699; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
700; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
701; GFX10-NEXT:    s_setpc_b64 s[30:31]
702  %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
703  ret i64 %result
704}
705
; Declarations of the llvm.usub.sat.* intrinsic overloads called by the test
; functions in this file, one per scalar or vector type. All of them share
; attribute group #0.
706declare i8 @llvm.usub.sat.i8(i8, i8) #0
707declare i16 @llvm.usub.sat.i16(i16, i16) #0
708declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) #0
709declare <3 x i16> @llvm.usub.sat.v3i16(<3 x i16>, <3 x i16>) #0
710declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) #0
711declare i32 @llvm.usub.sat.i32(i32, i32) #0
712declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) #0
713declare <3 x i32> @llvm.usub.sat.v3i32(<3 x i32>, <3 x i32>) #0
714declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) #0
715declare <8 x i32> @llvm.usub.sat.v8i32(<8 x i32>, <8 x i32>) #0
716declare <16 x i32> @llvm.usub.sat.v16i32(<16 x i32>, <16 x i32>) #0
717declare i64 @llvm.usub.sat.i64(i64, i64) #0
718
; Attribute group applied to every intrinsic declaration in this file.
719attributes #0 = { nounwind readnone speculatable willreturn }
720