; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s
; i8 unsigned saturating subtract through the llvm.usub.sat.i8 intrinsic.
; Per the checks, GFX6 masks both operands to 8 bits and computes
; max(lhs, rhs) - rhs, GFX8 and GFX9 use one clamped SDWA v_sub_u16 that reads
; BYTE_0 of each source, and GFX10PLUS masks both operands before a clamped
; v_sub_nc_u16.
define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
; GFX6-LABEL: v_usubsat_i8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_i8:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
  ret i8 %result
}
; i16 unsigned saturating subtract through the llvm.usub.sat.i16 intrinsic.
; Per the checks, GFX6 masks both operands to 16 bits and computes
; max(lhs, rhs) - rhs, while every later target needs only one clamped 16-bit
; subtract (v_sub_u16 on GFX8 and GFX9, v_sub_nc_u16 on GFX10PLUS).
define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
; GFX6-LABEL: v_usubsat_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
  ret i16 %result
}
; Open-coded saturating subtract of the i16 sign bit: the sign of %x is
; splatted with ashr 15 and ANDed with %x xor 32768, with no intrinsic call.
; Per the checks, GFX8 and later recognise the pattern and emit one clamped
; 16-bit subtract of 0x8000, whereas GFX6 keeps the bfe/ashr/xor/and sequence.
define i16 @usubsat_as_bithack_i16(i16 %x) {
; GFX6-LABEL: usubsat_as_bithack_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
; GFX6-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: usubsat_as_bithack_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_movk_i32 s4, 0x8000
; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: usubsat_as_bithack_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: usubsat_as_bithack_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %signsplat = ashr i16 %x, 15
  %flipsign = xor i16 %x, 32768
  %result = and i16 %signsplat, %flipsign
  ret i16 %result
}
; Variant of the sign-bit bithack that flips the sign bit with
; add i16 %x, 32768 instead of xor. Per the checks, GFX8 and later still fold
; it to one clamped 16-bit subtract of 0x8000; GFX6 keeps bfe/ashr/add/and.
define i16 @usubsat_as_bithack2_i16(i16 %x) {
; GFX6-LABEL: usubsat_as_bithack2_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xffff8000, v0
; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: usubsat_as_bithack2_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_movk_i32 s4, 0x8000
; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: usubsat_as_bithack2_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: usubsat_as_bithack2_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %signsplat = ashr i16 %x, 15
  %flipsign = add i16 %x, 32768
  %result = and i16 %signsplat, %flipsign
  ret i16 %result
}
; Same add-based sign-bit bithack, but with the operands of the final AND
; swapped (%flipsign first). Per the checks, GFX8 and later produce the same
; single clamped subtract of 0x8000; on GFX6 only the v_and operand order
; changes.
define i16 @usubsat_as_bithack_commute_i16(i16 %x) {
; GFX6-LABEL: usubsat_as_bithack_commute_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xffff8000, v0
; GFX6-NEXT:    v_and_b32_e32 v0, v0, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: usubsat_as_bithack_commute_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_movk_i32 s4, 0x8000
; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: usubsat_as_bithack_commute_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: usubsat_as_bithack_commute_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %signsplat = ashr i16 %x, 15
  %flipsign = add i16 %x, 32768
  %result = and i16 %flipsign, %signsplat
  ret i16 %result
}
; i32 unsigned saturating subtract through the llvm.usub.sat.i32 intrinsic.
; Per the checks, GFX6 computes max(lhs, rhs) - rhs; GFX8 uses a clamped
; v_sub_u32_e64 that also names s[4:5] as a second destination; GFX9 uses a
; clamped v_sub_u32_e64 without it; GFX10PLUS uses a clamped v_sub_nc_u32_e64.
define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
; GFX6-LABEL: v_usubsat_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
  ret i32 %result
}
; <2 x i16> unsigned saturating subtract through llvm.usub.sat.v2i16.
; Per the checks, GFX6 handles each element separately in 32-bit registers
; (mask, max, subtract) and then packs and re-splits the halves with
; shift/or/shift; GFX8 pairs an SDWA subtract on WORD_1 with a plain clamped
; subtract for the low half and ORs them; GFX9 and GFX10PLUS need a single
; clamped v_pk_sub_u16.
define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: v_usubsat_v2i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v3
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v4
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v2
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u16_sdwa v2, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v2i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
  ret <2 x i16> %result
}
; <3 x i16> unsigned saturating subtract through llvm.usub.sat.v3i16 (an odd
; element count). Per the checks, GFX6 processes the three elements
; individually and repacks them with shift/or/alignbit; GFX8 uses one SDWA
; subtract plus two plain clamped subtracts; GFX9 and GFX10PLUS use two
; clamped v_pk_sub_u16 instructions, one per 32-bit register.
define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-LABEL: v_usubsat_v3i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v4
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v6
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v3
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_max_u32_e32 v1, v2, v5
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v1, v5
; GFX6-NEXT:    v_alignbit_b32 v1, v2, v0, 16
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v3i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v2 clamp
; GFX8-NEXT:    v_sub_u16_e64 v1, v1, v3 clamp
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v3i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v3i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
  ret <3 x i16> %result
}
; <4 x i16> unsigned saturating subtract through llvm.usub.sat.v4i16; the
; result is bitcast to <2 x float> before being returned. Per the checks,
; GFX6 processes the four elements individually and packs them pairwise with
; shift/or; GFX8 uses an SDWA + plain clamped subtract pair per register;
; GFX9 and GFX10PLUS use two clamped v_pk_sub_u16 instructions.
define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX6-LABEL: v_usubsat_v4i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_and_b32_e32 v9, 0xffff, v5
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v9
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v4
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v7
; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_max_u32_e32 v1, v2, v6
; GFX6-NEXT:    v_max_u32_e32 v2, v3, v8
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v4i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v2 clamp
; GFX8-NEXT:    v_sub_u16_sdwa v2, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_u16_e64 v1, v1, v3 clamp
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v4i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  %cast = bitcast <4 x i16> %result to <2 x float>
  ret <2 x float> %cast
}
; <2 x i32> unsigned saturating subtract through llvm.usub.sat.v2i32.
; Per the checks the vector is handled one element per instruction: GFX6
; emits two max/sub pairs, and GFX8 and later emit two clamped 32-bit
; subtracts.
define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v2i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v2
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v3
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v2 clamp
; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v3 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v2 clamp
; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v3 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v2i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v2 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v3 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i32> %result
}
; <3 x i32> unsigned saturating subtract through llvm.usub.sat.v3i32.
; Per the checks each of the three elements gets its own max/sub pair on GFX6
; and its own clamped 32-bit subtract on GFX8 and later.
define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v3i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v3
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v4
; GFX6-NEXT:    v_max_u32_e32 v2, v2, v5
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v3i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v3 clamp
; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v4 clamp
; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v5 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v3i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v3 clamp
; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v4 clamp
; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v5 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v3i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v3 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v4 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v5 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
  ret <3 x i32> %result
}
; <4 x i32> unsigned saturating subtract through llvm.usub.sat.v4i32.
; Per the checks each of the four elements gets its own max/sub pair on GFX6
; and its own clamped 32-bit subtract on GFX8 and later.
define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v4i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v4
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v5
; GFX6-NEXT:    v_max_u32_e32 v2, v2, v6
; GFX6-NEXT:    v_max_u32_e32 v3, v3, v7
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v4i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v4 clamp
; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v5 clamp
; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v6 clamp
; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v7 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v4 clamp
; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v5 clamp
; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v6 clamp
; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v7 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v4i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v4 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v5 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v6 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v3, v3, v7 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
  ret <4 x i32> %result
}
; <8 x i32> unsigned saturating subtract through llvm.usub.sat.v8i32.
; Per the checks all sixteen operand elements arrive in v0..v15, and each of
; the eight result elements gets its own max/sub pair on GFX6 or clamped
; 32-bit subtract on GFX8 and later.
define <8 x i32> @v_usubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v8i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v8
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v9
; GFX6-NEXT:    v_max_u32_e32 v2, v2, v10
; GFX6-NEXT:    v_max_u32_e32 v3, v3, v11
; GFX6-NEXT:    v_max_u32_e32 v4, v4, v12
; GFX6-NEXT:    v_max_u32_e32 v5, v5, v13
; GFX6-NEXT:    v_max_u32_e32 v6, v6, v14
; GFX6-NEXT:    v_max_u32_e32 v7, v7, v15
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v11
; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v12
; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v13
; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v14
; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v15
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v8i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v8 clamp
; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v9 clamp
; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v10 clamp
; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v11 clamp
; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v4, v12 clamp
; GFX8-NEXT:    v_sub_u32_e64 v5, s[4:5], v5, v13 clamp
; GFX8-NEXT:    v_sub_u32_e64 v6, s[4:5], v6, v14 clamp
; GFX8-NEXT:    v_sub_u32_e64 v7, s[4:5], v7, v15 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v8i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v8 clamp
; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v9 clamp
; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v10 clamp
; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v11 clamp
; GFX9-NEXT:    v_sub_u32_e64 v4, v4, v12 clamp
; GFX9-NEXT:    v_sub_u32_e64 v5, v5, v13 clamp
; GFX9-NEXT:    v_sub_u32_e64 v6, v6, v14 clamp
; GFX9-NEXT:    v_sub_u32_e64 v7, v7, v15 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v8i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v8 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v9 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v10 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v3, v3, v11 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v4, v4, v12 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v5, v5, v13 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v6, v6, v14 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v7, v7, v15 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs)
  ret <8 x i32> %result
}
; <16 x i32> unsigned saturating subtract through llvm.usub.sat.v16i32.
; Per the checks the subtrahend for the final element (v15) is loaded from
; memory addressed by s32 and guarded by s_waitcnt vmcnt(0); every other
; operand is already in v0..v30. This is the only function checked under
; separate GFX10 and GFX11 prefixes, because that load is buffer_load_dword
; on gfx1010 and scratch_load_b32 on gfx1100.
define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v16i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v16
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v16
; GFX6-NEXT:    buffer_load_dword v16, off, s[0:3], s32
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v17
; GFX6-NEXT:    v_max_u32_e32 v2, v2, v18
; GFX6-NEXT:    v_max_u32_e32 v3, v3, v19
; GFX6-NEXT:    v_max_u32_e32 v4, v4, v20
; GFX6-NEXT:    v_max_u32_e32 v5, v5, v21
; GFX6-NEXT:    v_max_u32_e32 v6, v6, v22
; GFX6-NEXT:    v_max_u32_e32 v7, v7, v23
; GFX6-NEXT:    v_max_u32_e32 v8, v8, v24
; GFX6-NEXT:    v_max_u32_e32 v9, v9, v25
; GFX6-NEXT:    v_max_u32_e32 v10, v10, v26
; GFX6-NEXT:    v_max_u32_e32 v11, v11, v27
; GFX6-NEXT:    v_max_u32_e32 v12, v12, v28
; GFX6-NEXT:    v_max_u32_e32 v13, v13, v29
; GFX6-NEXT:    v_max_u32_e32 v14, v14, v30
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v17
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v18
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v19
; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v20
; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v21
; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v22
; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v23
; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v24
; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v25
; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v26
; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, v11, v27
; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v28
; GFX6-NEXT:    v_sub_i32_e32 v13, vcc, v13, v29
; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v30
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_max_u32_e32 v15, v15, v16
; GFX6-NEXT:    v_sub_i32_e32 v15, vcc, v15, v16
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v16i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v16 clamp
; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v17 clamp
; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v18 clamp
; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v19 clamp
; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v4, v20 clamp
; GFX8-NEXT:    v_sub_u32_e64 v5, s[4:5], v5, v21 clamp
; GFX8-NEXT:    v_sub_u32_e64 v6, s[4:5], v6, v22 clamp
; GFX8-NEXT:    v_sub_u32_e64 v7, s[4:5], v7, v23 clamp
; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v8, v24 clamp
; GFX8-NEXT:    v_sub_u32_e64 v9, s[4:5], v9, v25 clamp
; GFX8-NEXT:    v_sub_u32_e64 v10, s[4:5], v10, v26 clamp
; GFX8-NEXT:    v_sub_u32_e64 v11, s[4:5], v11, v27 clamp
; GFX8-NEXT:    v_sub_u32_e64 v12, s[4:5], v12, v28 clamp
; GFX8-NEXT:    v_sub_u32_e64 v13, s[4:5], v13, v29 clamp
; GFX8-NEXT:    v_sub_u32_e64 v14, s[4:5], v14, v30 clamp
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_sub_u32_e64 v15, s[4:5], v15, v16 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v16i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v16 clamp
; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v17 clamp
; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v18 clamp
; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v19 clamp
; GFX9-NEXT:    v_sub_u32_e64 v4, v4, v20 clamp
; GFX9-NEXT:    v_sub_u32_e64 v5, v5, v21 clamp
; GFX9-NEXT:    v_sub_u32_e64 v6, v6, v22 clamp
; GFX9-NEXT:    v_sub_u32_e64 v7, v7, v23 clamp
; GFX9-NEXT:    v_sub_u32_e64 v8, v8, v24 clamp
; GFX9-NEXT:    v_sub_u32_e64 v9, v9, v25 clamp
; GFX9-NEXT:    v_sub_u32_e64 v10, v10, v26 clamp
; GFX9-NEXT:    v_sub_u32_e64 v11, v11, v27 clamp
; GFX9-NEXT:    v_sub_u32_e64 v12, v12, v28 clamp
; GFX9-NEXT:    v_sub_u32_e64 v13, v13, v29 clamp
; GFX9-NEXT:    v_sub_u32_e64 v14, v14, v30 clamp
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e64 v15, v15, v16 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_usubsat_v16i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
; GFX10-NEXT:    v_sub_nc_u32_e64 v0, v0, v16 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v1, v1, v17 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v2, v2, v18 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v3, v3, v19 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v4, v4, v20 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v5, v5, v21 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v6, v6, v22 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v7, v7, v23 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v8, v8, v24 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v9, v9, v25 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v10, v10, v26 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v11, v11, v27 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v12, v12, v28 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v13, v13, v29 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v14, v14, v30 clamp
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_sub_nc_u32_e64 v15, v15, v31 clamp
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_usubsat_v16i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v31, off, s32
; GFX11-NEXT:    v_sub_nc_u32_e64 v0, v0, v16 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v1, v1, v17 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v2, v2, v18 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v3, v3, v19 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v4, v4, v20 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v5, v5, v21 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v6, v6, v22 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v7, v7, v23 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v8, v8, v24 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v9, v9, v25 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v10, v10, v26 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v11, v11, v27 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v12, v12, v28 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v13, v13, v29 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v14, v14, v30 clamp
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_sub_nc_u32_e64 v15, v15, v31 clamp
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
  ret <16 x i32> %result
}


; i64 unsigned saturating subtract through the llvm.usub.sat.i64 intrinsic.
; Per the checks no target uses a clamped instruction here: each emits a
; 32-bit subtract plus subtract-with-borrow pair, compares the 64-bit
; difference against the original lhs with an unsigned greater-than (true
; when the subtraction wrapped), and selects 0 for both halves in that case.
define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_usubsat_i64:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v0, v2
; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
; GFX6-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v0, v2
; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_i64:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v2
; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
; GFX10PLUS-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
  ret i64 %result
}
; Declarations of the llvm.usub.sat overloads called by the test functions in
; this file (scalar i8/i16/i32/i64 and the i16/i32 vector widths). All of them
; share attribute group #0.
declare i8 @llvm.usub.sat.i8(i8, i8) #0
declare i16 @llvm.usub.sat.i16(i16, i16) #0
declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) #0
declare <3 x i16> @llvm.usub.sat.v3i16(<3 x i16>, <3 x i16>) #0
declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) #0
declare i32 @llvm.usub.sat.i32(i32, i32) #0
declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) #0
declare <3 x i32> @llvm.usub.sat.v3i32(<3 x i32>, <3 x i32>) #0
declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) #0
declare <8 x i32> @llvm.usub.sat.v8i32(<8 x i32>, <8 x i32>) #0
declare <16 x i32> @llvm.usub.sat.v16i32(<16 x i32>, <16 x i32>) #0
declare i64 @llvm.usub.sat.i64(i64, i64) #0

attributes #0 = { nounwind readnone speculatable willreturn }