; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s

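; i8: GFX6 sign-extends both operands with v_bfe_i32 and clamps the 32-bit sum
; to [-128, 127]; GFX8 adds sign-extended bytes via SDWA and clamps with i16
; min/max; GFX9 and later shift into the high byte, use the add's clamp
; modifier, and shift back down.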
define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
; GFX6-LABEL: v_saddsat_i8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 8
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_min_i32_e32 v0, 0x7f, v0
; GFX6-NEXT:    v_max_i32_e32 v0, 0xffffff80, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT:    v_min_i16_e32 v0, 0x7f, v0
; GFX8-NEXT:    v_max_i16_e32 v0, 0xff80, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_i8:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 8, v0
; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 8, v0
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
  ret i8 %result
}

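; i16: GFX9 and later lower directly to a 16-bit add with the clamp modifier;
; GFX6 widens to i32 and clamps with min/max, while GFX8 selects between the
; sum and the saturation value when the signed-overflow check fires.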
define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
; GFX6-LABEL: v_saddsat_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v1
; GFX8-NEXT:    v_add_u16_e32 v1, v0, v1
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v1
; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
  ret i16 %result
}

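; i32: pre-GFX9 targets detect signed overflow as (rhs < 0) xor (sum < lhs)
; and select the saturation value 0x80000000 ^ (sum >> 31); GFX9 and later use
; the clamp modifier on the 32-bit add.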
define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
; GFX6-LABEL: v_saddsat_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; GFX6-NEXT:    v_add_i32_e64 v1, s[4:5], v0, v1
; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v0, v1
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_i32 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
  ret i32 %result
}

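; <2 x i16>: GFX9 and later use the packed v_pk_add_i16 with clamp; GFX6 and
; GFX8 handle the two halves separately and repack the result.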
define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: v_saddsat_v2i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; GFX8-NEXT:    v_add_u16_e32 v4, v3, v2
; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v4, v3
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v4
; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v1
; GFX8-NEXT:    v_add_u16_e32 v1, v0, v1
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v1
; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_v2i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
  ret <2 x i16> %result
}

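; <3 x i16> legalizes to two packed clamped adds on GFX9 and later; GFX6 and
; GFX8 expand each of the three lanes individually.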
define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-LABEL: v_saddsat_v3i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 16
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
; GFX6-NEXT:    v_min_i32_e32 v2, 0x7fff, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_max_i32_e32 v3, 0xffff8000, v2
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_or_b32_e32 v2, 0xffff0000, v3
; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v3i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; GFX8-NEXT:    v_add_u16_e32 v6, v5, v4
; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
; GFX8-NEXT:    v_xor_b32_e32 v4, 0xffff8000, v4
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v3
; GFX8-NEXT:    v_add_u16_e32 v3, v1, v3
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v2
; GFX8-NEXT:    v_add_u16_e32 v2, v0, v2
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v3i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
; GFX9-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_v3i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
; GFX10PLUS-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
  ret <3 x i16> %result
}

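; <4 x i16>, returned bitcast to <2 x float>: GFX9 and later emit two packed
; clamped adds; GFX6 and GFX8 expand each of the four lanes.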
define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX6-LABEL: v_saddsat_v4i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 16
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
; GFX6-NEXT:    v_bfe_i32 v6, v6, 0, 16
; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT:    v_bfe_i32 v7, v7, 0, 16
; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v7
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
; GFX6-NEXT:    v_min_i32_e32 v2, 0x7fff, v2
; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
; GFX6-NEXT:    v_max_i32_e32 v2, 0xffff8000, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v4i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; GFX8-NEXT:    v_add_u16_e32 v6, v5, v4
; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
; GFX8-NEXT:    v_xor_b32_e32 v4, 0xffff8000, v4
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v2
; GFX8-NEXT:    v_add_u16_e32 v2, v0, v2
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX8-NEXT:    v_add_u16_e32 v5, v4, v2
; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v5
; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v3
; GFX8-NEXT:    v_add_u16_e32 v3, v1, v3
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
; GFX9-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_v4i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
; GFX10PLUS-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  %cast = bitcast <4 x i16> %result to <2 x float>
  ret <2 x float> %cast
}

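; <2 x i32> repeats the scalar i32 lowering once per element.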
define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; GFX6-LABEL: v_saddsat_v2i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v0, v2
; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v1, v3
; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v0, v2
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v1, v3
; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_i32 v0, v0, v2 clamp
; GFX9-NEXT:    v_add_i32 v1, v1, v3 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_v2i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v2 clamp
; GFX10PLUS-NEXT:    v_add_nc_i32 v1, v1, v3 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i32> %result
}

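; i64: a 64-bit add with carry, then a select of the saturation value (built
; from the sign of the high result word) on signed overflow; GFX11 folds the
; two selects into a single v_dual_cndmask_b32.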
define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_saddsat_i64:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_saddsat_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_saddsat_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[2:3]
; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
  ret i64 %result
}

declare i8 @llvm.sadd.sat.i8(i8, i8) #0
declare i16 @llvm.sadd.sat.i16(i16, i16) #0
declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) #0
declare <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16>, <3 x i16>) #0
declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) #0
declare i32 @llvm.sadd.sat.i32(i32, i32) #0
declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) #0
declare i64 @llvm.sadd.sat.i64(i64, i64) #0