1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
6
; Signed saturating add on i8: the body calls @llvm.sadd.sat.i8 on the two
; arguments and returns the result. The autogenerated check lines record the
; expected llc output for each -mcpu listed at the top of the file:
;   * GFX6 checks expect both operands sign-extended from 8 bits (v_bfe_i32),
;     a 32-bit add, then v_min_i32 with 0x7f and v_max_i32 with 0xffffff80.
;   * GFX8 checks expect an SDWA 16-bit add of the sign-extended low bytes,
;     then v_min_i16 with 0x7f and v_max_i16 with 0xff80.
;   * GFX9 and GFX10 checks expect both operands shifted left by 8, a 16-bit
;     add carrying the clamp modifier, then an arithmetic shift right by 8.
7define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
8; GFX6-LABEL: v_saddsat_i8:
9; GFX6:       ; %bb.0:
10; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 8
12; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
13; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
14; GFX6-NEXT:    v_min_i32_e32 v0, 0x7f, v0
15; GFX6-NEXT:    v_max_i32_e32 v0, 0xffffff80, v0
16; GFX6-NEXT:    s_setpc_b64 s[30:31]
17;
18; GFX8-LABEL: v_saddsat_i8:
19; GFX8:       ; %bb.0:
20; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21; GFX8-NEXT:    v_add_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
22; GFX8-NEXT:    v_min_i16_e32 v0, 0x7f, v0
23; GFX8-NEXT:    v_max_i16_e32 v0, 0xff80, v0
24; GFX8-NEXT:    s_setpc_b64 s[30:31]
25;
26; GFX9-LABEL: v_saddsat_i8:
27; GFX9:       ; %bb.0:
28; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
30; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
31; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
32; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
33; GFX9-NEXT:    s_setpc_b64 s[30:31]
34;
35; GFX10-LABEL: v_saddsat_i8:
36; GFX10:       ; %bb.0:
37; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
39; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
40; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
41; GFX10-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
42; GFX10-NEXT:    v_ashrrev_i16 v0, 8, v0
43; GFX10-NEXT:    s_setpc_b64 s[30:31]
44  %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
45  ret i8 %result
46}
47
; Signed saturating add on i16: the body calls @llvm.sadd.sat.i16 and returns
; the result. Expected output per target, as recorded by the check lines:
;   * GFX6 checks expect sign-extension from 16 bits (v_bfe_i32), a 32-bit add,
;     then v_min_i32 with 0x7fff and v_max_i32 with 0xffff8000.
;   * GFX8 checks expect a 16-bit add plus two signed compares whose results
;     are XORed into vcc; vcc then drives a v_cndmask between the sum and a
;     value built from the sum (arithmetic shift right by 15, XOR 0xffff8000).
;   * GFX9 and GFX10 checks expect a single 16-bit add with the clamp modifier.
48define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
49; GFX6-LABEL: v_saddsat_i16:
50; GFX6:       ; %bb.0:
51; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
53; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
54; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
55; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
56; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
57; GFX6-NEXT:    s_setpc_b64 s[30:31]
58;
59; GFX8-LABEL: v_saddsat_i16:
60; GFX8:       ; %bb.0:
61; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v1
63; GFX8-NEXT:    v_add_u16_e32 v1, v0, v1
64; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
65; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v1
66; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
67; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
68; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
69; GFX8-NEXT:    s_setpc_b64 s[30:31]
70;
71; GFX9-LABEL: v_saddsat_i16:
72; GFX9:       ; %bb.0:
73; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
75; GFX9-NEXT:    s_setpc_b64 s[30:31]
76;
77; GFX10-LABEL: v_saddsat_i16:
78; GFX10:       ; %bb.0:
79; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
81; GFX10-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
82; GFX10-NEXT:    s_setpc_b64 s[30:31]
83  %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
84  ret i16 %result
85}
86
; Signed saturating add on i32: the body calls @llvm.sadd.sat.i32 and returns
; the result. Expected output per target, as recorded by the check lines:
;   * GFX6 and GFX8 checks expect the same shape, differing only in the add
;     mnemonic (v_add_i32 vs v_add_u32): a 32-bit add, two signed compares
;     XORed into vcc, and a v_cndmask between the sum and a value built from
;     the sum (arithmetic shift right by 31, XOR 0x80000000).
;   * GFX9 and GFX10 checks expect a single 32-bit add with the clamp modifier.
87define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
88; GFX6-LABEL: v_saddsat_i32:
89; GFX6:       ; %bb.0:
90; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
91; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
92; GFX6-NEXT:    v_add_i32_e64 v1, s[4:5], v0, v1
93; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
94; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
95; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
96; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
97; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
98; GFX6-NEXT:    s_setpc_b64 s[30:31]
99;
100; GFX8-LABEL: v_saddsat_i32:
101; GFX8:       ; %bb.0:
102; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
103; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
104; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v0, v1
105; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
106; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
107; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
108; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
109; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
110; GFX8-NEXT:    s_setpc_b64 s[30:31]
111;
112; GFX9-LABEL: v_saddsat_i32:
113; GFX9:       ; %bb.0:
114; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115; GFX9-NEXT:    v_add_i32 v0, v0, v1 clamp
116; GFX9-NEXT:    s_setpc_b64 s[30:31]
117;
118; GFX10-LABEL: v_saddsat_i32:
119; GFX10:       ; %bb.0:
120; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
122; GFX10-NEXT:    v_add_nc_i32 v0, v0, v1 clamp
123; GFX10-NEXT:    s_setpc_b64 s[30:31]
124  %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
125  ret i32 %result
126}
127
; Signed saturating add on <2 x i16>: the body calls @llvm.sadd.sat.v2i16 and
; returns the result. Expected output per target, as recorded by the check lines:
;   * GFX6 checks expect four v_bfe_i32 sign-extensions (one per input
;     element), two 32-bit adds, min/max against constants held in s4/s5, and
;     a final shift/and/or/shift sequence on v0 and v1.
;   * GFX8 checks expect the upper halves extracted with 16-bit right shifts,
;     one add/compare/xor/cndmask group per element, and the two results
;     merged with a shift and an SDWA v_or_b32.
;   * GFX9 and GFX10 checks expect a single v_pk_add_i16 with the clamp modifier.
128define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
129; GFX6-LABEL: v_saddsat_v2i16:
130; GFX6:       ; %bb.0:
131; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
132; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
133; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
134; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
135; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
136; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
137; GFX6-NEXT:    s_movk_i32 s4, 0x7fff
138; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
139; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
140; GFX6-NEXT:    s_movk_i32 s5, 0x8000
141; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
142; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
143; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
144; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
145; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
146; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
147; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
148; GFX6-NEXT:    s_setpc_b64 s[30:31]
149;
150; GFX8-LABEL: v_saddsat_v2i16:
151; GFX8:       ; %bb.0:
152; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
154; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
155; GFX8-NEXT:    v_add_u16_e32 v4, v3, v2
156; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v4, v3
157; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
158; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v4
159; GFX8-NEXT:    s_movk_i32 s6, 0x8000
160; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
161; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
162; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
163; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v1
164; GFX8-NEXT:    v_add_u16_e32 v1, v0, v1
165; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
166; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v1
167; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
168; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
169; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
170; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
171; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
172; GFX8-NEXT:    s_setpc_b64 s[30:31]
173;
174; GFX9-LABEL: v_saddsat_v2i16:
175; GFX9:       ; %bb.0:
176; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
177; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
178; GFX9-NEXT:    s_setpc_b64 s[30:31]
179;
180; GFX10-LABEL: v_saddsat_v2i16:
181; GFX10:       ; %bb.0:
182; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
184; GFX10-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
185; GFX10-NEXT:    s_setpc_b64 s[30:31]
186  %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
187  ret <2 x i16> %result
188}
189
; Signed saturating add on the odd-sized vector <3 x i16>: the body calls
; @llvm.sadd.sat.v3i16 and returns the result. Expected output per target, as
; recorded by the check lines:
;   * GFX6 checks expect six v_bfe_i32 sign-extensions, three 32-bit adds with
;     min/max against s4/s5, and a shift/and/or/alignbit sequence at the end.
;   * GFX8 checks expect three add/compare/xor/cndmask groups and a final
;     shift plus SDWA v_or_b32 writing v0.
;   * GFX9 and GFX10 checks expect two v_pk_add_i16 with the clamp modifier
;     (one on v0/v2, one on v1/v3); the two targets list them in opposite order.
190define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
191; GFX6-LABEL: v_saddsat_v3i16:
192; GFX6:       ; %bb.0:
193; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
194; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
195; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
196; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 16
197; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
198; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
199; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
200; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
201; GFX6-NEXT:    s_movk_i32 s4, 0x7fff
202; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
203; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
204; GFX6-NEXT:    s_movk_i32 s5, 0x8000
205; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
206; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
207; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
208; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
209; GFX6-NEXT:    v_min_i32_e32 v2, s4, v2
210; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
211; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
212; GFX6-NEXT:    v_max_i32_e32 v3, s5, v2
213; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
214; GFX6-NEXT:    v_or_b32_e32 v2, 0xffff0000, v3
215; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
216; GFX6-NEXT:    s_setpc_b64 s[30:31]
217;
218; GFX8-LABEL: v_saddsat_v3i16:
219; GFX8:       ; %bb.0:
220; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
221; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
222; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
223; GFX8-NEXT:    v_add_u16_e32 v6, v5, v4
224; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
225; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
226; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
227; GFX8-NEXT:    s_movk_i32 s6, 0x8000
228; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
229; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
230; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
231; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v3
232; GFX8-NEXT:    v_add_u16_e32 v3, v1, v3
233; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
234; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
235; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
236; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
237; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
238; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v2
239; GFX8-NEXT:    v_add_u16_e32 v2, v0, v2
240; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
241; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
242; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
243; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
244; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
245; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
246; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
247; GFX8-NEXT:    s_setpc_b64 s[30:31]
248;
249; GFX9-LABEL: v_saddsat_v3i16:
250; GFX9:       ; %bb.0:
251; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
252; GFX9-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
253; GFX9-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
254; GFX9-NEXT:    s_setpc_b64 s[30:31]
255;
256; GFX10-LABEL: v_saddsat_v3i16:
257; GFX10:       ; %bb.0:
258; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
260; GFX10-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
261; GFX10-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
262; GFX10-NEXT:    s_setpc_b64 s[30:31]
263  %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
264  ret <3 x i16> %result
265}
266
; Signed saturating add on <4 x i16>: the body calls @llvm.sadd.sat.v4i16 and
; bitcasts the result, so the function's declared return type is <2 x float>
; rather than <4 x i16>. Expected output per target, as recorded by the check
; lines:
;   * GFX6 checks expect eight v_bfe_i32 sign-extensions, four 32-bit adds with
;     min/max against s4/s5, and each pair of results merged into one register
;     (v0, then v1) with shift/and/or using the 0xffff mask held in s6.
;   * GFX8 checks expect four add/compare/xor/cndmask groups, with each pair
;     merged by a shift and an SDWA v_or_b32 into v0 and v1.
;   * GFX9 and GFX10 checks expect two v_pk_add_i16 with the clamp modifier.
267define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
268; GFX6-LABEL: v_saddsat_v4i16:
269; GFX6:       ; %bb.0:
270; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
271; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 16
272; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
273; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
274; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
275; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
276; GFX6-NEXT:    s_movk_i32 s4, 0x7fff
277; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
278; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
279; GFX6-NEXT:    s_movk_i32 s5, 0x8000
280; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
281; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
282; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
283; GFX6-NEXT:    s_mov_b32 s6, 0xffff
284; GFX6-NEXT:    v_bfe_i32 v6, v6, 0, 16
285; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
286; GFX6-NEXT:    v_bfe_i32 v7, v7, 0, 16
287; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
288; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
289; GFX6-NEXT:    v_and_b32_e32 v0, s6, v0
290; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
291; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v7
292; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
293; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
294; GFX6-NEXT:    v_min_i32_e32 v2, s4, v2
295; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
296; GFX6-NEXT:    v_max_i32_e32 v2, s5, v2
297; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
298; GFX6-NEXT:    v_and_b32_e32 v2, s6, v2
299; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
300; GFX6-NEXT:    s_setpc_b64 s[30:31]
301;
302; GFX8-LABEL: v_saddsat_v4i16:
303; GFX8:       ; %bb.0:
304; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
306; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
307; GFX8-NEXT:    v_add_u16_e32 v6, v5, v4
308; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
309; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
310; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
311; GFX8-NEXT:    s_movk_i32 s6, 0x8000
312; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
313; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
314; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
315; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v2
316; GFX8-NEXT:    v_add_u16_e32 v2, v0, v2
317; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
318; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
319; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
320; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
321; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
322; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
323; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
324; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
325; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
326; GFX8-NEXT:    v_add_u16_e32 v5, v4, v2
327; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
328; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
329; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v5
330; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
331; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
332; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
333; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v3
334; GFX8-NEXT:    v_add_u16_e32 v3, v1, v3
335; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
336; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
337; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
338; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
339; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
340; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
341; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
342; GFX8-NEXT:    s_setpc_b64 s[30:31]
343;
344; GFX9-LABEL: v_saddsat_v4i16:
345; GFX9:       ; %bb.0:
346; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
347; GFX9-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
348; GFX9-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
349; GFX9-NEXT:    s_setpc_b64 s[30:31]
350;
351; GFX10-LABEL: v_saddsat_v4i16:
352; GFX10:       ; %bb.0:
353; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
354; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
355; GFX10-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
356; GFX10-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
357; GFX10-NEXT:    s_setpc_b64 s[30:31]
358  %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
; Reinterpret the four 16-bit lanes as two floats to match the return type.
359  %cast = bitcast <4 x i16> %result to <2 x float>
360  ret <2 x float> %cast
361}
362
; Signed saturating add on <2 x i32>: the body calls @llvm.sadd.sat.v2i32 and
; returns the result. Expected output per target, as recorded by the check lines:
;   * GFX6 and GFX8 checks expect the same shape, differing only in the add
;     mnemonic: one add/compare/xor/cndmask group per element (v0 with v2,
;     then v1 with v3), both XORing against a constant that s_brev_b32 places
;     in s6.
;   * GFX9 and GFX10 checks expect one 32-bit add with the clamp modifier per
;     element.
363define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
364; GFX6-LABEL: v_saddsat_v2i32:
365; GFX6:       ; %bb.0:
366; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
367; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
368; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v0, v2
369; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
370; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
371; GFX6-NEXT:    s_brev_b32 s6, 1
372; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
373; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
374; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
375; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v1, v3
376; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
377; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
378; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
379; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
380; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
381; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
382; GFX6-NEXT:    s_setpc_b64 s[30:31]
383;
384; GFX8-LABEL: v_saddsat_v2i32:
385; GFX8:       ; %bb.0:
386; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
387; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
388; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v0, v2
389; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
390; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
391; GFX8-NEXT:    s_brev_b32 s6, 1
392; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
393; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
394; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
395; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v1, v3
396; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
397; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
398; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
399; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
400; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
401; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
402; GFX8-NEXT:    s_setpc_b64 s[30:31]
403;
404; GFX9-LABEL: v_saddsat_v2i32:
405; GFX9:       ; %bb.0:
406; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
407; GFX9-NEXT:    v_add_i32 v0, v0, v2 clamp
408; GFX9-NEXT:    v_add_i32 v1, v1, v3 clamp
409; GFX9-NEXT:    s_setpc_b64 s[30:31]
410;
411; GFX10-LABEL: v_saddsat_v2i32:
412; GFX10:       ; %bb.0:
413; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
414; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
415; GFX10-NEXT:    v_add_nc_i32 v0, v0, v2 clamp
416; GFX10-NEXT:    v_add_nc_i32 v1, v1, v3 clamp
417; GFX10-NEXT:    s_setpc_b64 s[30:31]
418  %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
419  ret <2 x i32> %result
420}
421
; Signed saturating add on i64: the body calls @llvm.sadd.sat.i64 and returns
; the result. None of the four targets' check lines use a clamp modifier here.
; All of them expect the same overall shape:
;   * a low-half add followed by an add-with-carry for the high half,
;   * two 64-bit signed compares whose results are XORed into vcc / vcc_lo,
;   * two v_cndmask selects (one per 32-bit half) between the sum and values
;     built from the high half (arithmetic shift right by 31, and that value
;     XOR 0x80000000 for the upper word).
; The GFX10 checks use 32-bit condition registers (vcc_lo, s4, s_xor_b32) and
; a separate temporary v6, where the other targets use 64-bit pairs and v1.
422define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
423; GFX6-LABEL: v_saddsat_i64:
424; GFX6:       ; %bb.0:
425; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
426; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
427; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
428; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
429; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
430; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
431; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
432; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
433; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
434; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
435; GFX6-NEXT:    s_setpc_b64 s[30:31]
436;
437; GFX8-LABEL: v_saddsat_i64:
438; GFX8:       ; %bb.0:
439; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
440; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
441; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
442; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
443; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
444; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
445; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
446; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
447; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
448; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
449; GFX8-NEXT:    s_setpc_b64 s[30:31]
450;
451; GFX9-LABEL: v_saddsat_i64:
452; GFX9:       ; %bb.0:
453; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
454; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
455; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
456; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
457; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
458; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
459; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
460; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
461; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
462; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
463; GFX9-NEXT:    s_setpc_b64 s[30:31]
464;
465; GFX10-LABEL: v_saddsat_i64:
466; GFX10:       ; %bb.0:
467; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
468; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
469; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
470; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
471; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
472; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
473; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
474; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
475; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
476; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
477; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
478; GFX10-NEXT:    s_setpc_b64 s[30:31]
479  %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
480  ret i64 %result
481}
482
; Declarations of the llvm.sadd.sat intrinsic overloads called by the test
; functions in this file: scalar i8/i16/i32/i64 and vector v2i16/v3i16/v4i16/
; v2i32. Every declaration references attribute group #0.
; NOTE(review): no definition of attribute group #0 is visible in this part of
; the file -- confirm whether one exists elsewhere or is intentionally omitted.
483declare i8 @llvm.sadd.sat.i8(i8, i8) #0
484declare i16 @llvm.sadd.sat.i16(i16, i16) #0
485declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) #0
486declare <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16>, <3 x i16>) #0
487declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) #0
488declare i32 @llvm.sadd.sat.i32(i32, i32) #0
489declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) #0
490declare i64 @llvm.sadd.sat.i64(i64, i64) #0
491