1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
6
; Scalar i8 unsigned saturating add: forwards both arguments to
; @llvm.uadd.sat.i8 and returns the result.
; None of the expected sequences below uses a clamp modifier; saturation is
; an add followed by an unsigned min against 0xff. For tahiti and gfx1010 both
; operands are first masked with 0xff; for fiji and gfx900 the add is an SDWA
; v_add_u16 that selects BYTE_0 of each source.
7define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
8; GFX6-LABEL: v_uaddsat_i8:
9; GFX6:       ; %bb.0:
10; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX6-NEXT:    s_movk_i32 s4, 0xff
12; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
13; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
14; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
15; GFX6-NEXT:    v_min_u32_e32 v0, s4, v0
16; GFX6-NEXT:    s_setpc_b64 s[30:31]
17;
18; GFX8-LABEL: v_uaddsat_i8:
19; GFX8:       ; %bb.0:
20; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
22; GFX8-NEXT:    v_min_u16_e32 v0, 0xff, v0
23; GFX8-NEXT:    s_setpc_b64 s[30:31]
24;
25; GFX9-LABEL: v_uaddsat_i8:
26; GFX9:       ; %bb.0:
27; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28; GFX9-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
29; GFX9-NEXT:    v_min_u16_e32 v0, 0xff, v0
30; GFX9-NEXT:    s_setpc_b64 s[30:31]
31;
32; GFX10-LABEL: v_uaddsat_i8:
33; GFX10:       ; %bb.0:
34; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
36; GFX10-NEXT:    s_movk_i32 s4, 0xff
37; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
38; GFX10-NEXT:    v_and_b32_e32 v0, s4, v0
39; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
40; GFX10-NEXT:    v_min_u16 v0, v0, s4
41; GFX10-NEXT:    s_setpc_b64 s[30:31]
42  %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
43  ret i8 %result
44}
45
; Scalar i16 unsigned saturating add: forwards both arguments to
; @llvm.uadd.sat.i16 and returns the result.
; Expected code: for tahiti, mask both operands with 0xffff, do a 32-bit add
; and take v_min_u32 against 0xffff; for fiji and gfx900, a single
; v_add_u16_e64 with the clamp modifier; for gfx1010, a single v_add_nc_u16
; with clamp.
46define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
47; GFX6-LABEL: v_uaddsat_i16:
48; GFX6:       ; %bb.0:
49; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50; GFX6-NEXT:    s_mov_b32 s4, 0xffff
51; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
52; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
53; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
54; GFX6-NEXT:    v_min_u32_e32 v0, s4, v0
55; GFX6-NEXT:    s_setpc_b64 s[30:31]
56;
57; GFX8-LABEL: v_uaddsat_i16:
58; GFX8:       ; %bb.0:
59; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
60; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
61; GFX8-NEXT:    s_setpc_b64 s[30:31]
62;
63; GFX9-LABEL: v_uaddsat_i16:
64; GFX9:       ; %bb.0:
65; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66; GFX9-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
67; GFX9-NEXT:    s_setpc_b64 s[30:31]
68;
69; GFX10-LABEL: v_uaddsat_i16:
70; GFX10:       ; %bb.0:
71; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
72; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
73; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1 clamp
74; GFX10-NEXT:    s_setpc_b64 s[30:31]
75  %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
76  ret i16 %result
77}
78
; Scalar i32 unsigned saturating add: forwards both arguments to
; @llvm.uadd.sat.i32 and returns the result.
; Expected code: for tahiti there is no clamped add; the sequence is
; v_not of rhs, v_min_u32 of lhs with that value, then an add of rhs, i.e.
; min(lhs, ~rhs) + rhs. The other three targets use one clamped 32-bit add.
; The fiji form carries an extra s[4:5] operand (presumably the carry-out
; destination of that encoding -- confirm against the ISA docs).
79define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) {
80; GFX6-LABEL: v_uaddsat_i32:
81; GFX6:       ; %bb.0:
82; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; GFX6-NEXT:    v_not_b32_e32 v2, v1
84; GFX6-NEXT:    v_min_u32_e32 v0, v0, v2
85; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
86; GFX6-NEXT:    s_setpc_b64 s[30:31]
87;
88; GFX8-LABEL: v_uaddsat_i32:
89; GFX8:       ; %bb.0:
90; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
91; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v1 clamp
92; GFX8-NEXT:    s_setpc_b64 s[30:31]
93;
94; GFX9-LABEL: v_uaddsat_i32:
95; GFX9:       ; %bb.0:
96; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97; GFX9-NEXT:    v_add_u32_e64 v0, v0, v1 clamp
98; GFX9-NEXT:    s_setpc_b64 s[30:31]
99;
100; GFX10-LABEL: v_uaddsat_i32:
101; GFX10:       ; %bb.0:
102; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
103; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
104; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v1 clamp
105; GFX10-NEXT:    s_setpc_b64 s[30:31]
106  %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs)
107  ret i32 %result
108}
109
; <2 x i16> unsigned saturating add: forwards both vectors to
; @llvm.uadd.sat.v2i16 and returns the result.
; Expected code: for tahiti each lane is handled on its own (mask with
; 0xffff, 32-bit add, v_min_u32 against 0xffff) and the two results are then
; combined with shift/or instructions. For fiji the high halves are added with
; a clamped SDWA v_add_u16 (WORD_1 selects), the low halves with a clamped
; v_add_u16_e64, and the two are merged with v_or_b32. For gfx900 and gfx1010
; a single v_pk_add_u16 with clamp covers both lanes.
110define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
111; GFX6-LABEL: v_uaddsat_v2i16:
112; GFX6:       ; %bb.0:
113; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114; GFX6-NEXT:    s_mov_b32 s4, 0xffff
115; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
116; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
117; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
118; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
119; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
120; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
121; GFX6-NEXT:    v_min_u32_e32 v1, s4, v1
122; GFX6-NEXT:    v_min_u32_e32 v0, s4, v0
123; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
124; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
125; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
126; GFX6-NEXT:    s_setpc_b64 s[30:31]
127;
128; GFX8-LABEL: v_uaddsat_v2i16:
129; GFX8:       ; %bb.0:
130; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
131; GFX8-NEXT:    v_add_u16_sdwa v2, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
132; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
133; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
134; GFX8-NEXT:    s_setpc_b64 s[30:31]
135;
136; GFX9-LABEL: v_uaddsat_v2i16:
137; GFX9:       ; %bb.0:
138; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
139; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
140; GFX9-NEXT:    s_setpc_b64 s[30:31]
141;
142; GFX10-LABEL: v_uaddsat_v2i16:
143; GFX10:       ; %bb.0:
144; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
145; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
146; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
147; GFX10-NEXT:    s_setpc_b64 s[30:31]
148  %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
149  ret <2 x i16> %result
150}
151
; <3 x i16> unsigned saturating add: forwards both vectors to
; @llvm.uadd.sat.v3i16 and returns the result.
; Expected code: for tahiti all three lanes go through mask with 0xffff,
; 32-bit add and v_min_u32 against 0xffff, followed by or/alignbit
; instructions that assemble the return registers. For fiji there is one
; clamped SDWA add on the WORD_1 halves of v0/v2, clamped v_add_u16_e64 adds
; for v0/v2 and v1/v3, and one v_or_b32 merge. For gfx900 and gfx1010 two
; v_pk_add_u16 with clamp are expected (their relative order differs between
; the two targets).
152define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
153; GFX6-LABEL: v_uaddsat_v3i16:
154; GFX6:       ; %bb.0:
155; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
156; GFX6-NEXT:    s_mov_b32 s4, 0xffff
157; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
158; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
159; GFX6-NEXT:    v_and_b32_e32 v5, s4, v5
160; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
161; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
162; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
163; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
164; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
165; GFX6-NEXT:    v_min_u32_e32 v1, s4, v1
166; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
167; GFX6-NEXT:    v_min_u32_e32 v0, s4, v0
168; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
169; GFX6-NEXT:    v_min_u32_e32 v3, s4, v2
170; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
171; GFX6-NEXT:    v_or_b32_e32 v2, 0xffff0000, v3
172; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
173; GFX6-NEXT:    s_setpc_b64 s[30:31]
174;
175; GFX8-LABEL: v_uaddsat_v3i16:
176; GFX8:       ; %bb.0:
177; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
178; GFX8-NEXT:    v_add_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
179; GFX8-NEXT:    v_add_u16_e64 v0, v0, v2 clamp
180; GFX8-NEXT:    v_add_u16_e64 v1, v1, v3 clamp
181; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
182; GFX8-NEXT:    s_setpc_b64 s[30:31]
183;
184; GFX9-LABEL: v_uaddsat_v3i16:
185; GFX9:       ; %bb.0:
186; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
187; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3 clamp
188; GFX9-NEXT:    v_pk_add_u16 v0, v0, v2 clamp
189; GFX9-NEXT:    s_setpc_b64 s[30:31]
190;
191; GFX10-LABEL: v_uaddsat_v3i16:
192; GFX10:       ; %bb.0:
193; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
194; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
195; GFX10-NEXT:    v_pk_add_u16 v0, v0, v2 clamp
196; GFX10-NEXT:    v_pk_add_u16 v1, v1, v3 clamp
197; GFX10-NEXT:    s_setpc_b64 s[30:31]
198  %result = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
199  ret <3 x i16> %result
200}
201
; <4 x i16> unsigned saturating add: forwards both vectors to
; @llvm.uadd.sat.v4i16, bitcasts the <4 x i16> result to <2 x float> and
; returns that. NOTE(review): the float return type presumably forces the
; result into two packed 32-bit registers -- confirm the intent.
; Expected code: for tahiti four independent mask/add/v_min_u32 lanes, packed
; pairwise into v0 and v1 with a 16-bit left shift and v_or_b32. For fiji two
; clamped SDWA adds (WORD_1 halves) plus two clamped v_add_u16_e64, merged
; with two v_or_b32. For gfx900 and gfx1010 two v_pk_add_u16 with clamp.
202define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
203; GFX6-LABEL: v_uaddsat_v4i16:
204; GFX6:       ; %bb.0:
205; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
206; GFX6-NEXT:    s_mov_b32 s4, 0xffff
207; GFX6-NEXT:    v_and_b32_e32 v5, s4, v5
208; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
209; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
210; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
211; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
212; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
213; GFX6-NEXT:    v_min_u32_e32 v1, s4, v1
214; GFX6-NEXT:    v_and_b32_e32 v7, s4, v7
215; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
216; GFX6-NEXT:    v_and_b32_e32 v6, s4, v6
217; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
218; GFX6-NEXT:    v_min_u32_e32 v0, s4, v0
219; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
220; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
221; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v6
222; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v7
223; GFX6-NEXT:    v_min_u32_e32 v2, s4, v2
224; GFX6-NEXT:    v_min_u32_e32 v1, s4, v1
225; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
226; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
227; GFX6-NEXT:    s_setpc_b64 s[30:31]
228;
229; GFX8-LABEL: v_uaddsat_v4i16:
230; GFX8:       ; %bb.0:
231; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
232; GFX8-NEXT:    v_add_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
233; GFX8-NEXT:    v_add_u16_e64 v0, v0, v2 clamp
234; GFX8-NEXT:    v_add_u16_sdwa v2, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
235; GFX8-NEXT:    v_add_u16_e64 v1, v1, v3 clamp
236; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
237; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
238; GFX8-NEXT:    s_setpc_b64 s[30:31]
239;
240; GFX9-LABEL: v_uaddsat_v4i16:
241; GFX9:       ; %bb.0:
242; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
243; GFX9-NEXT:    v_pk_add_u16 v0, v0, v2 clamp
244; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3 clamp
245; GFX9-NEXT:    s_setpc_b64 s[30:31]
246;
247; GFX10-LABEL: v_uaddsat_v4i16:
248; GFX10:       ; %bb.0:
249; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
250; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
251; GFX10-NEXT:    v_pk_add_u16 v0, v0, v2 clamp
252; GFX10-NEXT:    v_pk_add_u16 v1, v1, v3 clamp
253; GFX10-NEXT:    s_setpc_b64 s[30:31]
254  %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
255  %cast = bitcast <4 x i16> %result to <2 x float>
256  ret <2 x float> %cast
257}
258
; <2 x i32> unsigned saturating add: forwards both vectors to
; @llvm.uadd.sat.v2i32 and returns the result.
; Expected code: for tahiti each lane is v_not of the rhs element, v_min_u32
; with the lhs element, then an add of the rhs element. The other three
; targets use one clamped 32-bit add per lane.
259define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
260; GFX6-LABEL: v_uaddsat_v2i32:
261; GFX6:       ; %bb.0:
262; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
263; GFX6-NEXT:    v_not_b32_e32 v4, v2
264; GFX6-NEXT:    v_min_u32_e32 v0, v0, v4
265; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
266; GFX6-NEXT:    v_not_b32_e32 v2, v3
267; GFX6-NEXT:    v_min_u32_e32 v1, v1, v2
268; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
269; GFX6-NEXT:    s_setpc_b64 s[30:31]
270;
271; GFX8-LABEL: v_uaddsat_v2i32:
272; GFX8:       ; %bb.0:
273; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
274; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v2 clamp
275; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v3 clamp
276; GFX8-NEXT:    s_setpc_b64 s[30:31]
277;
278; GFX9-LABEL: v_uaddsat_v2i32:
279; GFX9:       ; %bb.0:
280; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
281; GFX9-NEXT:    v_add_u32_e64 v0, v0, v2 clamp
282; GFX9-NEXT:    v_add_u32_e64 v1, v1, v3 clamp
283; GFX9-NEXT:    s_setpc_b64 s[30:31]
284;
285; GFX10-LABEL: v_uaddsat_v2i32:
286; GFX10:       ; %bb.0:
287; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
289; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v2 clamp
290; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v3 clamp
291; GFX10-NEXT:    s_setpc_b64 s[30:31]
292  %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
293  ret <2 x i32> %result
294}
295
; <3 x i32> unsigned saturating add: forwards both vectors to
; @llvm.uadd.sat.v3i32 and returns the result.
; Expected code: for tahiti three not/min/add lane sequences (the adds for
; lanes 1 and 2 are scheduled after all the mins). The other three targets use
; one clamped 32-bit add per lane.
296define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
297; GFX6-LABEL: v_uaddsat_v3i32:
298; GFX6:       ; %bb.0:
299; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
300; GFX6-NEXT:    v_not_b32_e32 v6, v3
301; GFX6-NEXT:    v_min_u32_e32 v0, v0, v6
302; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
303; GFX6-NEXT:    v_not_b32_e32 v3, v4
304; GFX6-NEXT:    v_min_u32_e32 v1, v1, v3
305; GFX6-NEXT:    v_not_b32_e32 v3, v5
306; GFX6-NEXT:    v_min_u32_e32 v2, v2, v3
307; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
308; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
309; GFX6-NEXT:    s_setpc_b64 s[30:31]
310;
311; GFX8-LABEL: v_uaddsat_v3i32:
312; GFX8:       ; %bb.0:
313; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
314; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v3 clamp
315; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v4 clamp
316; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v5 clamp
317; GFX8-NEXT:    s_setpc_b64 s[30:31]
318;
319; GFX9-LABEL: v_uaddsat_v3i32:
320; GFX9:       ; %bb.0:
321; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
322; GFX9-NEXT:    v_add_u32_e64 v0, v0, v3 clamp
323; GFX9-NEXT:    v_add_u32_e64 v1, v1, v4 clamp
324; GFX9-NEXT:    v_add_u32_e64 v2, v2, v5 clamp
325; GFX9-NEXT:    s_setpc_b64 s[30:31]
326;
327; GFX10-LABEL: v_uaddsat_v3i32:
328; GFX10:       ; %bb.0:
329; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
330; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
331; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v3 clamp
332; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v4 clamp
333; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, v5 clamp
334; GFX10-NEXT:    s_setpc_b64 s[30:31]
335  %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
336  ret <3 x i32> %result
337}
338
; <4 x i32> unsigned saturating add: forwards both vectors to
; @llvm.uadd.sat.v4i32 and returns the result.
; Expected code: for tahiti four not/min/add lane sequences that reuse v4 as
; the scratch register for the inverted rhs once lane 0 is done. The other
; three targets use one clamped 32-bit add per lane.
339define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
340; GFX6-LABEL: v_uaddsat_v4i32:
341; GFX6:       ; %bb.0:
342; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
343; GFX6-NEXT:    v_not_b32_e32 v8, v4
344; GFX6-NEXT:    v_min_u32_e32 v0, v0, v8
345; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
346; GFX6-NEXT:    v_not_b32_e32 v4, v5
347; GFX6-NEXT:    v_min_u32_e32 v1, v1, v4
348; GFX6-NEXT:    v_not_b32_e32 v4, v6
349; GFX6-NEXT:    v_min_u32_e32 v2, v2, v4
350; GFX6-NEXT:    v_not_b32_e32 v4, v7
351; GFX6-NEXT:    v_min_u32_e32 v3, v3, v4
352; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
353; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
354; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
355; GFX6-NEXT:    s_setpc_b64 s[30:31]
356;
357; GFX8-LABEL: v_uaddsat_v4i32:
358; GFX8:       ; %bb.0:
359; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
360; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v4 clamp
361; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v5 clamp
362; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v6 clamp
363; GFX8-NEXT:    v_add_u32_e64 v3, s[4:5], v3, v7 clamp
364; GFX8-NEXT:    s_setpc_b64 s[30:31]
365;
366; GFX9-LABEL: v_uaddsat_v4i32:
367; GFX9:       ; %bb.0:
368; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
369; GFX9-NEXT:    v_add_u32_e64 v0, v0, v4 clamp
370; GFX9-NEXT:    v_add_u32_e64 v1, v1, v5 clamp
371; GFX9-NEXT:    v_add_u32_e64 v2, v2, v6 clamp
372; GFX9-NEXT:    v_add_u32_e64 v3, v3, v7 clamp
373; GFX9-NEXT:    s_setpc_b64 s[30:31]
374;
375; GFX10-LABEL: v_uaddsat_v4i32:
376; GFX10:       ; %bb.0:
377; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
378; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
379; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v4 clamp
380; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v5 clamp
381; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, v6 clamp
382; GFX10-NEXT:    v_add_nc_u32_e64 v3, v3, v7 clamp
383; GFX10-NEXT:    s_setpc_b64 s[30:31]
384  %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
385  ret <4 x i32> %result
386}
387
; <8 x i32> unsigned saturating add: forwards both vectors to
; @llvm.uadd.sat.v8i32 and returns the result.
; Expected code: for tahiti eight not/min/add lane sequences, with lhs lanes
; in v0-v7 and rhs lanes in v8-v15. The other three targets use one clamped
; 32-bit add per lane.
388define <8 x i32> @v_uaddsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
389; GFX6-LABEL: v_uaddsat_v8i32:
390; GFX6:       ; %bb.0:
391; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
392; GFX6-NEXT:    v_not_b32_e32 v16, v8
393; GFX6-NEXT:    v_min_u32_e32 v0, v0, v16
394; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
395; GFX6-NEXT:    v_not_b32_e32 v8, v9
396; GFX6-NEXT:    v_min_u32_e32 v1, v1, v8
397; GFX6-NEXT:    v_not_b32_e32 v8, v10
398; GFX6-NEXT:    v_min_u32_e32 v2, v2, v8
399; GFX6-NEXT:    v_not_b32_e32 v8, v11
400; GFX6-NEXT:    v_min_u32_e32 v3, v3, v8
401; GFX6-NEXT:    v_not_b32_e32 v8, v12
402; GFX6-NEXT:    v_min_u32_e32 v4, v4, v8
403; GFX6-NEXT:    v_not_b32_e32 v8, v13
404; GFX6-NEXT:    v_min_u32_e32 v5, v5, v8
405; GFX6-NEXT:    v_not_b32_e32 v8, v14
406; GFX6-NEXT:    v_min_u32_e32 v6, v6, v8
407; GFX6-NEXT:    v_not_b32_e32 v8, v15
408; GFX6-NEXT:    v_min_u32_e32 v7, v7, v8
409; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
410; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
411; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
412; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
413; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
414; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
415; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
416; GFX6-NEXT:    s_setpc_b64 s[30:31]
417;
418; GFX8-LABEL: v_uaddsat_v8i32:
419; GFX8:       ; %bb.0:
420; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
421; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v8 clamp
422; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v9 clamp
423; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v10 clamp
424; GFX8-NEXT:    v_add_u32_e64 v3, s[4:5], v3, v11 clamp
425; GFX8-NEXT:    v_add_u32_e64 v4, s[4:5], v4, v12 clamp
426; GFX8-NEXT:    v_add_u32_e64 v5, s[4:5], v5, v13 clamp
427; GFX8-NEXT:    v_add_u32_e64 v6, s[4:5], v6, v14 clamp
428; GFX8-NEXT:    v_add_u32_e64 v7, s[4:5], v7, v15 clamp
429; GFX8-NEXT:    s_setpc_b64 s[30:31]
430;
431; GFX9-LABEL: v_uaddsat_v8i32:
432; GFX9:       ; %bb.0:
433; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
434; GFX9-NEXT:    v_add_u32_e64 v0, v0, v8 clamp
435; GFX9-NEXT:    v_add_u32_e64 v1, v1, v9 clamp
436; GFX9-NEXT:    v_add_u32_e64 v2, v2, v10 clamp
437; GFX9-NEXT:    v_add_u32_e64 v3, v3, v11 clamp
438; GFX9-NEXT:    v_add_u32_e64 v4, v4, v12 clamp
439; GFX9-NEXT:    v_add_u32_e64 v5, v5, v13 clamp
440; GFX9-NEXT:    v_add_u32_e64 v6, v6, v14 clamp
441; GFX9-NEXT:    v_add_u32_e64 v7, v7, v15 clamp
442; GFX9-NEXT:    s_setpc_b64 s[30:31]
443;
444; GFX10-LABEL: v_uaddsat_v8i32:
445; GFX10:       ; %bb.0:
446; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
447; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
448; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v8 clamp
449; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v9 clamp
450; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, v10 clamp
451; GFX10-NEXT:    v_add_nc_u32_e64 v3, v3, v11 clamp
452; GFX10-NEXT:    v_add_nc_u32_e64 v4, v4, v12 clamp
453; GFX10-NEXT:    v_add_nc_u32_e64 v5, v5, v13 clamp
454; GFX10-NEXT:    v_add_nc_u32_e64 v6, v6, v14 clamp
455; GFX10-NEXT:    v_add_nc_u32_e64 v7, v7, v15 clamp
456; GFX10-NEXT:    s_setpc_b64 s[30:31]
457  %result = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs)
458  ret <8 x i32> %result
459}
460
; <16 x i32> unsigned saturating add: forwards both vectors to
; @llvm.uadd.sat.v16i32 and returns the result.
; Expected code: lanes 0-14 follow the same per-lane pattern as the smaller
; i32 vector tests (not/min/add for tahiti, one clamped add elsewhere), with
; rhs lanes 0-14 in v16-v30. The rhs operand for lane 15 is fetched with
; buffer_load_dword from s[0:3] at offset register s32, and an
; s_waitcnt vmcnt(0) precedes its use. NOTE(review): presumably the 32nd
; argument dword is passed on the stack by the calling convention -- confirm.
461define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
462; GFX6-LABEL: v_uaddsat_v16i32:
463; GFX6:       ; %bb.0:
464; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
465; GFX6-NEXT:    v_not_b32_e32 v31, v16
466; GFX6-NEXT:    v_min_u32_e32 v0, v0, v31
467; GFX6-NEXT:    buffer_load_dword v31, off, s[0:3], s32
468; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v16
469; GFX6-NEXT:    v_not_b32_e32 v16, v17
470; GFX6-NEXT:    v_min_u32_e32 v1, v1, v16
471; GFX6-NEXT:    v_not_b32_e32 v16, v18
472; GFX6-NEXT:    v_min_u32_e32 v2, v2, v16
473; GFX6-NEXT:    v_not_b32_e32 v16, v19
474; GFX6-NEXT:    v_min_u32_e32 v3, v3, v16
475; GFX6-NEXT:    v_not_b32_e32 v16, v20
476; GFX6-NEXT:    v_min_u32_e32 v4, v4, v16
477; GFX6-NEXT:    v_not_b32_e32 v16, v21
478; GFX6-NEXT:    v_min_u32_e32 v5, v5, v16
479; GFX6-NEXT:    v_not_b32_e32 v16, v22
480; GFX6-NEXT:    v_min_u32_e32 v6, v6, v16
481; GFX6-NEXT:    v_not_b32_e32 v16, v23
482; GFX6-NEXT:    v_min_u32_e32 v7, v7, v16
483; GFX6-NEXT:    v_not_b32_e32 v16, v24
484; GFX6-NEXT:    v_min_u32_e32 v8, v8, v16
485; GFX6-NEXT:    v_not_b32_e32 v16, v25
486; GFX6-NEXT:    v_min_u32_e32 v9, v9, v16
487; GFX6-NEXT:    v_not_b32_e32 v16, v26
488; GFX6-NEXT:    v_min_u32_e32 v10, v10, v16
489; GFX6-NEXT:    v_not_b32_e32 v16, v27
490; GFX6-NEXT:    v_min_u32_e32 v11, v11, v16
491; GFX6-NEXT:    v_not_b32_e32 v16, v28
492; GFX6-NEXT:    v_min_u32_e32 v12, v12, v16
493; GFX6-NEXT:    v_not_b32_e32 v16, v29
494; GFX6-NEXT:    v_min_u32_e32 v13, v13, v16
495; GFX6-NEXT:    v_not_b32_e32 v16, v30
496; GFX6-NEXT:    v_min_u32_e32 v14, v14, v16
497; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v17
498; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v18
499; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v19
500; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v20
501; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v21
502; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v22
503; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v23
504; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v24
505; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v25
506; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v26
507; GFX6-NEXT:    v_add_i32_e32 v11, vcc, v11, v27
508; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v12, v28
509; GFX6-NEXT:    v_add_i32_e32 v13, vcc, v13, v29
510; GFX6-NEXT:    v_add_i32_e32 v14, vcc, v14, v30
511; GFX6-NEXT:    s_waitcnt vmcnt(0)
512; GFX6-NEXT:    v_not_b32_e32 v16, v31
513; GFX6-NEXT:    v_min_u32_e32 v15, v15, v16
514; GFX6-NEXT:    v_add_i32_e32 v15, vcc, v15, v31
515; GFX6-NEXT:    s_setpc_b64 s[30:31]
516;
517; GFX8-LABEL: v_uaddsat_v16i32:
518; GFX8:       ; %bb.0:
519; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
520; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v16 clamp
521; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
522; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v17 clamp
523; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v18 clamp
524; GFX8-NEXT:    v_add_u32_e64 v3, s[4:5], v3, v19 clamp
525; GFX8-NEXT:    v_add_u32_e64 v4, s[4:5], v4, v20 clamp
526; GFX8-NEXT:    v_add_u32_e64 v5, s[4:5], v5, v21 clamp
527; GFX8-NEXT:    v_add_u32_e64 v6, s[4:5], v6, v22 clamp
528; GFX8-NEXT:    v_add_u32_e64 v7, s[4:5], v7, v23 clamp
529; GFX8-NEXT:    v_add_u32_e64 v8, s[4:5], v8, v24 clamp
530; GFX8-NEXT:    v_add_u32_e64 v9, s[4:5], v9, v25 clamp
531; GFX8-NEXT:    v_add_u32_e64 v10, s[4:5], v10, v26 clamp
532; GFX8-NEXT:    v_add_u32_e64 v11, s[4:5], v11, v27 clamp
533; GFX8-NEXT:    v_add_u32_e64 v12, s[4:5], v12, v28 clamp
534; GFX8-NEXT:    v_add_u32_e64 v13, s[4:5], v13, v29 clamp
535; GFX8-NEXT:    v_add_u32_e64 v14, s[4:5], v14, v30 clamp
536; GFX8-NEXT:    s_waitcnt vmcnt(0)
537; GFX8-NEXT:    v_add_u32_e64 v15, s[4:5], v15, v16 clamp
538; GFX8-NEXT:    s_setpc_b64 s[30:31]
539;
540; GFX9-LABEL: v_uaddsat_v16i32:
541; GFX9:       ; %bb.0:
542; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
543; GFX9-NEXT:    v_add_u32_e64 v0, v0, v16 clamp
544; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
545; GFX9-NEXT:    v_add_u32_e64 v1, v1, v17 clamp
546; GFX9-NEXT:    v_add_u32_e64 v2, v2, v18 clamp
547; GFX9-NEXT:    v_add_u32_e64 v3, v3, v19 clamp
548; GFX9-NEXT:    v_add_u32_e64 v4, v4, v20 clamp
549; GFX9-NEXT:    v_add_u32_e64 v5, v5, v21 clamp
550; GFX9-NEXT:    v_add_u32_e64 v6, v6, v22 clamp
551; GFX9-NEXT:    v_add_u32_e64 v7, v7, v23 clamp
552; GFX9-NEXT:    v_add_u32_e64 v8, v8, v24 clamp
553; GFX9-NEXT:    v_add_u32_e64 v9, v9, v25 clamp
554; GFX9-NEXT:    v_add_u32_e64 v10, v10, v26 clamp
555; GFX9-NEXT:    v_add_u32_e64 v11, v11, v27 clamp
556; GFX9-NEXT:    v_add_u32_e64 v12, v12, v28 clamp
557; GFX9-NEXT:    v_add_u32_e64 v13, v13, v29 clamp
558; GFX9-NEXT:    v_add_u32_e64 v14, v14, v30 clamp
559; GFX9-NEXT:    s_waitcnt vmcnt(0)
560; GFX9-NEXT:    v_add_u32_e64 v15, v15, v16 clamp
561; GFX9-NEXT:    s_setpc_b64 s[30:31]
562;
563; GFX10-LABEL: v_uaddsat_v16i32:
564; GFX10:       ; %bb.0:
565; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
566; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
567; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
568; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v16 clamp
569; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v17 clamp
570; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, v18 clamp
571; GFX10-NEXT:    v_add_nc_u32_e64 v3, v3, v19 clamp
572; GFX10-NEXT:    v_add_nc_u32_e64 v4, v4, v20 clamp
573; GFX10-NEXT:    v_add_nc_u32_e64 v5, v5, v21 clamp
574; GFX10-NEXT:    v_add_nc_u32_e64 v6, v6, v22 clamp
575; GFX10-NEXT:    v_add_nc_u32_e64 v7, v7, v23 clamp
576; GFX10-NEXT:    v_add_nc_u32_e64 v8, v8, v24 clamp
577; GFX10-NEXT:    v_add_nc_u32_e64 v9, v9, v25 clamp
578; GFX10-NEXT:    v_add_nc_u32_e64 v10, v10, v26 clamp
579; GFX10-NEXT:    v_add_nc_u32_e64 v11, v11, v27 clamp
580; GFX10-NEXT:    v_add_nc_u32_e64 v12, v12, v28 clamp
581; GFX10-NEXT:    v_add_nc_u32_e64 v13, v13, v29 clamp
582; GFX10-NEXT:    v_add_nc_u32_e64 v14, v14, v30 clamp
583; GFX10-NEXT:    s_waitcnt vmcnt(0)
584; GFX10-NEXT:    v_add_nc_u32_e64 v15, v15, v31 clamp
585; GFX10-NEXT:    s_setpc_b64 s[30:31]
586  %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
587  ret <16 x i32> %result
588}
589
590
; Scalar i64 unsigned saturating add: forwards both arguments to
; @llvm.uadd.sat.i64 and returns the result.
; No target is expected to use a clamp modifier here. All four sequences have
; the same shape: a low-half add followed by an add-with-carry for the high
; half, an unsigned 64-bit compare of the sum (v[2:3]) against lhs (v[0:1]),
; and two v_cndmask_b32 that select -1 for each half when the sum is smaller
; than lhs. Only the add mnemonics and the vcc / vcc_lo spelling differ.
591define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
592; GFX6-LABEL: v_uaddsat_i64:
593; GFX6:       ; %bb.0:
594; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
595; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
596; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
597; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
598; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
599; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc
600; GFX6-NEXT:    s_setpc_b64 s[30:31]
601;
602; GFX8-LABEL: v_uaddsat_i64:
603; GFX8:       ; %bb.0:
604; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
605; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
606; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
607; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
608; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
609; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc
610; GFX8-NEXT:    s_setpc_b64 s[30:31]
611;
612; GFX9-LABEL: v_uaddsat_i64:
613; GFX9:       ; %bb.0:
614; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
615; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
616; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
617; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
618; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
619; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc
620; GFX9-NEXT:    s_setpc_b64 s[30:31]
621;
622; GFX10-LABEL: v_uaddsat_i64:
623; GFX10:       ; %bb.0:
624; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
625; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
626; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, v2
627; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
628; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
629; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc_lo
630; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc_lo
631; GFX10-NEXT:    s_setpc_b64 s[30:31]
632  %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
633  ret i64 %result
634}
635
; Declarations of the llvm.uadd.sat overloads called by the test functions in
; this file: scalar i8/i16/i32/i64, the i16 vectors of 2, 3 and 4 elements,
; and the i32 vectors of 2, 3, 4, 8 and 16 elements. Every declaration refers
; to attribute group #0, which is defined after the list.
636declare i8 @llvm.uadd.sat.i8(i8, i8) #0
637declare i16 @llvm.uadd.sat.i16(i16, i16) #0
638declare <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16>, <2 x i16>) #0
639declare <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16>, <3 x i16>) #0
640declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) #0
641declare i32 @llvm.uadd.sat.i32(i32, i32) #0
642declare <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32>, <2 x i32>) #0
643declare <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32>, <3 x i32>) #0
644declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) #0
645declare <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32>, <8 x i32>) #0
646declare <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32>, <16 x i32>) #0
647declare i64 @llvm.uadd.sat.i64(i64, i64) #0
648
; Attribute group shared by all of the intrinsic declarations above.
649attributes #0 = { nounwind readnone speculatable willreturn }
650