1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
3; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
4; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
5; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10
6; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
7
8
9declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
10declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
11
12
13declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
14
; Signed add-with-overflow on two i64 kernel arguments (%a, %b). The i1
; overflow flag is zero-extended to i64 and added back onto the sum, and only
; that combined i64 value is stored to %out; the flag is never stored on its
; own. The per-target assertions below are autogenerated (see the NOTE at the
; top of the file) and should be regenerated rather than edited by hand.
15define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
16; SI-LABEL: saddo_i64_zext:
17; SI:       ; %bb.0:
18; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
19; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
20; SI-NEXT:    s_mov_b32 s3, 0xf000
21; SI-NEXT:    s_mov_b32 s2, -1
22; SI-NEXT:    s_waitcnt lgkmcnt(0)
23; SI-NEXT:    v_mov_b32_e32 v0, s6
24; SI-NEXT:    s_add_u32 s10, s6, s8
25; SI-NEXT:    s_addc_u32 s11, s7, s9
26; SI-NEXT:    v_mov_b32_e32 v1, s7
27; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
28; SI-NEXT:    v_cmp_lt_i64_e64 s[6:7], s[8:9], 0
29; SI-NEXT:    s_mov_b32 s0, s4
30; SI-NEXT:    s_mov_b32 s1, s5
31; SI-NEXT:    s_xor_b64 s[4:5], s[6:7], vcc
32; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
33; SI-NEXT:    v_mov_b32_e32 v1, s11
34; SI-NEXT:    v_add_i32_e32 v0, vcc, s10, v0
35; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
36; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
37; SI-NEXT:    s_endpgm
38;
39; VI-LABEL: saddo_i64_zext:
40; VI:       ; %bb.0:
41; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
42; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
43; VI-NEXT:    s_waitcnt lgkmcnt(0)
44; VI-NEXT:    v_mov_b32_e32 v1, s6
45; VI-NEXT:    s_add_u32 s2, s6, s0
46; VI-NEXT:    v_mov_b32_e32 v2, s7
47; VI-NEXT:    s_addc_u32 s3, s7, s1
48; VI-NEXT:    v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
49; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
50; VI-NEXT:    v_mov_b32_e32 v3, s3
51; VI-NEXT:    s_xor_b64 s[0:1], s[8:9], vcc
52; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
53; VI-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
54; VI-NEXT:    v_mov_b32_e32 v0, s4
55; VI-NEXT:    v_mov_b32_e32 v1, s5
56; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
57; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
58; VI-NEXT:    s_endpgm
59;
60; GFX9-LABEL: saddo_i64_zext:
61; GFX9:       ; %bb.0:
62; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
63; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
64; GFX9-NEXT:    v_mov_b32_e32 v2, 0
65; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX9-NEXT:    v_mov_b32_e32 v0, s6
67; GFX9-NEXT:    s_add_u32 s0, s6, s2
68; GFX9-NEXT:    v_mov_b32_e32 v1, s7
69; GFX9-NEXT:    s_addc_u32 s1, s7, s3
70; GFX9-NEXT:    v_cmp_lt_i64_e64 s[8:9], s[2:3], 0
71; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
72; GFX9-NEXT:    v_mov_b32_e32 v1, s1
73; GFX9-NEXT:    s_xor_b64 s[2:3], s[8:9], vcc
74; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
75; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
76; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
77; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
78; GFX9-NEXT:    s_endpgm
79;
80; GFX10-LABEL: saddo_i64_zext:
81; GFX10:       ; %bb.0:
82; GFX10-NEXT:    s_clause 0x1
83; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
84; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
85; GFX10-NEXT:    v_mov_b32_e32 v2, 0
86; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
87; GFX10-NEXT:    s_add_u32 s0, s6, s2
88; GFX10-NEXT:    s_addc_u32 s1, s7, s3
89; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[2:3], 0
90; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[0:1], s[6:7]
91; GFX10-NEXT:    s_xor_b32 s2, s2, s3
92; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
93; GFX10-NEXT:    v_add_co_u32 v0, s0, s0, v0
94; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
95; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
96; GFX10-NEXT:    s_endpgm
97;
98; GFX11-LABEL: saddo_i64_zext:
99; GFX11:       ; %bb.0:
100; GFX11-NEXT:    s_clause 0x1
101; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
102; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
103; GFX11-NEXT:    v_mov_b32_e32 v2, 0
104; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
105; GFX11-NEXT:    s_add_u32 s2, s6, s0
106; GFX11-NEXT:    s_addc_u32 s3, s7, s1
107; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[0:1], 0
108; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], s[6:7]
109; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
110; GFX11-NEXT:    s_xor_b32 s0, s0, s1
111; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
112; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
113; GFX11-NEXT:    v_add_co_u32 v0, s0, s2, v0
114; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0
115; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
116; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
117; GFX11-NEXT:    s_endpgm
  ; The intrinsic returns a pair: field 0 is the i64 sum, field 1 the i1
  ; overflow flag.
118  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
119  %val = extractvalue { i64, i1 } %sadd, 0
120  %carry = extractvalue { i64, i1 } %sadd, 1
  ; Fold the flag into the stored value: %out receives sum + (overflow ? 1 : 0).
121  %ext = zext i1 %carry to i64
122  %add2 = add i64 %val, %ext
123  store i64 %add2, i64 addrspace(1)* %out, align 8
124  ret void
125}
126
; Signed add-with-overflow on two i32 values passed directly as kernel
; arguments (%a, %b). The i32 sum is stored to %out and the i1 overflow flag is
; stored separately to %carryout. The per-target assertions below are
; autogenerated and should be regenerated rather than edited by hand.
127define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
128; SI-LABEL: s_saddo_i32:
129; SI:       ; %bb.0:
130; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
131; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
132; SI-NEXT:    s_mov_b32 s3, 0xf000
133; SI-NEXT:    s_mov_b32 s2, -1
134; SI-NEXT:    s_waitcnt lgkmcnt(0)
135; SI-NEXT:    s_mov_b32 s0, s4
136; SI-NEXT:    s_add_i32 s12, s8, s9
137; SI-NEXT:    s_cmp_lt_i32 s9, 0
138; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
139; SI-NEXT:    s_cmp_lt_i32 s12, s8
140; SI-NEXT:    s_mov_b32 s1, s5
141; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
142; SI-NEXT:    v_mov_b32_e32 v0, s12
143; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
144; SI-NEXT:    s_xor_b64 s[0:1], s[10:11], s[8:9]
145; SI-NEXT:    s_mov_b32 s4, s6
146; SI-NEXT:    s_mov_b32 s5, s7
147; SI-NEXT:    s_mov_b32 s6, s2
148; SI-NEXT:    s_mov_b32 s7, s3
149; SI-NEXT:    s_waitcnt expcnt(0)
150; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
151; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
152; SI-NEXT:    s_endpgm
153;
154; VI-LABEL: s_saddo_i32:
155; VI:       ; %bb.0:
156; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
157; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
158; VI-NEXT:    s_waitcnt lgkmcnt(0)
159; VI-NEXT:    v_mov_b32_e32 v0, s4
160; VI-NEXT:    s_add_i32 s4, s0, s1
161; VI-NEXT:    s_cmp_lt_i32 s1, 0
162; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
163; VI-NEXT:    s_cmp_lt_i32 s4, s0
164; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
165; VI-NEXT:    v_mov_b32_e32 v1, s5
166; VI-NEXT:    v_mov_b32_e32 v4, s4
167; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
168; VI-NEXT:    v_mov_b32_e32 v2, s6
169; VI-NEXT:    v_mov_b32_e32 v3, s7
170; VI-NEXT:    flat_store_dword v[0:1], v4
171; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
172; VI-NEXT:    flat_store_byte v[2:3], v0
173; VI-NEXT:    s_endpgm
174;
175; GFX9-LABEL: s_saddo_i32:
176; GFX9:       ; %bb.0:
177; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
178; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
179; GFX9-NEXT:    v_mov_b32_e32 v0, 0
180; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
181; GFX9-NEXT:    v_mov_b32_e32 v1, s3
182; GFX9-NEXT:    s_add_i32 s0, s2, s3
183; GFX9-NEXT:    v_add_i32 v1, s2, v1 clamp
184; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, s0, v1
185; GFX9-NEXT:    v_mov_b32_e32 v2, s0
186; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
187; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
188; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
189; GFX9-NEXT:    s_endpgm
190;
191; GFX10-LABEL: s_saddo_i32:
192; GFX10:       ; %bb.0:
193; GFX10-NEXT:    s_clause 0x1
194; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
195; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
196; GFX10-NEXT:    v_mov_b32_e32 v1, 0
197; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
198; GFX10-NEXT:    v_add_nc_i32 v0, s2, s3 clamp
199; GFX10-NEXT:    s_add_i32 s0, s2, s3
200; GFX10-NEXT:    v_mov_b32_e32 v2, s0
201; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s0, v0
202; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
203; GFX10-NEXT:    global_store_dword v1, v2, s[4:5]
204; GFX10-NEXT:    global_store_byte v1, v0, s[6:7]
205; GFX10-NEXT:    s_endpgm
206;
207; GFX11-LABEL: s_saddo_i32:
208; GFX11:       ; %bb.0:
209; GFX11-NEXT:    s_clause 0x1
210; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
211; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
212; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
213; GFX11-NEXT:    v_add_nc_i32 v0, s4, s5 clamp
214; GFX11-NEXT:    s_add_i32 s4, s4, s5
215; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
216; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
217; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s4, v0
218; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
219; GFX11-NEXT:    s_clause 0x1
220; GFX11-NEXT:    global_store_b32 v1, v2, s[0:1]
221; GFX11-NEXT:    global_store_b8 v1, v0, s[2:3]
222; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
223; GFX11-NEXT:    s_endpgm
  ; The intrinsic returns a pair: field 0 is the i32 sum, field 1 the i1
  ; overflow flag.
224  %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
225  %val = extractvalue { i32, i1 } %sadd, 0
226  %carry = extractvalue { i32, i1 } %sadd, 1
  ; Both results are written out: the sum as i32, the flag as a bare i1.
227  store i32 %val, i32 addrspace(1)* %out, align 4
228  store i1 %carry, i1 addrspace(1)* %carryout
229  ret void
230}
231
; Signed add-with-overflow on two i32 values that are first loaded from the
; addrspace(1) pointers %aptr and %bptr (instead of being passed by value as
; in @s_saddo_i32). The i32 sum is stored to %out and the i1 overflow flag to
; %carryout. The per-target assertions below are autogenerated and should be
; regenerated rather than edited by hand.
232define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
233; SI-LABEL: v_saddo_i32:
234; SI:       ; %bb.0:
235; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
236; SI-NEXT:    s_mov_b32 s11, 0xf000
237; SI-NEXT:    s_mov_b32 s10, -1
238; SI-NEXT:    s_mov_b32 s14, s10
239; SI-NEXT:    s_mov_b32 s15, s11
240; SI-NEXT:    s_waitcnt lgkmcnt(0)
241; SI-NEXT:    s_mov_b32 s12, s4
242; SI-NEXT:    s_mov_b32 s13, s5
243; SI-NEXT:    s_mov_b32 s4, s6
244; SI-NEXT:    s_mov_b32 s5, s7
245; SI-NEXT:    s_mov_b32 s6, s10
246; SI-NEXT:    s_mov_b32 s7, s11
247; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
248; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
249; SI-NEXT:    s_mov_b32 s8, s0
250; SI-NEXT:    s_mov_b32 s9, s1
251; SI-NEXT:    s_mov_b32 s4, s2
252; SI-NEXT:    s_mov_b32 s5, s3
253; SI-NEXT:    s_waitcnt vmcnt(0)
254; SI-NEXT:    v_add_i32_e32 v2, vcc, v0, v1
255; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
256; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], v2, v0
257; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
258; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
259; SI-NEXT:    buffer_store_dword v2, off, s[8:11], 0
260; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
261; SI-NEXT:    s_endpgm
262;
263; VI-LABEL: v_saddo_i32:
264; VI:       ; %bb.0:
265; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
266; VI-NEXT:    s_waitcnt lgkmcnt(0)
267; VI-NEXT:    v_mov_b32_e32 v0, s4
268; VI-NEXT:    v_mov_b32_e32 v1, s5
269; VI-NEXT:    v_mov_b32_e32 v2, s6
270; VI-NEXT:    v_mov_b32_e32 v3, s7
271; VI-NEXT:    flat_load_dword v4, v[0:1]
272; VI-NEXT:    flat_load_dword v5, v[2:3]
273; VI-NEXT:    v_mov_b32_e32 v0, s0
274; VI-NEXT:    v_mov_b32_e32 v1, s1
275; VI-NEXT:    v_mov_b32_e32 v2, s2
276; VI-NEXT:    v_mov_b32_e32 v3, s3
277; VI-NEXT:    s_waitcnt vmcnt(0)
278; VI-NEXT:    v_add_u32_e32 v6, vcc, v4, v5
279; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v5
280; VI-NEXT:    v_cmp_lt_i32_e64 s[0:1], v6, v4
281; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
282; VI-NEXT:    flat_store_dword v[0:1], v6
283; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
284; VI-NEXT:    flat_store_byte v[2:3], v0
285; VI-NEXT:    s_endpgm
286;
287; GFX9-LABEL: v_saddo_i32:
288; GFX9:       ; %bb.0:
289; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
290; GFX9-NEXT:    v_mov_b32_e32 v0, 0
291; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
293; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
294; GFX9-NEXT:    s_waitcnt vmcnt(0)
295; GFX9-NEXT:    v_add_i32 v3, v1, v2 clamp
296; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
297; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v3
298; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
299; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
300; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
301; GFX9-NEXT:    s_endpgm
302;
303; GFX10-LABEL: v_saddo_i32:
304; GFX10:       ; %bb.0:
305; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
306; GFX10-NEXT:    v_mov_b32_e32 v0, 0
307; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
308; GFX10-NEXT:    s_clause 0x1
309; GFX10-NEXT:    global_load_dword v1, v0, s[4:5]
310; GFX10-NEXT:    global_load_dword v2, v0, s[6:7]
311; GFX10-NEXT:    s_waitcnt vmcnt(0)
312; GFX10-NEXT:    v_add_nc_i32 v3, v1, v2 clamp
313; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
314; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v3
315; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
316; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
317; GFX10-NEXT:    global_store_byte v0, v2, s[2:3]
318; GFX10-NEXT:    s_endpgm
319;
320; GFX11-LABEL: v_saddo_i32:
321; GFX11:       ; %bb.0:
322; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
323; GFX11-NEXT:    v_mov_b32_e32 v0, 0
324; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
325; GFX11-NEXT:    s_clause 0x1
326; GFX11-NEXT:    global_load_b32 v1, v0, s[4:5]
327; GFX11-NEXT:    global_load_b32 v2, v0, s[6:7]
328; GFX11-NEXT:    s_waitcnt vmcnt(0)
329; GFX11-NEXT:    v_add_nc_i32 v3, v1, v2 clamp
330; GFX11-NEXT:    v_add_nc_u32_e32 v1, v1, v2
331; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
332; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v3
333; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
334; GFX11-NEXT:    s_clause 0x1
335; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
336; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
337; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
338; GFX11-NEXT:    s_endpgm
  ; Both operands come from memory rather than from by-value kernel arguments.
339  %a = load i32, i32 addrspace(1)* %aptr, align 4
340  %b = load i32, i32 addrspace(1)* %bptr, align 4
  ; The intrinsic returns a pair: field 0 is the i32 sum, field 1 the i1
  ; overflow flag.
341  %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
342  %val = extractvalue { i32, i1 } %sadd, 0
343  %carry = extractvalue { i32, i1 } %sadd, 1
344  store i32 %val, i32 addrspace(1)* %out, align 4
345  store i1 %carry, i1 addrspace(1)* %carryout
346  ret void
347}
348
; Signed add-with-overflow on two i64 values passed directly as kernel
; arguments (%a, %b). The i64 sum is stored to %out and the i1 overflow flag is
; stored separately to %carryout. The per-target assertions below are
; autogenerated and should be regenerated rather than edited by hand.
349define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
350; SI-LABEL: s_saddo_i64:
351; SI:       ; %bb.0:
352; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
353; SI-NEXT:    s_mov_b32 s11, 0xf000
354; SI-NEXT:    s_mov_b32 s10, -1
355; SI-NEXT:    s_waitcnt lgkmcnt(0)
356; SI-NEXT:    s_add_u32 s12, s4, s6
357; SI-NEXT:    v_mov_b32_e32 v0, s4
358; SI-NEXT:    s_addc_u32 s13, s5, s7
359; SI-NEXT:    v_mov_b32_e32 v1, s5
360; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
361; SI-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[6:7], 0
362; SI-NEXT:    v_mov_b32_e32 v0, s12
363; SI-NEXT:    s_mov_b32 s8, s0
364; SI-NEXT:    s_mov_b32 s9, s1
365; SI-NEXT:    v_mov_b32_e32 v1, s13
366; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
367; SI-NEXT:    s_mov_b32 s0, s2
368; SI-NEXT:    s_mov_b32 s1, s3
369; SI-NEXT:    s_mov_b32 s2, s10
370; SI-NEXT:    s_mov_b32 s3, s11
371; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
372; SI-NEXT:    s_waitcnt expcnt(0)
373; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
374; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
375; SI-NEXT:    s_endpgm
376;
377; VI-LABEL: s_saddo_i64:
378; VI:       ; %bb.0:
379; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
380; VI-NEXT:    s_waitcnt lgkmcnt(0)
381; VI-NEXT:    v_mov_b32_e32 v0, s0
382; VI-NEXT:    s_add_u32 s0, s4, s6
383; VI-NEXT:    v_mov_b32_e32 v4, s4
384; VI-NEXT:    v_mov_b32_e32 v1, s1
385; VI-NEXT:    s_addc_u32 s1, s5, s7
386; VI-NEXT:    v_mov_b32_e32 v5, s5
387; VI-NEXT:    v_mov_b32_e32 v2, s2
388; VI-NEXT:    v_mov_b32_e32 v3, s3
389; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
390; VI-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
391; VI-NEXT:    v_mov_b32_e32 v5, s1
392; VI-NEXT:    v_mov_b32_e32 v4, s0
393; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
394; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
395; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
396; VI-NEXT:    flat_store_byte v[2:3], v0
397; VI-NEXT:    s_endpgm
398;
399; GFX9-LABEL: s_saddo_i64:
400; GFX9:       ; %bb.0:
401; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
402; GFX9-NEXT:    v_mov_b32_e32 v2, 0
403; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
404; GFX9-NEXT:    s_add_u32 s8, s4, s6
405; GFX9-NEXT:    v_mov_b32_e32 v0, s4
406; GFX9-NEXT:    v_mov_b32_e32 v1, s5
407; GFX9-NEXT:    s_addc_u32 s9, s5, s7
408; GFX9-NEXT:    v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
409; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
410; GFX9-NEXT:    v_mov_b32_e32 v0, s8
411; GFX9-NEXT:    v_mov_b32_e32 v1, s9
412; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
413; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], vcc
414; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
415; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
416; GFX9-NEXT:    s_endpgm
417;
418; GFX10-LABEL: s_saddo_i64:
419; GFX10:       ; %bb.0:
420; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
421; GFX10-NEXT:    v_mov_b32_e32 v2, 0
422; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
423; GFX10-NEXT:    s_add_u32 s8, s4, s6
424; GFX10-NEXT:    s_addc_u32 s9, s5, s7
425; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, s[6:7], 0
426; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
427; GFX10-NEXT:    v_mov_b32_e32 v0, s8
428; GFX10-NEXT:    v_mov_b32_e32 v1, s9
429; GFX10-NEXT:    s_xor_b32 s4, s6, s4
430; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
431; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
432; GFX10-NEXT:    global_store_byte v2, v3, s[2:3]
433; GFX10-NEXT:    s_endpgm
434;
435; GFX11-LABEL: s_saddo_i64:
436; GFX11:       ; %bb.0:
437; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
438; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
439; GFX11-NEXT:    s_add_u32 s8, s4, s6
440; GFX11-NEXT:    s_addc_u32 s9, s5, s7
441; GFX11-NEXT:    v_cmp_lt_i64_e64 s6, s[6:7], 0
442; GFX11-NEXT:    v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
443; GFX11-NEXT:    v_mov_b32_e32 v0, s8
444; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
445; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
446; GFX11-NEXT:    s_xor_b32 s4, s6, s4
447; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
448; GFX11-NEXT:    s_clause 0x1
449; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
450; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
451; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
452; GFX11-NEXT:    s_endpgm
  ; The intrinsic returns a pair: field 0 is the i64 sum, field 1 the i1
  ; overflow flag.
453  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
454  %val = extractvalue { i64, i1 } %sadd, 0
455  %carry = extractvalue { i64, i1 } %sadd, 1
  ; Both results are written out: the sum as i64, the flag as a bare i1.
456  store i64 %val, i64 addrspace(1)* %out, align 8
457  store i1 %carry, i1 addrspace(1)* %carryout
458  ret void
459}
460
; Signed add-with-overflow on two i64 values that are first loaded from the
; addrspace(1) pointers %aptr and %bptr (instead of being passed by value as
; in @s_saddo_i64). The i64 sum is stored to %out and the i1 overflow flag to
; %carryout. The per-target assertions below are autogenerated and should be
; regenerated rather than edited by hand.
461define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
462; SI-LABEL: v_saddo_i64:
463; SI:       ; %bb.0:
464; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
465; SI-NEXT:    s_mov_b32 s11, 0xf000
466; SI-NEXT:    s_mov_b32 s10, -1
467; SI-NEXT:    s_mov_b32 s14, s10
468; SI-NEXT:    s_mov_b32 s15, s11
469; SI-NEXT:    s_waitcnt lgkmcnt(0)
470; SI-NEXT:    s_mov_b32 s12, s4
471; SI-NEXT:    s_mov_b32 s13, s5
472; SI-NEXT:    s_mov_b32 s4, s6
473; SI-NEXT:    s_mov_b32 s5, s7
474; SI-NEXT:    s_mov_b32 s6, s10
475; SI-NEXT:    s_mov_b32 s7, s11
476; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0
477; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
478; SI-NEXT:    s_mov_b32 s8, s0
479; SI-NEXT:    s_mov_b32 s9, s1
480; SI-NEXT:    s_mov_b32 s4, s2
481; SI-NEXT:    s_mov_b32 s5, s3
482; SI-NEXT:    s_waitcnt vmcnt(0)
483; SI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
484; SI-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
485; SI-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
486; SI-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
487; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0
488; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
489; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
490; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
491; SI-NEXT:    s_endpgm
492;
493; VI-LABEL: v_saddo_i64:
494; VI:       ; %bb.0:
495; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
496; VI-NEXT:    s_waitcnt lgkmcnt(0)
497; VI-NEXT:    v_mov_b32_e32 v0, s4
498; VI-NEXT:    v_mov_b32_e32 v1, s5
499; VI-NEXT:    v_mov_b32_e32 v2, s6
500; VI-NEXT:    v_mov_b32_e32 v3, s7
501; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
502; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
503; VI-NEXT:    v_mov_b32_e32 v4, s0
504; VI-NEXT:    v_mov_b32_e32 v5, s1
505; VI-NEXT:    v_mov_b32_e32 v6, s2
506; VI-NEXT:    v_mov_b32_e32 v7, s3
507; VI-NEXT:    s_waitcnt vmcnt(0)
508; VI-NEXT:    v_add_u32_e32 v8, vcc, v0, v2
509; VI-NEXT:    v_addc_u32_e32 v9, vcc, v1, v3, vcc
510; VI-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
511; VI-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
512; VI-NEXT:    flat_store_dwordx2 v[4:5], v[8:9]
513; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
514; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
515; VI-NEXT:    flat_store_byte v[6:7], v0
516; VI-NEXT:    s_endpgm
517;
518; GFX9-LABEL: v_saddo_i64:
519; GFX9:       ; %bb.0:
520; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
521; GFX9-NEXT:    v_mov_b32_e32 v6, 0
522; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
523; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[8:9]
524; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[10:11]
525; GFX9-NEXT:    s_waitcnt vmcnt(0)
526; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
527; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
528; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
529; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
530; GFX9-NEXT:    global_store_dwordx2 v6, v[4:5], s[4:5]
531; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
532; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
533; GFX9-NEXT:    global_store_byte v6, v0, s[6:7]
534; GFX9-NEXT:    s_endpgm
535;
536; GFX10-LABEL: v_saddo_i64:
537; GFX10:       ; %bb.0:
538; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
539; GFX10-NEXT:    v_mov_b32_e32 v6, 0
540; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
541; GFX10-NEXT:    s_clause 0x1
542; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[8:9]
543; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[10:11]
544; GFX10-NEXT:    s_waitcnt vmcnt(0)
545; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
546; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
547; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
548; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
549; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, s0
550; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
551; GFX10-NEXT:    global_store_dwordx2 v6, v[4:5], s[4:5]
552; GFX10-NEXT:    global_store_byte v6, v0, s[6:7]
553; GFX10-NEXT:    s_endpgm
554;
555; GFX11-LABEL: v_saddo_i64:
556; GFX11:       ; %bb.0:
557; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
558; GFX11-NEXT:    v_mov_b32_e32 v6, 0
559; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
560; GFX11-NEXT:    s_clause 0x1
561; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[8:9]
562; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[10:11]
563; GFX11-NEXT:    s_waitcnt vmcnt(0)
564; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
565; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
566; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
567; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
568; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
569; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, s0
570; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
571; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
572; GFX11-NEXT:    s_clause 0x1
573; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[4:5]
574; GFX11-NEXT:    global_store_b8 v6, v0, s[6:7]
575; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
576; GFX11-NEXT:    s_endpgm
  ; Both operands come from memory rather than from by-value kernel arguments.
  ; NOTE(review): these i64 loads are declared align 4 while the i64 store to
  ; %out below uses align 8 - presumably intentional; confirm before changing,
  ; since the autogenerated assertions depend on the exact IR.
577  %a = load i64, i64 addrspace(1)* %aptr, align 4
578  %b = load i64, i64 addrspace(1)* %bptr, align 4
  ; The intrinsic returns a pair: field 0 is the i64 sum, field 1 the i1
  ; overflow flag.
579  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
580  %val = extractvalue { i64, i1 } %sadd, 0
581  %carry = extractvalue { i64, i1 } %sadd, 1
582  store i64 %val, i64 addrspace(1)* %out, align 8
583  store i1 %carry, i1 addrspace(1)* %carryout
584  ret void
585}
586
; Vector form of the signed add-with-overflow test: two <2 x i32> operands are
; loaded from the addrspace(1) pointers %aptr and %bptr and added element-wise.
; The <2 x i32> sums are stored to %out; the <2 x i1> overflow flags are
; zero-extended to <2 x i32> before being stored to %carryout (note that
; %carryout is a <2 x i32> pointer here, not an i1 pointer as in the scalar
; tests). The per-target assertions below are autogenerated and should be
; regenerated rather than edited by hand.
587define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
588; SI-LABEL: v_saddo_v2i32:
589; SI:       ; %bb.0:
590; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
591; SI-NEXT:    s_mov_b32 s11, 0xf000
592; SI-NEXT:    s_mov_b32 s10, -1
593; SI-NEXT:    s_mov_b32 s14, s10
594; SI-NEXT:    s_mov_b32 s15, s11
595; SI-NEXT:    s_waitcnt lgkmcnt(0)
596; SI-NEXT:    s_mov_b32 s12, s4
597; SI-NEXT:    s_mov_b32 s13, s5
598; SI-NEXT:    s_mov_b32 s4, s6
599; SI-NEXT:    s_mov_b32 s5, s7
600; SI-NEXT:    s_mov_b32 s6, s10
601; SI-NEXT:    s_mov_b32 s7, s11
602; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0
603; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
604; SI-NEXT:    s_mov_b32 s8, s0
605; SI-NEXT:    s_mov_b32 s9, s1
606; SI-NEXT:    s_mov_b32 s12, s2
607; SI-NEXT:    s_mov_b32 s13, s3
608; SI-NEXT:    s_waitcnt vmcnt(0)
609; SI-NEXT:    v_add_i32_e32 v5, vcc, v1, v3
610; SI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
611; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v3
612; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], v5, v1
613; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
614; SI-NEXT:    v_cmp_lt_i32_e64 s[2:3], v4, v0
615; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
616; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
617; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[2:3]
618; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
619; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0
620; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0
621; SI-NEXT:    s_endpgm
622;
623; VI-LABEL: v_saddo_v2i32:
624; VI:       ; %bb.0:
625; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
626; VI-NEXT:    s_waitcnt lgkmcnt(0)
627; VI-NEXT:    v_mov_b32_e32 v0, s4
628; VI-NEXT:    v_mov_b32_e32 v1, s5
629; VI-NEXT:    v_mov_b32_e32 v2, s6
630; VI-NEXT:    v_mov_b32_e32 v3, s7
631; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
632; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
633; VI-NEXT:    v_mov_b32_e32 v4, s0
634; VI-NEXT:    v_mov_b32_e32 v5, s1
635; VI-NEXT:    v_mov_b32_e32 v6, s2
636; VI-NEXT:    v_mov_b32_e32 v7, s3
637; VI-NEXT:    s_waitcnt vmcnt(0)
638; VI-NEXT:    v_add_u32_e32 v9, vcc, v1, v3
639; VI-NEXT:    v_add_u32_e32 v8, vcc, v0, v2
640; VI-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v3
641; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], v9, v1
642; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
643; VI-NEXT:    v_cmp_lt_i32_e64 s[2:3], v8, v0
644; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
645; VI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
646; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[2:3]
647; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
648; VI-NEXT:    flat_store_dwordx2 v[4:5], v[8:9]
649; VI-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
650; VI-NEXT:    s_endpgm
651;
652; GFX9-LABEL: v_saddo_v2i32:
653; GFX9:       ; %bb.0:
654; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
655; GFX9-NEXT:    v_mov_b32_e32 v6, 0
656; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
657; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[4:5]
658; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[6:7]
659; GFX9-NEXT:    s_waitcnt vmcnt(0)
660; GFX9-NEXT:    v_add_u32_e32 v5, v1, v3
661; GFX9-NEXT:    v_add_i32 v1, v1, v3 clamp
662; GFX9-NEXT:    v_add_u32_e32 v4, v0, v2
663; GFX9-NEXT:    v_add_i32 v0, v0, v2 clamp
664; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v1
665; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
666; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v0
667; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
668; GFX9-NEXT:    global_store_dwordx2 v6, v[4:5], s[0:1]
669; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[2:3]
670; GFX9-NEXT:    s_endpgm
671;
672; GFX10-LABEL: v_saddo_v2i32:
673; GFX10:       ; %bb.0:
674; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
675; GFX10-NEXT:    v_mov_b32_e32 v5, 0
676; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX10-NEXT:    s_clause 0x1
678; GFX10-NEXT:    global_load_dwordx2 v[0:1], v5, s[4:5]
679; GFX10-NEXT:    global_load_dwordx2 v[2:3], v5, s[6:7]
680; GFX10-NEXT:    s_waitcnt vmcnt(0)
681; GFX10-NEXT:    v_add_nc_u32_e32 v4, v1, v3
682; GFX10-NEXT:    v_add_nc_i32 v1, v1, v3 clamp
683; GFX10-NEXT:    v_add_nc_u32_e32 v3, v0, v2
684; GFX10-NEXT:    v_add_nc_i32 v0, v0, v2 clamp
685; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v4, v1
686; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
687; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v3, v0
688; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
689; GFX10-NEXT:    global_store_dwordx2 v5, v[3:4], s[0:1]
690; GFX10-NEXT:    global_store_dwordx2 v5, v[0:1], s[2:3]
691; GFX10-NEXT:    s_endpgm
692;
693; GFX11-LABEL: v_saddo_v2i32:
694; GFX11:       ; %bb.0:
695; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
696; GFX11-NEXT:    v_mov_b32_e32 v5, 0
697; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
698; GFX11-NEXT:    s_clause 0x1
699; GFX11-NEXT:    global_load_b64 v[0:1], v5, s[4:5]
700; GFX11-NEXT:    global_load_b64 v[2:3], v5, s[6:7]
701; GFX11-NEXT:    s_waitcnt vmcnt(0)
702; GFX11-NEXT:    v_add_nc_u32_e32 v4, v1, v3
703; GFX11-NEXT:    v_add_nc_i32 v1, v1, v3 clamp
704; GFX11-NEXT:    v_add_nc_u32_e32 v3, v0, v2
705; GFX11-NEXT:    v_add_nc_i32 v0, v0, v2 clamp
706; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
707; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v4, v1
708; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
709; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v3, v0
710; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
711; GFX11-NEXT:    s_clause 0x1
712; GFX11-NEXT:    global_store_b64 v5, v[3:4], s[0:1]
713; GFX11-NEXT:    global_store_b64 v5, v[0:1], s[2:3]
714; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
715; GFX11-NEXT:    s_endpgm
  ; Both vector operands come from memory.
716  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
717  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
  ; The intrinsic returns a pair: field 0 holds the per-element sums, field 1
  ; the per-element i1 overflow flags.
718  %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
719  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
720  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
721  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
  ; Widen each flag to a full i32 (0 or 1) so the flags are stored as a
  ; <2 x i32> rather than as a <2 x i1>.
722  %carry.ext = zext <2 x i1> %carry to <2 x i32>
723  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
724  ret void
725}
726