; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10


declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone


declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: saddo_i64_zext:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    s_add_u32 s10, s6, s8
; SI-NEXT:    s_addc_u32 s11, s7, s9
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
; SI-NEXT:    v_cmp_lt_i64_e64 s[6:7], s[8:9], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_xor_b64 s[4:5], s[6:7], vcc
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v1, s11
; SI-NEXT:    v_add_i32_e32 v0, vcc, s10, v0
; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: saddo_i64_zext:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s6
; VI-NEXT:    s_add_u32 s2, s6, s0
; VI-NEXT:    v_mov_b32_e32 v2, s7
; VI-NEXT:    s_addc_u32 s3, s7, s1
; VI-NEXT:    v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    s_xor_b64 s[0:1], s[8:9], vcc
; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: saddo_i64_zext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    s_add_u32 s0, s6, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    s_addc_u32 s1, s7, s3
; GFX9-NEXT:    v_cmp_lt_i64_e64 s[8:9], s[2:3], 0
; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_xor_b64 s[2:3], s[8:9], vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: saddo_i64_zext:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s0, s6, s2
; GFX10-NEXT:    s_addc_u32 s1, s7, s3
; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[2:3], 0
; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[0:1], s[6:7]
; GFX10-NEXT:    s_xor_b32 s2, s2, s3
; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
; GFX10-NEXT:    v_add_co_u32 v0, s0, s0, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT:    s_endpgm
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  %ext = zext i1 %carry to i64
  %add2 = add i64 %val, %ext
  store i64 %add2, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_saddo_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_add_i32 s12, s8, s9
; SI-NEXT:    s_cmp_lt_i32 s9, 0
; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
; SI-NEXT:    s_cmp_lt_i32 s12, s8
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
; SI-NEXT:    v_mov_b32_e32 v0, s12
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_xor_b64 s[0:1], s[10:11], s[8:9]
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_saddo_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    s_add_i32 s4, s0, s1
; VI-NEXT:    s_cmp_lt_i32 s1, 0
; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; VI-NEXT:    s_cmp_lt_i32 s4, s0
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_store_dword v[0:1], v4
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT:    flat_store_byte v[2:3], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_saddo_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    s_add_i32 s0, s2, s3
; GFX9-NEXT:    v_add_i32 v1, s2, v1 clamp
; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, s0, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_saddo_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_nc_i32 v0, s2, s3 clamp
; GFX10-NEXT:    s_add_i32 s0, s2, s3
; GFX10-NEXT:    v_mov_b32_e32 v2, s0
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s0, v0
; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT:    global_store_dword v1, v2, s[4:5]
; GFX10-NEXT:    global_store_byte v1, v0, s[6:7]
; GFX10-NEXT:    s_endpgm
  %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
  %val = extractvalue { i32, i1 } %sadd, 0
  %carry = extractvalue { i32, i1 } %sadd, 1
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v2, vcc, v1, v0
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], v2, v0
; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT:    buffer_store_dword v2, off, s[8:11], 0
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_saddo_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_dword v4, v[0:1]
; VI-NEXT:    flat_load_dword v5, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v6, vcc, v5, v4
; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v5
; VI-NEXT:    v_cmp_lt_i32_e64 s[0:1], v6, v4
; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT:    flat_store_dword v[0:1], v6
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT:    flat_store_byte v[2:3], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_saddo_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_i32 v3, v1, v2 clamp
; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v3
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_saddo_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_i32 v3, v1, v2 clamp
; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v3
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    global_store_byte v0, v2, s[2:3]
; GFX10-NEXT:    s_endpgm
  %a = load i32, i32 addrspace(1)* %aptr, align 4
  %b = load i32, i32 addrspace(1)* %bptr, align 4
  %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
  %val = extractvalue { i32, i1 } %sadd, 0
  %carry = extractvalue { i32, i1 } %sadd, 1
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
; SI-LABEL: s_saddo_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_add_u32 s12, s4, s6
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    s_addc_u32 s13, s5, s7
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
; SI-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[6:7], 0
; SI-NEXT:    v_mov_b32_e32 v0, s12
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_mov_b32_e32 v1, s13
; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; SI-NEXT:    s_mov_b32 s0, s2
; SI-NEXT:    s_mov_b32 s1, s3
; SI-NEXT:    s_mov_b32 s2, s10
; SI-NEXT:    s_mov_b32 s3, s11
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_saddo_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_add_u32 s0, s4, s6
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_addc_u32 s1, s5, s7
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
; VI-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT:    flat_store_byte v[2:3], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_saddo_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_add_u32 s8, s4, s6
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    s_addc_u32 s9, s5, s7
; GFX9-NEXT:    v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, s8
; GFX9-NEXT:    v_mov_b32_e32 v1, s9
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_saddo_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s8, s4, s6
; GFX10-NEXT:    s_addc_u32 s9, s5, s7
; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, s[6:7], 0
; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
; GFX10-NEXT:    v_mov_b32_e32 v0, s8
; GFX10-NEXT:    v_mov_b32_e32 v1, s9
; GFX10-NEXT:    s_xor_b32 s4, s6, s4
; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    global_store_byte v2, v3, s[2:3]
; GFX10-NEXT:    s_endpgm
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
; SI-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; SI-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_saddo_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v6, s2
; VI-NEXT:    v_mov_b32_e32 v7, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v9, vcc, v1, v3, vcc
; VI-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; VI-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT:    flat_store_byte v[6:7], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_saddo_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v6, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[8:9]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
; GFX9-NEXT:    global_store_dwordx2 v6, v[4:5], s[4:5]
; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT:    global_store_byte v6, v0, s[6:7]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_saddo_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v6, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[8:9]
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, s0
; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX10-NEXT:    global_store_dwordx2 v6, v[4:5], s[4:5]
; GFX10-NEXT:    global_store_byte v6, v0, s[6:7]
; GFX10-NEXT:    s_endpgm
  %a = load i64, i64 addrspace(1)* %aptr, align 4
  %b = load i64, i64 addrspace(1)* %bptr, align 4
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v5, vcc, v1, v3
; SI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v3
; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], v5, v1
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
; SI-NEXT:    v_cmp_lt_i32_e64 s[2:3], v4, v0
; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_saddo_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v6, s2
; VI-NEXT:    v_mov_b32_e32 v7, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v9, vcc, v1, v3
; VI-NEXT:    v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v3
; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], v9, v1
; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
; VI-NEXT:    v_cmp_lt_i32_e64 s[2:3], v8, v0
; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
; VI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[2:3]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_saddo_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v6, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[4:5]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v5, v1, v3
; GFX9-NEXT:    v_add_i32 v1, v1, v3 clamp
; GFX9-NEXT:    v_add_u32_e32 v4, v0, v2
; GFX9-NEXT:    v_add_i32 v0, v0, v2 clamp
; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v1
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v0
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT:    global_store_dwordx2 v6, v[4:5], s[0:1]
; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_saddo_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v5, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v5, s[4:5]
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v5, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v4, v1, v3
; GFX10-NEXT:    v_add_nc_i32 v1, v1, v3 clamp
; GFX10-NEXT:    v_add_nc_u32_e32 v3, v0, v2
; GFX10-NEXT:    v_add_nc_i32 v0, v0, v2 clamp
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v4, v1
; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v3, v0
; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v5, v[3:4], s[0:1]
; GFX10-NEXT:    global_store_dwordx2 v5, v[0:1], s[2:3]
; GFX10-NEXT:    s_endpgm
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
  %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
  %carry.ext = zext <2 x i1> %carry to <2 x i32>
  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
  ret void
}