; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s --check-prefix=GFX11

; Codegen checks for the llvm.sadd.with.overflow intrinsics (i32, i64 and
; <2 x i32>) inside amdgpu_kernel functions, compiled for five -mcpu targets
; (tahiti, tonga, gfx900, gfx1010, gfx1100), each with its own check prefix.
;
; The check lines below are autogenerated; regenerate them with the script
; named on the first line of this file instead of editing them by hand.
;
; In every test, field 0 of the intrinsic result is the sum and field 1 is
; the i1 signed-overflow flag (named %carry in the IR).

declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone


declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

; i64 signed add-with-overflow of the two kernel arguments %a and %b.
; The i1 overflow flag is zero-extended to i64 and added to the sum; only that
; combined value is stored to %out (the flag itself is never stored).
define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: saddo_i64_zext:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_add_u32 s10, s6, s8
; SI-NEXT: s_addc_u32 s11, s7, s9
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
; SI-NEXT: v_cmp_lt_i64_e64 s[6:7], s[8:9], 0
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: saddo_i64_zext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: s_add_u32 s2, s6, s0
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: s_addc_u32 s3, s7, s1
; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: saddo_i64_zext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_add_u32 s0, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_addc_u32 s1, s7, s3
; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: saddo_i64_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s6, s2
; GFX10-NEXT: s_addc_u32 s1, s7, s3
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7]
; GFX10-NEXT: s_xor_b32 s2, s2, s3
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: saddo_i64_zext:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s2, s6, s0
; GFX11-NEXT: s_addc_u32 s3, s7, s1
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_xor_b32 s0, s0, s1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  ; Fold the overflow flag into the stored value: %out receives sum + (0 or 1).
  %ext = zext i1 %carry to i64
  %add2 = add i64 %val, %ext
  store i64 %add2, i64 addrspace(1)* %out, align 8
  ret void
}

; i32 signed add-with-overflow of the two kernel arguments %a and %b.
; The sum is stored to %out and the i1 overflow flag to %carryout.
define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_saddo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_add_i32 s12, s8, s9
; SI-NEXT: s_cmp_lt_i32 s9, 0
; SI-NEXT: s_cselect_b64 s[10:11], -1, 0
; SI-NEXT: s_cmp_lt_i32 s12, s8
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_cselect_b64 s[8:9], -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_xor_b64 s[0:1], s[10:11], s[8:9]
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_saddo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_add_i32 s4, s0, s1
; VI-NEXT: s_cmp_lt_i32 s1, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: s_cmp_lt_i32 s4, s0
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_saddo_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_add_i32 s0, s2, s3
; GFX9-NEXT: v_add_i32 v1, s2, v1 clamp
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_saddo_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_nc_i32 v0, s2, s3 clamp
; GFX10-NEXT: s_add_i32 s0, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: global_store_dword v1, v2, s[4:5]
; GFX10-NEXT: global_store_byte v1, v0, s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_saddo_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_nc_i32 v0, s4, s5 clamp
; GFX11-NEXT: s_add_i32 s4, s4, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
  %val = extractvalue { i32, i1 } %sadd, 0
  %carry = extractvalue { i32, i1 } %sadd, 1
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; i32 signed add-with-overflow where both operands are loaded through the
; addrspace(1) pointers %aptr and %bptr rather than passed by value.
; The sum is stored to %out and the i1 overflow flag to %carryout.
; NOTE(review): the -amdgpu-scalarize-global-loads=false option on the llc
; command lines presumably exists to keep these loads on the vector path --
; confirm before changing it.
define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, v0, v1
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v6, vcc, v4, v5
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5
; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: flat_store_dword v[0:1], v6
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_saddo_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_i32 v3, v1, v2 clamp
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_i32 v3, v1, v2 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_saddo_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_i32 v3, v1, v2 clamp
; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %a = load i32, i32 addrspace(1)* %aptr, align 4
  %b = load i32, i32 addrspace(1)* %bptr, align 4
  %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
  %val = extractvalue { i32, i1 } %sadd, 0
  %carry = extractvalue { i32, i1 } %sadd, 1
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; i64 signed add-with-overflow of the two kernel arguments %a and %b.
; The sum is stored to %out and the i1 overflow flag to %carryout.
define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
; SI-LABEL: s_saddo_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s12, s4, s6
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_addc_u32 s13, s5, s7
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
; SI-NEXT: s_mov_b32 s0, s2
; SI-NEXT: s_mov_b32 s1, s3
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_saddo_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_addc_u32 s1, s5, s7
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_saddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s8, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_addc_u32 s9, s5, s7
; GFX9-NEXT: v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_saddo_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s8, s4, s6
; GFX10-NEXT: s_addc_u32 s9, s5, s7
; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: s_xor_b32 s4, s6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: global_store_byte v2, v3, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_saddo_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s8, s4, s6
; GFX11-NEXT: s_addc_u32 s9, s5, s7
; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
; GFX11-NEXT: v_mov_b32_e32 v0, s8
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_xor_b32 s4, s6, s4
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; i64 signed add-with-overflow where both operands are loaded through the
; addrspace(1) pointers %aptr and %bptr rather than passed by value.
; The sum is stored to %out and the i1 overflow flag to %carryout.
define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
; SI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc
; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[6:7], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_saddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5]
; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v6, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v6, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5]
; GFX10-NEXT: global_store_byte v6, v0, s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_saddo_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v6, s[8:9]
; GFX11-NEXT: global_load_b64 v[2:3], v6, s[10:11]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5]
; GFX11-NEXT: global_store_b8 v6, v0, s[6:7]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  ; NOTE(review): both i64 loads use align 4, below the type's natural
  ; alignment, while the store to %out uses align 8 -- presumably intentional
  ; for this test; confirm before "fixing" it, since the checks are generated
  ; from this exact IR.
  %a = load i64, i64 addrspace(1)* %aptr, align 4
  %b = load i64, i64 addrspace(1)* %bptr, align 4
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; <2 x i32> signed add-with-overflow where both operand vectors are loaded
; through the addrspace(1) pointers %aptr and %bptr.
; The sum vector is stored to %out. Unlike the scalar tests, the <2 x i1>
; overflow vector is zero-extended to <2 x i32> before being stored, so
; %carryout is a <2 x i32> pointer here.
define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v5, vcc, v1, v3
; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3
; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0
; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3
; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0
; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_saddo_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[4:5]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v5, v1, v3
; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp
; GFX9-NEXT: v_add_u32_e32 v4, v0, v2
; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v5, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[4:5]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v4, v1, v3
; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v3, v0, v2
; GFX10-NEXT: v_add_nc_i32 v0, v0, v2 clamp
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_saddo_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v5, s[4:5]
; GFX11-NEXT: global_load_b64 v[2:3], v5, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u32_e32 v4, v1, v3
; GFX11-NEXT: v_add_nc_i32 v1, v1, v3 clamp
; GFX11-NEXT: v_add_nc_u32_e32 v3, v0, v2
; GFX11-NEXT: v_add_nc_i32 v0, v0, v2 clamp
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v5, v[3:4], s[0:1]
; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  ; NOTE(review): the vector loads and the %out store use align 4 (element
  ; alignment), and the %carryout store carries no explicit alignment --
  ; presumably deliberate; confirm before changing.
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
  %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
  ; Widen each per-lane overflow bit to a full i32 (0 or 1) for the store.
  %carry.ext = zext <2 x i1> %carry to <2 x i32>
  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
  ret void
}