; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10


declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone


declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: saddo_i64_zext:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_add_u32 s10, s6, s8
; SI-NEXT: s_addc_u32 s11, s7, s9
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
; SI-NEXT: v_cmp_lt_i64_e64 s[6:7], s[8:9], 0
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: saddo_i64_zext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: s_add_u32 s2, s6, s0
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: s_addc_u32 s3, s7, s1
; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: saddo_i64_zext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_add_u32 s0, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_addc_u32 s1, s7, s3
; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: saddo_i64_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s6, s2
; GFX10-NEXT: s_addc_u32 s1, s7, s3
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7]
; GFX10-NEXT: s_xor_b32 s2, s2, s3
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  %ext = zext i1 %carry to i64
  %add2 = add i64 %val, %ext
  store i64 %add2, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_saddo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_add_i32 s12, s8, s9
; SI-NEXT: s_cmp_lt_i32 s9, 0
; SI-NEXT: s_cselect_b64 s[10:11], -1, 0
; SI-NEXT: s_cmp_lt_i32 s12, s8
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_cselect_b64 s[8:9], -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_xor_b64 s[0:1], s[10:11], s[8:9]
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_saddo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_add_i32 s4, s0, s1
; VI-NEXT: s_cmp_lt_i32 s1, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: s_cmp_lt_i32 s4, s0
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_saddo_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_add_i32 s0, s2, s3
; GFX9-NEXT: v_add_i32 v1, s2, v1 clamp
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_saddo_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_nc_i32 v0, s2, s3 clamp
; GFX10-NEXT: s_add_i32 s0, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: global_store_dword v1, v2, s[4:5]
; GFX10-NEXT: global_store_byte v1, v0, s[6:7]
; GFX10-NEXT: s_endpgm
  %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
  %val = extractvalue { i32, i1 } %sadd, 0
  %carry = extractvalue { i32, i1 } %sadd, 1
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, v1, v0
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v6, vcc, v5, v4
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5
; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: flat_store_dword v[0:1], v6
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_saddo_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_i32 v3, v1, v2 clamp
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_i32 v3, v1, v2 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
; GFX10-NEXT: s_endpgm
  %a = load i32, i32 addrspace(1)* %aptr, align 4
  %b = load i32, i32 addrspace(1)* %bptr, align 4
  %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
  %val = extractvalue { i32, i1 } %sadd, 0
  %carry = extractvalue { i32, i1 } %sadd, 1
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
; SI-LABEL: s_saddo_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s12, s4, s6
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_addc_u32 s13, s5, s7
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
; SI-NEXT: s_mov_b32 s0, s2
; SI-NEXT: s_mov_b32 s1, s3
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_saddo_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_addc_u32 s1, s5, s7
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_saddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s8, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_addc_u32 s9, s5, s7
; GFX9-NEXT: v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_saddo_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s8, s4, s6
; GFX10-NEXT: s_addc_u32 s9, s5, s7
; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: s_xor_b32 s4, s6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: global_store_byte v2, v3, s[2:3]
; GFX10-NEXT: s_endpgm
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
; SI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc
; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[6:7], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_saddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5]
; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v6, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v6, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5]
; GFX10-NEXT: global_store_byte v6, v0, s[6:7]
; GFX10-NEXT: s_endpgm
  %a = load i64, i64 addrspace(1)* %aptr, align 4
  %b = load i64, i64 addrspace(1)* %bptr, align 4
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v5, vcc, v1, v3
; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3
; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0
; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3
; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0
; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_saddo_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[4:5]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v5, v1, v3
; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp
; GFX9-NEXT: v_add_u32_e32 v4, v0, v2
; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v5, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[4:5]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v4, v1, v3
; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v3, v0, v2
; GFX10-NEXT: v_add_nc_i32 v0, v0, v2 clamp
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
  %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
  %carry.ext = zext <2 x i1> %carry to <2 x i32>
  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
  ret void
}