; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s

declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32)
declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32)

; Show what the atomic optimization pass will do for struct buffers.

; Constant operand (5). In every run the checks show only the lane whose mbcnt
; result is 0 issuing the atomic, with the operand scaled by the population
; count of the saved exec mask (s_bcnt1 followed by s_mul_i32 by 5). Each lane
; then rebuilds its own result from the readfirstlane of the returned value
; plus its mbcnt rank times 5 (v_mad_u32_u24).
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: add_i32_constant:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b64 s[6:7], exec
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT:    ; implicit-def: $vgpr1
; GFX6-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX6-NEXT:    s_cbranch_execz .LBB0_2
; GFX6-NEXT:  ; %bb.1:
; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX6-NEXT:    s_mul_i32 s0, s0, 5
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    v_mov_b32_e32 v2, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX6-NEXT:  .LBB0_2:
; GFX6-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT:    s_mov_b64 s[6:7], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB0_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX8-NEXT:    s_mul_i32 s0, s0, 5
; GFX8-NEXT:    v_mov_b32_e32 v1, s0
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT:  .LBB0_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b64 s[6:7], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB0_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX9-NEXT:    s_mul_i32 s0, s0, 5
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT:  .LBB0_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
; GFX10W64:       ; %bb.0: ; %entry
; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
; GFX10W64-NEXT:    ; implicit-def: $vgpr1
; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX10W64-NEXT:    s_cbranch_execz .LBB0_2
; GFX10W64-NEXT:  ; %bb.1:
; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX10W64-NEXT:  .LBB0_2:
; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT:    s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
; GFX10W32:       ; %bb.0: ; %entry
; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
; GFX10W32-NEXT:    ; implicit-def: $vgpr1
; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX10W32-NEXT:    s_cbranch_execz .LBB0_2
; GFX10W32-NEXT:  ; %bb.1:
; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX10W32-NEXT:  .LBB0_2:
; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT:    s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
; GFX11W64:       ; %bb.0: ; %entry
; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX11W64-NEXT:    ; implicit-def: $vgpr1
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT:    s_cbranch_execz .LBB0_2
; GFX11W64-NEXT:  ; %bb.1:
; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
; GFX11W64-NEXT:    s_mul_i32 s0, s0, 5
; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT:  .LBB0_2:
; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT:    global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT:    s_endpgm
;
; GFX11W32-LABEL: add_i32_constant:
; GFX11W32:       ; %bb.0: ; %entry
; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
; GFX11W32-NEXT:    s_mov_b32 s4, exec_lo
; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX11W32-NEXT:    ; implicit-def: $vgpr1
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT:    s_cbranch_execz .LBB0_2
; GFX11W32-NEXT:  ; %bb.1:
; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s5
; GFX11W32-NEXT:    v_mov_b32_e32 v2, 0
; GFX11W32-NEXT:    s_mul_i32 s0, s0, 5
; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W32-NEXT:  .LBB0_2:
; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT:    s_endpgm
entry:
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Operand taken from the kernel argument %additive. The checks show the same
; single-lane scheme as above, with the scaling done by s_mul_i32 on the loaded
; SGPR. The per-lane offset is a multiply by that SGPR plus an add, which the
; gfx1010 and gfx1100 runs emit as a single v_mad_u64_u32.
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
; GFX6-LABEL: add_i32_uniform:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b64 s[2:3], exec
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT:    ; implicit-def: $vgpr1
; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX6-NEXT:    s_cbranch_execz .LBB1_2
; GFX6-NEXT:  ; %bb.1:
; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mul_i32 s0, s8, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    v_mov_b32_e32 v2, 0
; GFX6-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
; GFX6-NEXT:  .LBB1_2:
; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
; GFX8-NEXT:    s_mov_b64 s[6:7], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB1_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mul_i32 s0, s8, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s0
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
; GFX8-NEXT:  .LBB1_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
; GFX9-NEXT:    s_mov_b64 s[6:7], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB1_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s0, s8, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
; GFX9-NEXT:  .LBB1_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
; GFX10W64:       ; %bb.0: ; %entry
; GFX10W64-NEXT:    s_clause 0x1
; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
; GFX10W64-NEXT:    ; implicit-def: $vgpr1
; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX10W64-NEXT:    s_cbranch_execz .LBB1_2
; GFX10W64-NEXT:  ; %bb.1:
; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX10W64-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
; GFX10W64-NEXT:  .LBB1_2:
; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT:    s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
; GFX10W32:       ; %bb.0: ; %entry
; GFX10W32-NEXT:    s_clause 0x1
; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
; GFX10W32-NEXT:    ; implicit-def: $vgpr1
; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
; GFX10W32-NEXT:    s_cbranch_execz .LBB1_2
; GFX10W32-NEXT:  ; %bb.1:
; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10W32-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX10W32-NEXT:  .LBB1_2:
; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT:    s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
; GFX11W64:       ; %bb.0: ; %entry
; GFX11W64-NEXT:    s_clause 0x1
; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT:    s_load_b32 s8, s[0:1], 0x44
; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX11W64-NEXT:    ; implicit-def: $vgpr1
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT:    s_cbranch_execz .LBB1_2
; GFX11W64-NEXT:  ; %bb.1:
; GFX11W64-NEXT:    s_load_b128 s[12:15], s[0:1], 0x34
; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT:    s_mul_i32 s0, s8, s0
; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, v2, s[12:15], 0 idxen glc
; GFX11W64-NEXT:  .LBB1_2:
; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11W64-NEXT:    v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1]
; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
; GFX11W64-NEXT:    global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT:    s_endpgm
;
; GFX11W32-LABEL: add_i32_uniform:
; GFX11W32:       ; %bb.0: ; %entry
; GFX11W32-NEXT:    s_clause 0x1
; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT:    s_load_b32 s4, s[0:1], 0x44
; GFX11W32-NEXT:    s_mov_b32 s6, exec_lo
; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX11W32-NEXT:    ; implicit-def: $vgpr1
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT:    s_cbranch_execz .LBB1_2
; GFX11W32-NEXT:  ; %bb.1:
; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s6
; GFX11W32-NEXT:    v_mov_b32_e32 v2, 0
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    s_mul_i32 s0, s4, s0
; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W32-NEXT:  .LBB1_2:
; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT:    v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1]
; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
; GFX11W32-NEXT:    global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT:    s_endpgm
entry:
  %additive.unused.guard = add i32 0, 0
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Per-lane operand (the workitem id). The tonga, gfx900, gfx1010 and gfx1100
; runs show a chain of DPP adds across the wave feeding one atomic issued by a
; single lane. The default-target run (first RUN line) issues the atomic
; unchanged with v0 as the data operand.
define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: add_i32_varying_vdata:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_varying_vdata:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB2_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mov_b32_e32 v3, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
; GFX8-NEXT:  .LBB2_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v4, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v3, s2
; GFX8-NEXT:    flat_store_dword v[3:4], v0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_varying_vdata:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB2_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
; GFX9-NEXT:  .LBB2_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10W64-LABEL: add_i32_varying_vdata:
; GFX10W64:       ; %bb.0: ; %entry
; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
; GFX10W64-NEXT:    s_not_b64 exec, exec
; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W64-NEXT:    s_not_b64 exec, exec
; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT:    ; implicit-def: $vgpr0
; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX10W64-NEXT:    s_cbranch_execz .LBB2_2
; GFX10W64-NEXT:  ; %bb.1:
; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
; GFX10W64-NEXT:  .LBB2_2:
; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
; GFX10W64-NEXT:    s_endpgm
;
; GFX10W32-LABEL: add_i32_varying_vdata:
; GFX10W32:       ; %bb.0: ; %entry
; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT:    ; implicit-def: $vgpr0
; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX10W32-NEXT:    s_cbranch_execz .LBB2_2
; GFX10W32-NEXT:  ; %bb.1:
; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
; GFX10W32-NEXT:    s_mov_b32 s5, s6
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
; GFX10W32-NEXT:  .LBB2_2:
; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
; GFX10W32-NEXT:    s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_vdata:
; GFX11W64:       ; %bb.0: ; %entry
; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
; GFX11W64-NEXT:    s_not_b64 exec, exec
; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX11W64-NEXT:    s_not_b64 exec, exec
; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 15
; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX11W64-NEXT:    v_readlane_b32 s7, v1, 31
; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 16
; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 63
; GFX11W64-NEXT:    v_readlane_b32 s8, v1, 47
; GFX11W64-NEXT:    v_writelane_b32 v3, s7, 32
; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX11W64-NEXT:    v_writelane_b32 v3, s8, 48
; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX11W64-NEXT:    ; implicit-def: $vgpr0
; GFX11W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX11W64-NEXT:    s_cbranch_execz .LBB2_2
; GFX11W64-NEXT:  ; %bb.1:
; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
; GFX11W64-NEXT:    v_mov_b32_e32 v0, s6
; GFX11W64-NEXT:    v_mov_b32_e32 v4, 0
; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT:    buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc
; GFX11W64-NEXT:  .LBB2_2:
; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT:    v_mov_b32_e32 v0, v3
; GFX11W64-NEXT:    v_mov_b32_e32 v4, 0
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT:    global_store_b32 v4, v0, s[2:3]
; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT:    s_endpgm
;
; GFX11W32-LABEL: add_i32_varying_vdata:
; GFX11W32:       ; %bb.0: ; %entry
; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
; GFX11W32-NEXT:    v_readlane_b32 s6, v1, 31
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX11W32-NEXT:    v_readlane_b32 s5, v1, 15
; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT:    v_writelane_b32 v3, s5, 16
; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11W32-NEXT:    ; implicit-def: $vgpr0
; GFX11W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX11W32-NEXT:    s_cbranch_execz .LBB2_2
; GFX11W32-NEXT:  ; %bb.1:
; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
; GFX11W32-NEXT:    v_mov_b32_e32 v0, s6
; GFX11W32-NEXT:    v_mov_b32_e32 v4, 0
; GFX11W32-NEXT:    s_mov_b32 s5, s6
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc
; GFX11W32-NEXT:  .LBB2_2:
; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT:    v_mov_b32_e32 v0, v3
; GFX11W32-NEXT:    v_mov_b32_e32 v4, 0
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    global_store_b32 v4, v0, s[2:3]
; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @add_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: add_i32_varying_vindex:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    v_mov_b32_e32 v1, 1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
812; GFX6-NEXT: s_mov_b32 s2, -1 813; GFX6-NEXT: s_waitcnt vmcnt(0) 814; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 815; GFX6-NEXT: s_endpgm 816; 817; GFX8-LABEL: add_i32_varying_vindex: 818; GFX8: ; %bb.0: ; %entry 819; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 820; GFX8-NEXT: v_mov_b32_e32 v2, 1 821; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 822; GFX8-NEXT: s_waitcnt lgkmcnt(0) 823; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 idxen glc 824; GFX8-NEXT: v_mov_b32_e32 v0, s0 825; GFX8-NEXT: v_mov_b32_e32 v1, s1 826; GFX8-NEXT: s_waitcnt vmcnt(0) 827; GFX8-NEXT: flat_store_dword v[0:1], v2 828; GFX8-NEXT: s_endpgm 829; 830; GFX9-LABEL: add_i32_varying_vindex: 831; GFX9: ; %bb.0: ; %entry 832; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 833; GFX9-NEXT: v_mov_b32_e32 v1, 1 834; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 835; GFX9-NEXT: s_waitcnt lgkmcnt(0) 836; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc 837; GFX9-NEXT: v_mov_b32_e32 v0, 0 838; GFX9-NEXT: s_waitcnt vmcnt(0) 839; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 840; GFX9-NEXT: s_endpgm 841; 842; GFX10-LABEL: add_i32_varying_vindex: 843; GFX10: ; %bb.0: ; %entry 844; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 845; GFX10-NEXT: v_mov_b32_e32 v1, 1 846; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 847; GFX10-NEXT: s_waitcnt lgkmcnt(0) 848; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc 849; GFX10-NEXT: v_mov_b32_e32 v0, 0 850; GFX10-NEXT: s_waitcnt vmcnt(0) 851; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 852; GFX10-NEXT: s_endpgm 853; 854; GFX11-LABEL: add_i32_varying_vindex: 855; GFX11: ; %bb.0: ; %entry 856; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 857; GFX11-NEXT: v_mov_b32_e32 v1, 1 858; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 859; GFX11-NEXT: s_waitcnt lgkmcnt(0) 860; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc 861; GFX11-NEXT: v_mov_b32_e32 v0, 0 862; GFX11-NEXT: s_waitcnt vmcnt(0) 863; GFX11-NEXT: global_store_b32 v0, 
v1, s[0:1] 864; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 865; GFX11-NEXT: s_endpgm 866entry: 867 %lane = call i32 @llvm.amdgcn.workitem.id.x() 868 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0) 869 store i32 %old, i32 addrspace(1)* %out 870 ret void 871} 872 873define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 874; GFX6-LABEL: add_i32_varying_offset: 875; GFX6: ; %bb.0: ; %entry 876; GFX6-NEXT: v_mov_b32_e32 v1, v0 877; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 878; GFX6-NEXT: s_mov_b32 s2, 0 879; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 880; GFX6-NEXT: v_mov_b32_e32 v0, s2 881; GFX6-NEXT: v_mov_b32_e32 v2, 1 882; GFX6-NEXT: s_waitcnt lgkmcnt(0) 883; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc 884; GFX6-NEXT: s_mov_b32 s3, 0xf000 885; GFX6-NEXT: s_mov_b32 s2, -1 886; GFX6-NEXT: s_waitcnt vmcnt(0) 887; GFX6-NEXT: buffer_store_dword v2, off, s[0:3], 0 888; GFX6-NEXT: s_endpgm 889; 890; GFX8-LABEL: add_i32_varying_offset: 891; GFX8: ; %bb.0: ; %entry 892; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 893; GFX8-NEXT: s_mov_b32 s2, 0 894; GFX8-NEXT: v_mov_b32_e32 v1, v0 895; GFX8-NEXT: v_mov_b32_e32 v0, s2 896; GFX8-NEXT: v_mov_b32_e32 v2, 1 897; GFX8-NEXT: s_waitcnt lgkmcnt(0) 898; GFX8-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc 899; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 900; GFX8-NEXT: s_waitcnt lgkmcnt(0) 901; GFX8-NEXT: v_mov_b32_e32 v0, s0 902; GFX8-NEXT: v_mov_b32_e32 v1, s1 903; GFX8-NEXT: s_waitcnt vmcnt(0) 904; GFX8-NEXT: flat_store_dword v[0:1], v2 905; GFX8-NEXT: s_endpgm 906; 907; GFX9-LABEL: add_i32_varying_offset: 908; GFX9: ; %bb.0: ; %entry 909; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 910; GFX9-NEXT: s_mov_b32 s2, 0 911; GFX9-NEXT: v_mov_b32_e32 v1, v0 912; GFX9-NEXT: v_mov_b32_e32 v0, s2 913; GFX9-NEXT: v_mov_b32_e32 v2, 1 914; GFX9-NEXT: s_waitcnt lgkmcnt(0) 915; 
GFX9-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc 916; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 917; GFX9-NEXT: v_mov_b32_e32 v0, 0 918; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 919; GFX9-NEXT: global_store_dword v0, v2, s[0:1] 920; GFX9-NEXT: s_endpgm 921; 922; GFX10-LABEL: add_i32_varying_offset: 923; GFX10: ; %bb.0: ; %entry 924; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 925; GFX10-NEXT: s_mov_b32 s2, 0 926; GFX10-NEXT: v_mov_b32_e32 v1, v0 927; GFX10-NEXT: v_mov_b32_e32 v0, s2 928; GFX10-NEXT: v_mov_b32_e32 v2, 1 929; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 930; GFX10-NEXT: s_waitcnt lgkmcnt(0) 931; GFX10-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc 932; GFX10-NEXT: v_mov_b32_e32 v0, 0 933; GFX10-NEXT: s_waitcnt vmcnt(0) 934; GFX10-NEXT: global_store_dword v0, v2, s[0:1] 935; GFX10-NEXT: s_endpgm 936; 937; GFX11W64-LABEL: add_i32_varying_offset: 938; GFX11W64: ; %bb.0: ; %entry 939; GFX11W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 940; GFX11W64-NEXT: s_mov_b32 s2, 0 941; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 942; GFX11W64-NEXT: v_mov_b32_e32 v0, s2 943; GFX11W64-NEXT: v_mov_b32_e32 v2, 1 944; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 945; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 946; GFX11W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc 947; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 948; GFX11W64-NEXT: s_waitcnt vmcnt(0) 949; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1] 950; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 951; GFX11W64-NEXT: s_endpgm 952; 953; GFX11W32-LABEL: add_i32_varying_offset: 954; GFX11W32: ; %bb.0: ; %entry 955; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 956; GFX11W32-NEXT: s_mov_b32 s2, 0 957; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 958; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2 959; GFX11W32-NEXT: v_mov_b32_e32 v2, 1 960; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 961; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 962; GFX11W32-NEXT: 
buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc 963; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 964; GFX11W32-NEXT: s_waitcnt vmcnt(0) 965; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1] 966; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 967; GFX11W32-NEXT: s_endpgm 968entry: 969 %lane = call i32 @llvm.amdgcn.workitem.id.x() 970 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0) 971 store i32 %old, i32 addrspace(1)* %out 972 ret void 973} 974 975define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { 976; GFX6-LABEL: sub_i32_constant: 977; GFX6: ; %bb.0: ; %entry 978; GFX6-NEXT: s_mov_b64 s[6:7], exec 979; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 980; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 981; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 982; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 983; GFX6-NEXT: ; implicit-def: $vgpr1 984; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc 985; GFX6-NEXT: s_cbranch_execz .LBB5_2 986; GFX6-NEXT: ; %bb.1: 987; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 988; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 989; GFX6-NEXT: s_mul_i32 s0, s0, 5 990; GFX6-NEXT: v_mov_b32_e32 v1, s0 991; GFX6-NEXT: v_mov_b32_e32 v2, 0 992; GFX6-NEXT: s_waitcnt lgkmcnt(0) 993; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 994; GFX6-NEXT: .LBB5_2: 995; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] 996; GFX6-NEXT: s_waitcnt vmcnt(0) 997; GFX6-NEXT: v_readfirstlane_b32 s0, v1 998; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0 999; GFX6-NEXT: s_mov_b32 s7, 0xf000 1000; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1001; GFX6-NEXT: s_mov_b32 s6, -1 1002; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1003; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 1004; GFX6-NEXT: s_endpgm 1005; 1006; GFX8-LABEL: sub_i32_constant: 1007; GFX8: ; %bb.0: ; %entry 1008; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1009; GFX8-NEXT: s_mov_b64 s[6:7], exec 1010; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 
1011; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1012; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1013; GFX8-NEXT: ; implicit-def: $vgpr1 1014; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1015; GFX8-NEXT: s_cbranch_execz .LBB5_2 1016; GFX8-NEXT: ; %bb.1: 1017; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1018; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 1019; GFX8-NEXT: s_mul_i32 s0, s0, 5 1020; GFX8-NEXT: v_mov_b32_e32 v1, s0 1021; GFX8-NEXT: v_mov_b32_e32 v2, 0 1022; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1023; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 1024; GFX8-NEXT: .LBB5_2: 1025; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1026; GFX8-NEXT: s_waitcnt vmcnt(0) 1027; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1028; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1029; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 1030; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1031; GFX8-NEXT: v_mov_b32_e32 v0, s2 1032; GFX8-NEXT: v_mov_b32_e32 v1, s3 1033; GFX8-NEXT: flat_store_dword v[0:1], v2 1034; GFX8-NEXT: s_endpgm 1035; 1036; GFX9-LABEL: sub_i32_constant: 1037; GFX9: ; %bb.0: ; %entry 1038; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1039; GFX9-NEXT: s_mov_b64 s[6:7], exec 1040; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1041; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1042; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1043; GFX9-NEXT: ; implicit-def: $vgpr1 1044; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1045; GFX9-NEXT: s_cbranch_execz .LBB5_2 1046; GFX9-NEXT: ; %bb.1: 1047; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1048; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 1049; GFX9-NEXT: s_mul_i32 s0, s0, 5 1050; GFX9-NEXT: v_mov_b32_e32 v1, s0 1051; GFX9-NEXT: v_mov_b32_e32 v2, 0 1052; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1053; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 1054; GFX9-NEXT: .LBB5_2: 1055; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1056; GFX9-NEXT: s_waitcnt vmcnt(0) 1057; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1058; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1059; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 
1060; GFX9-NEXT: v_mov_b32_e32 v1, 0 1061; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1062; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 1063; GFX9-NEXT: s_endpgm 1064; 1065; GFX10W64-LABEL: sub_i32_constant: 1066; GFX10W64: ; %bb.0: ; %entry 1067; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1068; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 1069; GFX10W64-NEXT: ; implicit-def: $vgpr1 1070; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1071; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1072; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1073; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1074; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 1075; GFX10W64-NEXT: ; %bb.1: 1076; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1077; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 1078; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 1079; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 1080; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 1081; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1082; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 1083; GFX10W64-NEXT: .LBB5_2: 1084; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1085; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 1086; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1087; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 1088; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1089; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1090; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1091; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1092; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 1093; GFX10W64-NEXT: s_endpgm 1094; 1095; GFX10W32-LABEL: sub_i32_constant: 1096; GFX10W32: ; %bb.0: ; %entry 1097; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1098; GFX10W32-NEXT: s_mov_b32 s5, exec_lo 1099; GFX10W32-NEXT: ; implicit-def: $vgpr1 1100; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 1101; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1102; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1103; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 1104; GFX10W32-NEXT: ; %bb.1: 1105; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1106; 
GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 1107; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 1108; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 1109; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 1110; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1111; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 1112; GFX10W32-NEXT: .LBB5_2: 1113; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1114; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1115; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1116; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 1117; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1118; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1119; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1120; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1121; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 1122; GFX10W32-NEXT: s_endpgm 1123; 1124; GFX11W64-LABEL: sub_i32_constant: 1125; GFX11W64: ; %bb.0: ; %entry 1126; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1127; GFX11W64-NEXT: s_mov_b64 s[6:7], exec 1128; GFX11W64-NEXT: s_mov_b64 s[4:5], exec 1129; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1130; GFX11W64-NEXT: ; implicit-def: $vgpr1 1131; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1132; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1133; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 1134; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 1135; GFX11W64-NEXT: ; %bb.1: 1136; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 1137; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 1138; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 1139; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 1140; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1141; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 1142; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1143; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc 1144; GFX11W64-NEXT: .LBB5_2: 1145; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] 1146; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1147; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 1148; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1149; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1150; 
GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1151; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1152; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1153; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] 1154; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1155; GFX11W64-NEXT: s_endpgm 1156; 1157; GFX11W32-LABEL: sub_i32_constant: 1158; GFX11W32: ; %bb.0: ; %entry 1159; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1160; GFX11W32-NEXT: s_mov_b32 s5, exec_lo 1161; GFX11W32-NEXT: s_mov_b32 s4, exec_lo 1162; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 1163; GFX11W32-NEXT: ; implicit-def: $vgpr1 1164; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1165; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 1166; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 1167; GFX11W32-NEXT: ; %bb.1: 1168; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 1169; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 1170; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 1171; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 1172; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1173; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 1174; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1175; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc 1176; GFX11W32-NEXT: .LBB5_2: 1177; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1178; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1179; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 1180; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1181; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 1182; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1183; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1184; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1185; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] 1186; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1187; GFX11W32-NEXT: s_endpgm 1188entry: 1189 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) 1190 store i32 %old, i32 addrspace(1)* %out 1191 ret void 1192} 1193 1194define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) { 
1195; GFX6-LABEL: sub_i32_uniform: 1196; GFX6: ; %bb.0: ; %entry 1197; GFX6-NEXT: s_mov_b64 s[2:3], exec 1198; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1199; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 1200; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1201; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1202; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1203; GFX6-NEXT: ; implicit-def: $vgpr1 1204; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 1205; GFX6-NEXT: s_cbranch_execz .LBB6_2 1206; GFX6-NEXT: ; %bb.1: 1207; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd 1208; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 1209; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1210; GFX6-NEXT: s_mul_i32 s0, s8, s0 1211; GFX6-NEXT: v_mov_b32_e32 v1, s0 1212; GFX6-NEXT: v_mov_b32_e32 v2, 0 1213; GFX6-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc 1214; GFX6-NEXT: .LBB6_2: 1215; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 1216; GFX6-NEXT: s_waitcnt vmcnt(0) 1217; GFX6-NEXT: v_readfirstlane_b32 s0, v1 1218; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1219; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 1220; GFX6-NEXT: s_mov_b32 s7, 0xf000 1221; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1222; GFX6-NEXT: s_mov_b32 s6, -1 1223; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 1224; GFX6-NEXT: s_endpgm 1225; 1226; GFX8-LABEL: sub_i32_uniform: 1227; GFX8: ; %bb.0: ; %entry 1228; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1229; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 1230; GFX8-NEXT: s_mov_b64 s[6:7], exec 1231; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1232; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1233; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1234; GFX8-NEXT: ; implicit-def: $vgpr1 1235; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1236; GFX8-NEXT: s_cbranch_execz .LBB6_2 1237; GFX8-NEXT: ; %bb.1: 1238; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 1239; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 1240; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1241; GFX8-NEXT: s_mul_i32 s0, s8, s0 1242; GFX8-NEXT: v_mov_b32_e32 v1, s0 1243; GFX8-NEXT: v_mov_b32_e32 
v2, 0 1244; GFX8-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc 1245; GFX8-NEXT: .LBB6_2: 1246; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1247; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1248; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 1249; GFX8-NEXT: s_waitcnt vmcnt(0) 1250; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1251; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 1252; GFX8-NEXT: v_mov_b32_e32 v0, s2 1253; GFX8-NEXT: v_mov_b32_e32 v1, s3 1254; GFX8-NEXT: flat_store_dword v[0:1], v2 1255; GFX8-NEXT: s_endpgm 1256; 1257; GFX9-LABEL: sub_i32_uniform: 1258; GFX9: ; %bb.0: ; %entry 1259; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1260; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 1261; GFX9-NEXT: s_mov_b64 s[6:7], exec 1262; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1263; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1264; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1265; GFX9-NEXT: ; implicit-def: $vgpr1 1266; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1267; GFX9-NEXT: s_cbranch_execz .LBB6_2 1268; GFX9-NEXT: ; %bb.1: 1269; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 1270; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 1271; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1272; GFX9-NEXT: s_mul_i32 s0, s8, s0 1273; GFX9-NEXT: v_mov_b32_e32 v1, s0 1274; GFX9-NEXT: v_mov_b32_e32 v2, 0 1275; GFX9-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc 1276; GFX9-NEXT: .LBB6_2: 1277; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1278; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1279; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 1280; GFX9-NEXT: s_waitcnt vmcnt(0) 1281; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1282; GFX9-NEXT: v_mov_b32_e32 v1, 0 1283; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1284; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 1285; GFX9-NEXT: s_endpgm 1286; 1287; GFX10W64-LABEL: sub_i32_uniform: 1288; GFX10W64: ; %bb.0: ; %entry 1289; GFX10W64-NEXT: s_clause 0x1 1290; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1291; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 1292; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 1293; GFX10W64-NEXT: ; 
implicit-def: $vgpr1 1294; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1295; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1296; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1297; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1298; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 1299; GFX10W64-NEXT: ; %bb.1: 1300; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 1301; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 1302; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 1303; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1304; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 1305; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 1306; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc 1307; GFX10W64-NEXT: .LBB6_2: 1308; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1309; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 1310; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1311; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 1312; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1313; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 1314; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1315; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1316; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 1317; GFX10W64-NEXT: s_endpgm 1318; 1319; GFX10W32-LABEL: sub_i32_uniform: 1320; GFX10W32: ; %bb.0: ; %entry 1321; GFX10W32-NEXT: s_clause 0x1 1322; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1323; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 1324; GFX10W32-NEXT: s_mov_b32 s6, exec_lo 1325; GFX10W32-NEXT: ; implicit-def: $vgpr1 1326; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1327; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1328; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo 1329; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 1330; GFX10W32-NEXT: ; %bb.1: 1331; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1332; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 1333; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 1334; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1335; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 1336; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 1337; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 1338; 
GFX10W32-NEXT: .LBB6_2: 1339; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1340; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 1341; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1342; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 1343; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1344; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 1345; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1346; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1347; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 1348; GFX10W32-NEXT: s_endpgm 1349; 1350; GFX11W64-LABEL: sub_i32_uniform: 1351; GFX11W64: ; %bb.0: ; %entry 1352; GFX11W64-NEXT: s_clause 0x1 1353; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1354; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 1355; GFX11W64-NEXT: s_mov_b64 s[6:7], exec 1356; GFX11W64-NEXT: s_mov_b64 s[4:5], exec 1357; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1358; GFX11W64-NEXT: ; implicit-def: $vgpr1 1359; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1360; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1361; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 1362; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 1363; GFX11W64-NEXT: ; %bb.1: 1364; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 1365; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 1366; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 1367; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1368; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 1369; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1370; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 1371; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[12:15], 0 idxen glc 1372; GFX11W64-NEXT: .LBB6_2: 1373; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] 1374; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1375; GFX11W64-NEXT: v_mul_lo_u32 v0, s8, v0 1376; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1377; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 1378; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1379; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1380; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1381; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] 1382; GFX11W64-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1383; GFX11W64-NEXT: s_endpgm 1384; 1385; GFX11W32-LABEL: sub_i32_uniform: 1386; GFX11W32: ; %bb.0: ; %entry 1387; GFX11W32-NEXT: s_clause 0x1 1388; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1389; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 1390; GFX11W32-NEXT: s_mov_b32 s6, exec_lo 1391; GFX11W32-NEXT: s_mov_b32 s5, exec_lo 1392; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1393; GFX11W32-NEXT: ; implicit-def: $vgpr1 1394; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1395; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 1396; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 1397; GFX11W32-NEXT: ; %bb.1: 1398; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 1399; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6 1400; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 1401; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1402; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 1403; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1404; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 1405; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc 1406; GFX11W32-NEXT: .LBB6_2: 1407; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 1408; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1409; GFX11W32-NEXT: v_mul_lo_u32 v0, s4, v0 1410; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1411; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 1412; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 1413; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1414; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1415; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] 1416; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1417; GFX11W32-NEXT: s_endpgm 1418entry: 1419 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) 1420 store i32 %old, i32 addrspace(1)* %out 1421 ret void 1422} 1423 1424define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { 1425; GFX6-LABEL: sub_i32_varying_vdata: 1426; GFX6: ; %bb.0: ; %entry 1427; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1428; 
GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1429; GFX6-NEXT: v_mov_b32_e32 v1, 0 1430; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1431; GFX6-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 idxen glc 1432; GFX6-NEXT: s_mov_b32 s3, 0xf000 1433; GFX6-NEXT: s_mov_b32 s2, -1 1434; GFX6-NEXT: s_waitcnt vmcnt(0) 1435; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 1436; GFX6-NEXT: s_endpgm 1437; 1438; GFX8-LABEL: sub_i32_varying_vdata: 1439; GFX8: ; %bb.0: ; %entry 1440; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1441; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1442; GFX8-NEXT: v_mov_b32_e32 v1, 0 1443; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1444; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1445; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1446; GFX8-NEXT: v_mov_b32_e32 v2, v0 1447; GFX8-NEXT: s_not_b64 exec, exec 1448; GFX8-NEXT: v_mov_b32_e32 v2, 0 1449; GFX8-NEXT: s_not_b64 exec, exec 1450; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1451; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1452; GFX8-NEXT: s_nop 1 1453; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1454; GFX8-NEXT: s_nop 1 1455; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1456; GFX8-NEXT: s_nop 1 1457; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1458; GFX8-NEXT: s_nop 1 1459; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1460; GFX8-NEXT: s_nop 1 1461; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1462; GFX8-NEXT: v_readlane_b32 s6, v2, 63 1463; GFX8-NEXT: s_nop 0 1464; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1465; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1466; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1467; GFX8-NEXT: ; implicit-def: $vgpr0 1468; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1469; GFX8-NEXT: s_cbranch_execz .LBB7_2 1470; GFX8-NEXT: ; %bb.1: 
1471; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1472; GFX8-NEXT: v_mov_b32_e32 v0, s6 1473; GFX8-NEXT: v_mov_b32_e32 v3, 0 1474; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1475; GFX8-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc 1476; GFX8-NEXT: .LBB7_2: 1477; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1478; GFX8-NEXT: s_waitcnt vmcnt(0) 1479; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1480; GFX8-NEXT: v_mov_b32_e32 v0, v1 1481; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1482; GFX8-NEXT: v_mov_b32_e32 v4, s3 1483; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1484; GFX8-NEXT: v_mov_b32_e32 v3, s2 1485; GFX8-NEXT: flat_store_dword v[3:4], v0 1486; GFX8-NEXT: s_endpgm 1487; 1488; GFX9-LABEL: sub_i32_varying_vdata: 1489; GFX9: ; %bb.0: ; %entry 1490; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1491; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1492; GFX9-NEXT: v_mov_b32_e32 v1, 0 1493; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1494; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1495; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1496; GFX9-NEXT: v_mov_b32_e32 v2, v0 1497; GFX9-NEXT: s_not_b64 exec, exec 1498; GFX9-NEXT: v_mov_b32_e32 v2, 0 1499; GFX9-NEXT: s_not_b64 exec, exec 1500; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1501; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1502; GFX9-NEXT: s_nop 1 1503; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1504; GFX9-NEXT: s_nop 1 1505; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1506; GFX9-NEXT: s_nop 1 1507; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1508; GFX9-NEXT: s_nop 1 1509; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1510; GFX9-NEXT: s_nop 1 1511; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1512; GFX9-NEXT: v_readlane_b32 s6, v2, 63 1513; GFX9-NEXT: s_nop 0 1514; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf 
bank_mask:0xf 1515; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1516; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1517; GFX9-NEXT: ; implicit-def: $vgpr0 1518; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1519; GFX9-NEXT: s_cbranch_execz .LBB7_2 1520; GFX9-NEXT: ; %bb.1: 1521; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1522; GFX9-NEXT: v_mov_b32_e32 v0, s6 1523; GFX9-NEXT: v_mov_b32_e32 v3, 0 1524; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1525; GFX9-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc 1526; GFX9-NEXT: .LBB7_2: 1527; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1528; GFX9-NEXT: s_waitcnt vmcnt(0) 1529; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1530; GFX9-NEXT: v_mov_b32_e32 v0, v1 1531; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1532; GFX9-NEXT: v_mov_b32_e32 v3, 0 1533; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1534; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 1535; GFX9-NEXT: s_endpgm 1536; 1537; GFX10W64-LABEL: sub_i32_varying_vdata: 1538; GFX10W64: ; %bb.0: ; %entry 1539; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 1540; GFX10W64-NEXT: s_not_b64 exec, exec 1541; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1542; GFX10W64-NEXT: s_not_b64 exec, exec 1543; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 1544; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1545; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 1546; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1547; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1548; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1549; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 1550; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1551; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1552; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 1553; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 1554; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1555; 
GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 1556; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1557; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 1558; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1559; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1560; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 1561; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 1562; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1563; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1564; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1565; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 1566; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 1567; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 1568; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1569; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1570; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1571; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 1572; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1573; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1574; GFX10W64-NEXT: ; implicit-def: $vgpr0 1575; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1576; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 1577; GFX10W64-NEXT: ; %bb.1: 1578; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1579; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 1580; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 1581; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1582; GFX10W64-NEXT: buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc 1583; GFX10W64-NEXT: .LBB7_2: 1584; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1585; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 1586; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1587; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 1588; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 1589; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 1590; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1591; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1592; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 1593; GFX10W64-NEXT: s_endpgm 1594; 1595; GFX10W32-LABEL: sub_i32_varying_vdata: 1596; GFX10W32: ; %bb.0: ; %entry 1597; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 1598; GFX10W32-NEXT: s_not_b32 
exec_lo, exec_lo 1599; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1600; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 1601; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 1602; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1603; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1604; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1605; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1606; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 1607; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1608; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 1609; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1610; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 1611; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1612; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 1613; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 1614; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1615; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 1616; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 1617; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1618; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 1619; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 1620; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 1621; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1622; GFX10W32-NEXT: ; implicit-def: $vgpr0 1623; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1624; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 1625; GFX10W32-NEXT: ; %bb.1: 1626; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1627; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 1628; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 1629; GFX10W32-NEXT: s_mov_b32 s5, s6 1630; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1631; GFX10W32-NEXT: buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc 1632; GFX10W32-NEXT: .LBB7_2: 1633; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1634; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1635; GFX10W32-NEXT: 
s_waitcnt vmcnt(0) 1636; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 1637; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 1638; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 1639; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1640; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1641; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 1642; GFX10W32-NEXT: s_endpgm 1643; 1644; GFX11W64-LABEL: sub_i32_varying_vdata: 1645; GFX11W64: ; %bb.0: ; %entry 1646; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 1647; GFX11W64-NEXT: s_not_b64 exec, exec 1648; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1649; GFX11W64-NEXT: s_not_b64 exec, exec 1650; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 1651; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1652; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1653; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 1654; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1655; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1656; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1657; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1658; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1659; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 1660; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1661; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1662; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1663; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 1664; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1665; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 1666; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1667; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) 1668; GFX11W64-NEXT: v_readlane_b32 s6, 
v1, 15 1669; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1670; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] 1671; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1672; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1673; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 1674; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 1675; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 1676; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1677; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1678; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1679; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 1680; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 1681; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 1682; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 1683; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 1684; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1685; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1686; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 1687; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 1688; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1689; GFX11W64-NEXT: ; implicit-def: $vgpr0 1690; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1691; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 1692; GFX11W64-NEXT: ; %bb.1: 1693; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 1694; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 1695; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 1696; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1697; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v4, s[8:11], 0 idxen glc 1698; GFX11W64-NEXT: .LBB7_2: 1699; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] 1700; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1701; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 1702; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 1703; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 1704; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1705; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1706; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1707; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] 1708; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1709; 
GFX11W64-NEXT: s_endpgm 1710; 1711; GFX11W32-LABEL: sub_i32_varying_vdata: 1712; GFX11W32: ; %bb.0: ; %entry 1713; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 1714; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo 1715; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 1716; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo 1717; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 1718; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1719; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1720; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1721; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1722; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1723; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1724; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1725; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 1726; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1727; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 1728; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1729; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 1730; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1731; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1732; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 1733; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 1734; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) 1735; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1736; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 1737; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 1738; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1739; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 1740; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1741; 
GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 1742; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 1743; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1744; GFX11W32-NEXT: ; implicit-def: $vgpr0 1745; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1746; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 1747; GFX11W32-NEXT: ; %bb.1: 1748; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 1749; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 1750; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 1751; GFX11W32-NEXT: s_mov_b32 s5, s6 1752; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1753; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v4, s[8:11], 0 idxen glc 1754; GFX11W32-NEXT: .LBB7_2: 1755; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1756; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1757; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 1758; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 1759; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 1760; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1761; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1762; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1763; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] 1764; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1765; GFX11W32-NEXT: s_endpgm 1766entry: 1767 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1768 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) 1769 store i32 %old, i32 addrspace(1)* %out 1770 ret void 1771} 1772 1773define amdgpu_kernel void @sub_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) { 1774; GFX6-LABEL: sub_i32_varying_vindex: 1775; GFX6: ; %bb.0: ; %entry 1776; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1777; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1778; GFX6-NEXT: v_mov_b32_e32 v1, 1 1779; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1780; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc 1781; GFX6-NEXT: s_mov_b32 s3, 0xf000 1782; GFX6-NEXT: s_mov_b32 s2, -1 1783; GFX6-NEXT: s_waitcnt vmcnt(0) 1784; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 1785; GFX6-NEXT: s_endpgm 1786; 1787; 
GFX8-LABEL: sub_i32_varying_vindex: 1788; GFX8: ; %bb.0: ; %entry 1789; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1790; GFX8-NEXT: v_mov_b32_e32 v2, 1 1791; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1792; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1793; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 idxen glc 1794; GFX8-NEXT: v_mov_b32_e32 v0, s0 1795; GFX8-NEXT: v_mov_b32_e32 v1, s1 1796; GFX8-NEXT: s_waitcnt vmcnt(0) 1797; GFX8-NEXT: flat_store_dword v[0:1], v2 1798; GFX8-NEXT: s_endpgm 1799; 1800; GFX9-LABEL: sub_i32_varying_vindex: 1801; GFX9: ; %bb.0: ; %entry 1802; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1803; GFX9-NEXT: v_mov_b32_e32 v1, 1 1804; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1805; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1806; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc 1807; GFX9-NEXT: v_mov_b32_e32 v0, 0 1808; GFX9-NEXT: s_waitcnt vmcnt(0) 1809; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1810; GFX9-NEXT: s_endpgm 1811; 1812; GFX10-LABEL: sub_i32_varying_vindex: 1813; GFX10: ; %bb.0: ; %entry 1814; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1815; GFX10-NEXT: v_mov_b32_e32 v1, 1 1816; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1817; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1818; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc 1819; GFX10-NEXT: v_mov_b32_e32 v0, 0 1820; GFX10-NEXT: s_waitcnt vmcnt(0) 1821; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1822; GFX10-NEXT: s_endpgm 1823; 1824; GFX11-LABEL: sub_i32_varying_vindex: 1825; GFX11: ; %bb.0: ; %entry 1826; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 1827; GFX11-NEXT: v_mov_b32_e32 v1, 1 1828; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1829; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1830; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc 1831; GFX11-NEXT: v_mov_b32_e32 v0, 0 1832; GFX11-NEXT: s_waitcnt vmcnt(0) 1833; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1834; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1835; GFX11-NEXT: s_endpgm 1836entry: 1837 
%lane = call i32 @llvm.amdgcn.workitem.id.x() 1838 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0) 1839 store i32 %old, i32 addrspace(1)* %out 1840 ret void 1841} 1842 1843define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 1844; GFX6-LABEL: sub_i32_varying_offset: 1845; GFX6: ; %bb.0: ; %entry 1846; GFX6-NEXT: v_mov_b32_e32 v1, v0 1847; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1848; GFX6-NEXT: s_mov_b32 s2, 0 1849; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1850; GFX6-NEXT: v_mov_b32_e32 v0, s2 1851; GFX6-NEXT: v_mov_b32_e32 v2, 1 1852; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1853; GFX6-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc 1854; GFX6-NEXT: s_mov_b32 s3, 0xf000 1855; GFX6-NEXT: s_mov_b32 s2, -1 1856; GFX6-NEXT: s_waitcnt vmcnt(0) 1857; GFX6-NEXT: buffer_store_dword v2, off, s[0:3], 0 1858; GFX6-NEXT: s_endpgm 1859; 1860; GFX8-LABEL: sub_i32_varying_offset: 1861; GFX8: ; %bb.0: ; %entry 1862; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1863; GFX8-NEXT: s_mov_b32 s2, 0 1864; GFX8-NEXT: v_mov_b32_e32 v1, v0 1865; GFX8-NEXT: v_mov_b32_e32 v0, s2 1866; GFX8-NEXT: v_mov_b32_e32 v2, 1 1867; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1868; GFX8-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc 1869; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1870; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1871; GFX8-NEXT: v_mov_b32_e32 v0, s0 1872; GFX8-NEXT: v_mov_b32_e32 v1, s1 1873; GFX8-NEXT: s_waitcnt vmcnt(0) 1874; GFX8-NEXT: flat_store_dword v[0:1], v2 1875; GFX8-NEXT: s_endpgm 1876; 1877; GFX9-LABEL: sub_i32_varying_offset: 1878; GFX9: ; %bb.0: ; %entry 1879; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1880; GFX9-NEXT: s_mov_b32 s2, 0 1881; GFX9-NEXT: v_mov_b32_e32 v1, v0 1882; GFX9-NEXT: v_mov_b32_e32 v0, s2 1883; GFX9-NEXT: v_mov_b32_e32 v2, 1 1884; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1885; GFX9-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen 
glc 1886; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1887; GFX9-NEXT: v_mov_b32_e32 v0, 0 1888; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1889; GFX9-NEXT: global_store_dword v0, v2, s[0:1] 1890; GFX9-NEXT: s_endpgm 1891; 1892; GFX10-LABEL: sub_i32_varying_offset: 1893; GFX10: ; %bb.0: ; %entry 1894; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1895; GFX10-NEXT: s_mov_b32 s2, 0 1896; GFX10-NEXT: v_mov_b32_e32 v1, v0 1897; GFX10-NEXT: v_mov_b32_e32 v0, s2 1898; GFX10-NEXT: v_mov_b32_e32 v2, 1 1899; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1900; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1901; GFX10-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc 1902; GFX10-NEXT: v_mov_b32_e32 v0, 0 1903; GFX10-NEXT: s_waitcnt vmcnt(0) 1904; GFX10-NEXT: global_store_dword v0, v2, s[0:1] 1905; GFX10-NEXT: s_endpgm 1906; 1907; GFX11W64-LABEL: sub_i32_varying_offset: 1908; GFX11W64: ; %bb.0: ; %entry 1909; GFX11W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 1910; GFX11W64-NEXT: s_mov_b32 s2, 0 1911; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 1912; GFX11W64-NEXT: v_mov_b32_e32 v0, s2 1913; GFX11W64-NEXT: v_mov_b32_e32 v2, 1 1914; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1915; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1916; GFX11W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc 1917; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 1918; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1919; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1] 1920; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1921; GFX11W64-NEXT: s_endpgm 1922; 1923; GFX11W32-LABEL: sub_i32_varying_offset: 1924; GFX11W32: ; %bb.0: ; %entry 1925; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 1926; GFX11W32-NEXT: s_mov_b32 s2, 0 1927; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1928; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2 1929; GFX11W32-NEXT: v_mov_b32_e32 v2, 1 1930; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1931; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1932; GFX11W32-NEXT: 
buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc 1933; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 1934; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1935; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1] 1936; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1937; GFX11W32-NEXT: s_endpgm 1938entry: 1939 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1940 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0) 1941 store i32 %old, i32 addrspace(1)* %out 1942 ret void 1943} 1944