; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s

; Every configuration above compiles this file with the atomic optimizer enabled
; (-amdgpu-atomic-optimizations=true) and checks the assembly under its own
; prefix. The default CPU, tonga and gfx900 are each compiled once; gfx1010 and
; gfx1100 are compiled twice, once with +wavefrontsize64 and once with
; +wavefrontsize32, and each of those runs gets a W64 or W32 prefix in addition
; to the shared per-generation one.
; The check lines are generated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Intrinsic declarations. The two functions that are fully visible in this
; excerpt call only @llvm.amdgcn.raw.buffer.atomic.add; the remaining
; declarations are presumably used by tests further down the file - confirm
; before pruning any of them.
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32 immarg)

; Show what the atomic optimization pass will do for raw buffers.
; add_i32_constant: the kernel performs a raw buffer atomic add of the literal 5
; on the buffer described by %inout and stores the returned value to %out.
; Expected code shape on every prefix: one buffer atomic add, guarded by an exec
; save (s_and_saveexec or v_cmpx) on "mbcnt result == 0", whose operand is
; s_bcnt1 of the saved exec mask multiplied by 5 (s_mul_i32). After the branch
; rejoins, the result is rebuilt as readfirstlane(returned value) + mbcnt * 5
; with a single v_mad_u32_u24.
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: add_i32_constant:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[2:3], exec
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX6-NEXT: s_cbranch_execz .LBB0_2
; GFX6-NEXT: ; %bb.1:
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
; GFX6-NEXT: s_mul_i32 s0, s0, 5
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX6-NEXT: .LBB0_2:
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readfirstlane_b32 s0, v1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_cbranch_execz .LBB0_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
; GFX8-NEXT: s_mul_i32 s0, s0, 5
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[6:7], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
; GFX9-NEXT: s_mul_i32 s0, s0, 5
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
; GFX10W64: ; %bb.0: ; %entry
; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_mov_b64 s[6:7], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB0_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
; GFX10W64-NEXT: s_mul_i32 s0, s0, 5
; GFX10W64-NEXT: v_mov_b32_e32 v1, s0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX10W64-NEXT: .LBB0_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
; GFX10W32: ; %bb.0: ; %entry
; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_mov_b32 s5, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB0_2
; GFX10W32-NEXT: ; %bb.1:
; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5
; GFX10W32-NEXT: s_mul_i32 s0, s0, 5
; GFX10W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX10W32-NEXT: .LBB0_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
; GFX11W64: ; %bb.0: ; %entry
; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_mov_b64 s[6:7], exec
; GFX11W64-NEXT: s_mov_b64 s[4:5], exec
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB0_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: s_mul_i32 s0, s0, 5
; GFX11W64-NEXT: v_mov_b32_e32 v1, s0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB0_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: add_i32_constant:
; GFX11W32: ; %bb.0: ; %entry
; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_mov_b32 s5, exec_lo
; GFX11W32-NEXT: s_mov_b32 s4, exec_lo
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB0_2
; GFX11W32-NEXT: ; %bb.1:
; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: s_mul_i32 s0, s0, 5
; GFX11W32-NEXT: v_mov_b32_e32 v1, s0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB0_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; add_i32_uniform: same kernel shape as add_i32_constant, but the value added is
; the kernel argument %additive instead of a literal.
; Expected code shape: the argument is fetched with a scalar load, the guarded
; atomic's operand is that value multiplied by s_bcnt1 of the saved exec mask
; (s_mul_i32), and the result is rebuilt as
; readfirstlane(returned value) + %additive * mbcnt. The GFX6, GFX8 and GFX9
; checks do that with v_mul_lo_u32 followed by an add; the GFX10 and GFX11
; checks use one v_mad_u64_u32.
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
; GFX6-LABEL: add_i32_uniform:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[2:3], exec
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11
; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX6-NEXT: s_cbranch_execz .LBB1_2
; GFX6-NEXT: ; %bb.1:
; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd
; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mul_i32 s0, s8, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
; GFX6-NEXT: .LBB1_2:
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readfirstlane_b32 s0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44
; GFX8-NEXT: s_mov_b64 s[4:5], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX8-NEXT: s_cbranch_execz .LBB1_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mul_i32 s0, s8, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
; GFX8-NEXT: .LBB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44
; GFX9-NEXT: s_mov_b64 s[4:5], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX9-NEXT: s_cbranch_execz .LBB1_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mul_i32 s0, s8, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
; GFX9-NEXT: .LBB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
; GFX10W64: ; %bb.0: ; %entry
; GFX10W64-NEXT: s_clause 0x1
; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44
; GFX10W64-NEXT: s_mov_b64 s[4:5], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB1_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: s_mul_i32 s0, s8, s0
; GFX10W64-NEXT: v_mov_b32_e32 v1, s0
; GFX10W64-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
; GFX10W32: ; %bb.0: ; %entry
; GFX10W32-NEXT: s_clause 0x1
; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44
; GFX10W32-NEXT: s_mov_b32 s6, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB1_2
; GFX10W32-NEXT: ; %bb.1:
; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: s_mul_i32 s0, s4, s0
; GFX10W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
; GFX11W64: ; %bb.0: ; %entry
; GFX11W64-NEXT: s_clause 0x1
; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44
; GFX11W64-NEXT: s_mov_b64 s[4:5], exec
; GFX11W64-NEXT: s_mov_b64 s[6:7], exec
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB1_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34
; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: s_mul_i32 s0, s8, s0
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W64-NEXT: v_mov_b32_e32 v1, s0
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: add_i32_uniform:
; GFX11W32: ; %bb.0: ; %entry
; GFX11W32-NEXT: s_clause 0x1
; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44
; GFX11W32-NEXT: s_mov_b32 s6, exec_lo
; GFX11W32-NEXT: s_mov_b32 s5, exec_lo
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB1_2
; GFX11W32-NEXT: ; %bb.1:
; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_mul_i32 s0, s4, s0
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W32-NEXT: v_mov_b32_e32 v1, s0
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: add_i32_varying_vdata:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
; GFX6-NEXT: s_load_dwordx2
s[0:1], s[0:1], 0x9 446; GFX6-NEXT: s_waitcnt lgkmcnt(0) 447; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 448; GFX6-NEXT: s_mov_b32 s3, 0xf000 449; GFX6-NEXT: s_mov_b32 s2, -1 450; GFX6-NEXT: s_waitcnt vmcnt(0) 451; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 452; GFX6-NEXT: s_endpgm 453; 454; GFX8-LABEL: add_i32_varying_vdata: 455; GFX8: ; %bb.0: ; %entry 456; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 457; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 458; GFX8-NEXT: v_mov_b32_e32 v1, 0 459; GFX8-NEXT: s_mov_b64 exec, s[4:5] 460; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 461; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 462; GFX8-NEXT: v_mov_b32_e32 v2, v0 463; GFX8-NEXT: s_not_b64 exec, exec 464; GFX8-NEXT: v_mov_b32_e32 v2, 0 465; GFX8-NEXT: s_not_b64 exec, exec 466; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 467; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 468; GFX8-NEXT: s_nop 1 469; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 470; GFX8-NEXT: s_nop 1 471; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 472; GFX8-NEXT: s_nop 1 473; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 474; GFX8-NEXT: s_nop 1 475; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 476; GFX8-NEXT: s_nop 1 477; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 478; GFX8-NEXT: v_readlane_b32 s6, v2, 63 479; GFX8-NEXT: s_nop 0 480; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 481; GFX8-NEXT: s_mov_b64 exec, s[4:5] 482; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 483; GFX8-NEXT: ; implicit-def: $vgpr0 484; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 485; GFX8-NEXT: s_cbranch_execz .LBB2_2 486; GFX8-NEXT: ; %bb.1: 487; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 488; GFX8-NEXT: v_mov_b32_e32 v0, s6 489; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 490; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 491; GFX8-NEXT: .LBB2_2: 492; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 493; GFX8-NEXT: s_waitcnt vmcnt(0) 494; GFX8-NEXT: v_readfirstlane_b32 s0, v0 495; GFX8-NEXT: v_mov_b32_e32 v0, v1 496; GFX8-NEXT: s_waitcnt lgkmcnt(0) 497; GFX8-NEXT: v_mov_b32_e32 v4, s3 498; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 499; GFX8-NEXT: v_mov_b32_e32 v3, s2 500; GFX8-NEXT: flat_store_dword v[3:4], v0 501; GFX8-NEXT: s_endpgm 502; 503; GFX9-LABEL: add_i32_varying_vdata: 504; GFX9: ; %bb.0: ; %entry 505; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 506; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 507; GFX9-NEXT: v_mov_b32_e32 v1, 0 508; GFX9-NEXT: s_mov_b64 exec, s[4:5] 509; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 510; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 511; GFX9-NEXT: v_mov_b32_e32 v2, v0 512; GFX9-NEXT: s_not_b64 exec, exec 513; GFX9-NEXT: v_mov_b32_e32 v2, 0 514; GFX9-NEXT: s_not_b64 exec, exec 515; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 516; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 517; GFX9-NEXT: s_nop 1 518; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 519; GFX9-NEXT: s_nop 1 520; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 521; GFX9-NEXT: s_nop 1 522; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 523; GFX9-NEXT: s_nop 1 524; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 525; GFX9-NEXT: s_nop 1 526; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 527; GFX9-NEXT: v_readlane_b32 s6, v2, 63 528; GFX9-NEXT: s_nop 0 529; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 530; GFX9-NEXT: s_mov_b64 exec, s[4:5] 531; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 532; GFX9-NEXT: ; implicit-def: $vgpr0 533; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 534; GFX9-NEXT: 
s_cbranch_execz .LBB2_2 535; GFX9-NEXT: ; %bb.1: 536; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 537; GFX9-NEXT: v_mov_b32_e32 v0, s6 538; GFX9-NEXT: s_waitcnt lgkmcnt(0) 539; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 540; GFX9-NEXT: .LBB2_2: 541; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 542; GFX9-NEXT: s_waitcnt vmcnt(0) 543; GFX9-NEXT: v_readfirstlane_b32 s0, v0 544; GFX9-NEXT: v_mov_b32_e32 v0, v1 545; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 546; GFX9-NEXT: v_mov_b32_e32 v3, 0 547; GFX9-NEXT: s_waitcnt lgkmcnt(0) 548; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 549; GFX9-NEXT: s_endpgm 550; 551; GFX10W64-LABEL: add_i32_varying_vdata: 552; GFX10W64: ; %bb.0: ; %entry 553; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 554; GFX10W64-NEXT: s_not_b64 exec, exec 555; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 556; GFX10W64-NEXT: s_not_b64 exec, exec 557; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 558; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 559; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 560; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 561; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 562; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 563; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 564; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 565; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 566; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 567; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 568; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 569; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 570; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 571; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 572; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 573; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 574; GFX10W64-NEXT: 
v_readlane_b32 s7, v1, 31 575; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 576; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 577; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 578; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 579; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 580; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 581; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 582; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 583; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 584; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 585; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 586; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 587; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 588; GFX10W64-NEXT: ; implicit-def: $vgpr0 589; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 590; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 591; GFX10W64-NEXT: ; %bb.1: 592; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 593; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 594; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 595; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 596; GFX10W64-NEXT: .LBB2_2: 597; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 598; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 599; GFX10W64-NEXT: s_waitcnt vmcnt(0) 600; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 601; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 602; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 603; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 604; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 605; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 606; GFX10W64-NEXT: s_endpgm 607; 608; GFX10W32-LABEL: add_i32_varying_vdata: 609; GFX10W32: ; %bb.0: ; %entry 610; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 611; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 612; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 613; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 614; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 615; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 616; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 617; GFX10W32-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 618; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 619; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 620; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 621; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 622; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 623; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 624; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 625; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 626; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 627; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 628; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 629; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 630; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 631; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 632; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 633; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 634; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 635; GFX10W32-NEXT: ; implicit-def: $vgpr0 636; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 637; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 638; GFX10W32-NEXT: ; %bb.1: 639; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 640; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 641; GFX10W32-NEXT: s_mov_b32 s5, s6 642; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 643; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 644; GFX10W32-NEXT: .LBB2_2: 645; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 646; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 647; GFX10W32-NEXT: s_waitcnt vmcnt(0) 648; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 649; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 650; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 651; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 652; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 653; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 654; GFX10W32-NEXT: s_endpgm 655; 656; GFX11W64-LABEL: add_i32_varying_vdata: 657; GFX11W64: ; %bb.0: ; %entry 658; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 659; 
GFX11W64-NEXT: s_not_b64 exec, exec 660; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 661; GFX11W64-NEXT: s_not_b64 exec, exec 662; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 663; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 664; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 665; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 666; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 667; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 668; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 669; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 670; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 671; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 672; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 673; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 674; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 675; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 676; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 677; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 678; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 679; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) 680; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 681; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 682; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] 683; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 684; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 685; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 686; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 687; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 688; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 689; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 690; 
GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 691; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 692; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 693; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 694; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 695; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 696; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 697; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 698; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 699; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 700; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 701; GFX11W64-NEXT: ; implicit-def: $vgpr0 702; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 703; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 704; GFX11W64-NEXT: ; %bb.1: 705; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 706; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 707; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 708; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc 709; GFX11W64-NEXT: .LBB2_2: 710; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] 711; GFX11W64-NEXT: s_waitcnt vmcnt(0) 712; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 713; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 714; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 715; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 716; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 717; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 718; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] 719; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 720; GFX11W64-NEXT: s_endpgm 721; 722; GFX11W32-LABEL: add_i32_varying_vdata: 723; GFX11W32: ; %bb.0: ; %entry 724; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 725; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo 726; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 727; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo 728; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 729; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 730; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 731; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 732; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 733; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 734; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 735; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 736; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 737; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 738; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 739; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 740; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 741; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 742; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 743; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 744; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 745; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) 746; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 747; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 748; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 749; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 750; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 751; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 752; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 753; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 754; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 755; GFX11W32-NEXT: ; implicit-def: $vgpr0 756; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 757; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 758; GFX11W32-NEXT: ; %bb.1: 759; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 760; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 761; GFX11W32-NEXT: s_mov_b32 s5, s6 762; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 763; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc 764; GFX11W32-NEXT: .LBB2_2: 765; GFX11W32-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 766; GFX11W32-NEXT: s_waitcnt vmcnt(0) 767; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 768; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 769; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 770; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 771; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 772; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 773; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] 774; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 775; GFX11W32-NEXT: s_endpgm 776entry: 777 %lane = call i32 @llvm.amdgcn.workitem.id.x() 778 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) 779 store i32 %old, i32 addrspace(1)* %out 780 ret void 781} 782 783define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %vindex) { 784; GFX6-LABEL: struct_add_i32_varying_vdata: 785; GFX6: ; %bb.0: ; %entry 786; GFX6-NEXT: s_load_dword s2, s[0:1], 0x11 787; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 788; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 789; GFX6-NEXT: s_waitcnt lgkmcnt(0) 790; GFX6-NEXT: v_mov_b32_e32 v1, s2 791; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc 792; GFX6-NEXT: s_mov_b32 s3, 0xf000 793; GFX6-NEXT: s_mov_b32 s2, -1 794; GFX6-NEXT: s_waitcnt vmcnt(0) 795; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 796; GFX6-NEXT: s_endpgm 797; 798; GFX8-LABEL: struct_add_i32_varying_vdata: 799; GFX8: ; %bb.0: ; %entry 800; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 801; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 802; GFX8-NEXT: v_mov_b32_e32 v1, 0 803; GFX8-NEXT: s_mov_b64 exec, s[4:5] 804; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 805; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 806; GFX8-NEXT: v_mov_b32_e32 v2, v0 807; GFX8-NEXT: s_not_b64 exec, exec 808; GFX8-NEXT: v_mov_b32_e32 v2, 0 809; GFX8-NEXT: s_not_b64 exec, exec 810; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 811; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 812; 
GFX8-NEXT: s_nop 1 813; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 814; GFX8-NEXT: s_nop 1 815; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 816; GFX8-NEXT: s_nop 1 817; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 818; GFX8-NEXT: s_nop 1 819; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 820; GFX8-NEXT: s_nop 1 821; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 822; GFX8-NEXT: v_readlane_b32 s6, v2, 63 823; GFX8-NEXT: s_nop 0 824; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 825; GFX8-NEXT: s_mov_b64 exec, s[4:5] 826; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 827; GFX8-NEXT: ; implicit-def: $vgpr0 828; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 829; GFX8-NEXT: s_cbranch_execz .LBB3_2 830; GFX8-NEXT: ; %bb.1: 831; GFX8-NEXT: s_load_dword s7, s[0:1], 0x44 832; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 833; GFX8-NEXT: v_mov_b32_e32 v0, s6 834; GFX8-NEXT: s_waitcnt lgkmcnt(0) 835; GFX8-NEXT: v_mov_b32_e32 v3, s7 836; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc 837; GFX8-NEXT: .LBB3_2: 838; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 839; GFX8-NEXT: s_waitcnt vmcnt(0) 840; GFX8-NEXT: v_readfirstlane_b32 s0, v0 841; GFX8-NEXT: v_mov_b32_e32 v0, v1 842; GFX8-NEXT: s_waitcnt lgkmcnt(0) 843; GFX8-NEXT: v_mov_b32_e32 v4, s3 844; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 845; GFX8-NEXT: v_mov_b32_e32 v3, s2 846; GFX8-NEXT: flat_store_dword v[3:4], v0 847; GFX8-NEXT: s_endpgm 848; 849; GFX9-LABEL: struct_add_i32_varying_vdata: 850; GFX9: ; %bb.0: ; %entry 851; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 852; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 853; GFX9-NEXT: v_mov_b32_e32 v1, 0 854; GFX9-NEXT: s_mov_b64 exec, s[4:5] 855; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 856; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 857; 
GFX9-NEXT: v_mov_b32_e32 v2, v0 858; GFX9-NEXT: s_not_b64 exec, exec 859; GFX9-NEXT: v_mov_b32_e32 v2, 0 860; GFX9-NEXT: s_not_b64 exec, exec 861; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 862; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 863; GFX9-NEXT: s_nop 1 864; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 865; GFX9-NEXT: s_nop 1 866; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 867; GFX9-NEXT: s_nop 1 868; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 869; GFX9-NEXT: s_nop 1 870; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 871; GFX9-NEXT: s_nop 1 872; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 873; GFX9-NEXT: v_readlane_b32 s6, v2, 63 874; GFX9-NEXT: s_nop 0 875; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 876; GFX9-NEXT: s_mov_b64 exec, s[4:5] 877; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 878; GFX9-NEXT: ; implicit-def: $vgpr0 879; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 880; GFX9-NEXT: s_cbranch_execz .LBB3_2 881; GFX9-NEXT: ; %bb.1: 882; GFX9-NEXT: s_load_dword s7, s[0:1], 0x44 883; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 884; GFX9-NEXT: v_mov_b32_e32 v0, s6 885; GFX9-NEXT: s_waitcnt lgkmcnt(0) 886; GFX9-NEXT: v_mov_b32_e32 v3, s7 887; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc 888; GFX9-NEXT: .LBB3_2: 889; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 890; GFX9-NEXT: s_waitcnt vmcnt(0) 891; GFX9-NEXT: v_readfirstlane_b32 s0, v0 892; GFX9-NEXT: v_mov_b32_e32 v0, v1 893; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 894; GFX9-NEXT: v_mov_b32_e32 v3, 0 895; GFX9-NEXT: s_waitcnt lgkmcnt(0) 896; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 897; GFX9-NEXT: s_endpgm 898; 899; GFX10W64-LABEL: struct_add_i32_varying_vdata: 900; GFX10W64: ; %bb.0: ; %entry 901; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 902; 
GFX10W64-NEXT: s_not_b64 exec, exec 903; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 904; GFX10W64-NEXT: s_not_b64 exec, exec 905; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 906; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 907; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 908; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 909; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 910; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 911; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 912; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 913; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 914; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 915; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 916; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 917; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 918; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 919; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 920; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 921; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 922; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 923; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 924; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 925; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 926; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 927; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 928; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 929; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 930; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 931; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 932; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 933; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 934; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 935; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 936; GFX10W64-NEXT: ; implicit-def: $vgpr0 937; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 
938; GFX10W64-NEXT: s_cbranch_execz .LBB3_2 939; GFX10W64-NEXT: ; %bb.1: 940; GFX10W64-NEXT: s_clause 0x1 941; GFX10W64-NEXT: s_load_dword s7, s[0:1], 0x44 942; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 943; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 944; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 945; GFX10W64-NEXT: v_mov_b32_e32 v4, s7 946; GFX10W64-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc 947; GFX10W64-NEXT: .LBB3_2: 948; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 949; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 950; GFX10W64-NEXT: s_waitcnt vmcnt(0) 951; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 952; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 953; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 954; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 955; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 956; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 957; GFX10W64-NEXT: s_endpgm 958; 959; GFX10W32-LABEL: struct_add_i32_varying_vdata: 960; GFX10W32: ; %bb.0: ; %entry 961; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 962; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 963; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 964; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 965; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 966; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 967; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 968; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 969; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 970; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 971; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 972; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 973; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 974; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 975; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 976; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 977; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 978; 
GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 979; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 980; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 981; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 982; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 983; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 984; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 985; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 986; GFX10W32-NEXT: ; implicit-def: $vgpr0 987; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 988; GFX10W32-NEXT: s_cbranch_execz .LBB3_2 989; GFX10W32-NEXT: ; %bb.1: 990; GFX10W32-NEXT: s_mov_b32 s5, s6 991; GFX10W32-NEXT: s_clause 0x1 992; GFX10W32-NEXT: s_load_dword s6, s[0:1], 0x44 993; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 994; GFX10W32-NEXT: v_mov_b32_e32 v0, s5 995; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 996; GFX10W32-NEXT: v_mov_b32_e32 v4, s6 997; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc 998; GFX10W32-NEXT: .LBB3_2: 999; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1000; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1001; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1002; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 1003; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 1004; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 1005; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 1006; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1007; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 1008; GFX10W32-NEXT: s_endpgm 1009; 1010; GFX11W64-LABEL: struct_add_i32_varying_vdata: 1011; GFX11W64: ; %bb.0: ; %entry 1012; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 1013; GFX11W64-NEXT: s_not_b64 exec, exec 1014; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1015; GFX11W64-NEXT: s_not_b64 exec, exec 1016; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 1017; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1018; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1019; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 1020; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1021; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1022; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1023; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1024; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1025; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 1026; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1027; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1028; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1029; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 1030; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1031; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 1032; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1033; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) 1034; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 1035; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1036; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] 1037; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1038; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1039; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 1040; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 1041; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 1042; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1043; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1044; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1045; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 1046; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 1047; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 1048; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 1049; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 1050; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1051; GFX11W64-NEXT: s_or_saveexec_b64 
s[4:5], -1 1052; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 1053; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 1054; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1055; GFX11W64-NEXT: ; implicit-def: $vgpr0 1056; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1057; GFX11W64-NEXT: s_cbranch_execz .LBB3_2 1058; GFX11W64-NEXT: ; %bb.1: 1059; GFX11W64-NEXT: s_clause 0x1 1060; GFX11W64-NEXT: s_load_b32 s7, s[0:1], 0x44 1061; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 1062; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 1063; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1064; GFX11W64-NEXT: v_mov_b32_e32 v4, s7 1065; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc 1066; GFX11W64-NEXT: .LBB3_2: 1067; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] 1068; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1069; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 1070; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 1071; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 1072; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1073; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 1074; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1075; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] 1076; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1077; GFX11W64-NEXT: s_endpgm 1078; 1079; GFX11W32-LABEL: struct_add_i32_varying_vdata: 1080; GFX11W32: ; %bb.0: ; %entry 1081; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 1082; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo 1083; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 1084; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo 1085; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 1086; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1087; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1088; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1089; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1090; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1091; 
GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1092; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1093; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 1094; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1095; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 1096; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1097; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 1098; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1099; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1100; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 1101; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 1102; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) 1103; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1104; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 1105; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 1106; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1107; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 1108; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1109; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 1110; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 1111; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1112; GFX11W32-NEXT: ; implicit-def: $vgpr0 1113; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1114; GFX11W32-NEXT: s_cbranch_execz .LBB3_2 1115; GFX11W32-NEXT: ; %bb.1: 1116; GFX11W32-NEXT: s_mov_b32 s5, s6 1117; GFX11W32-NEXT: s_clause 0x1 1118; GFX11W32-NEXT: s_load_b32 s6, s[0:1], 0x44 1119; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 1120; GFX11W32-NEXT: v_mov_b32_e32 v0, s5 1121; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1122; GFX11W32-NEXT: v_mov_b32_e32 v4, s6 1123; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc 1124; GFX11W32-NEXT: .LBB3_2: 1125; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1126; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1127; GFX11W32-NEXT: 
v_readfirstlane_b32 s0, v0 1128; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 1129; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 1130; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1131; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 1132; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1133; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] 1134; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1135; GFX11W32-NEXT: s_endpgm 1136entry: 1137 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1138 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 %vindex, i32 0, i32 0, i32 0) 1139 store i32 %old, i32 addrspace(1)* %out 1140 ret void 1141} 1142 1143define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 1144; GFX6-LABEL: add_i32_varying_offset: 1145; GFX6: ; %bb.0: ; %entry 1146; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1147; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1148; GFX6-NEXT: v_mov_b32_e32 v1, 1 1149; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1150; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 1151; GFX6-NEXT: s_mov_b32 s3, 0xf000 1152; GFX6-NEXT: s_mov_b32 s2, -1 1153; GFX6-NEXT: s_waitcnt vmcnt(0) 1154; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 1155; GFX6-NEXT: s_endpgm 1156; 1157; GFX8-LABEL: add_i32_varying_offset: 1158; GFX8: ; %bb.0: ; %entry 1159; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1160; GFX8-NEXT: v_mov_b32_e32 v2, 1 1161; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1162; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1163; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc 1164; GFX8-NEXT: v_mov_b32_e32 v0, s0 1165; GFX8-NEXT: v_mov_b32_e32 v1, s1 1166; GFX8-NEXT: s_waitcnt vmcnt(0) 1167; GFX8-NEXT: flat_store_dword v[0:1], v2 1168; GFX8-NEXT: s_endpgm 1169; 1170; GFX9-LABEL: add_i32_varying_offset: 1171; GFX9: ; %bb.0: ; %entry 1172; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1173; GFX9-NEXT: v_mov_b32_e32 v1, 1 1174; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1175; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) 1176; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 1177; GFX9-NEXT: v_mov_b32_e32 v0, 0 1178; GFX9-NEXT: s_waitcnt vmcnt(0) 1179; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1180; GFX9-NEXT: s_endpgm 1181; 1182; GFX10-LABEL: add_i32_varying_offset: 1183; GFX10: ; %bb.0: ; %entry 1184; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1185; GFX10-NEXT: v_mov_b32_e32 v1, 1 1186; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1187; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1188; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 1189; GFX10-NEXT: v_mov_b32_e32 v0, 0 1190; GFX10-NEXT: s_waitcnt vmcnt(0) 1191; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1192; GFX10-NEXT: s_endpgm 1193; 1194; GFX11-LABEL: add_i32_varying_offset: 1195; GFX11: ; %bb.0: ; %entry 1196; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 1197; GFX11-NEXT: v_mov_b32_e32 v1, 1 1198; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1199; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1200; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc 1201; GFX11-NEXT: v_mov_b32_e32 v0, 0 1202; GFX11-NEXT: s_waitcnt vmcnt(0) 1203; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1204; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1205; GFX11-NEXT: s_endpgm 1206entry: 1207 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1208 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) 1209 store i32 %old, i32 addrspace(1)* %out 1210 ret void 1211} 1212 1213define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { 1214; GFX6-LABEL: sub_i32_constant: 1215; GFX6: ; %bb.0: ; %entry 1216; GFX6-NEXT: s_mov_b64 s[2:3], exec 1217; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1218; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1219; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1220; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1221; GFX6-NEXT: ; implicit-def: $vgpr1 1222; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 1223; GFX6-NEXT: 
s_cbranch_execz .LBB5_2 1224; GFX6-NEXT: ; %bb.1: 1225; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 1226; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 1227; GFX6-NEXT: s_mul_i32 s0, s0, 5 1228; GFX6-NEXT: v_mov_b32_e32 v1, s0 1229; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1230; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1231; GFX6-NEXT: .LBB5_2: 1232; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 1233; GFX6-NEXT: s_waitcnt vmcnt(0) 1234; GFX6-NEXT: v_readfirstlane_b32 s0, v1 1235; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1236; GFX6-NEXT: s_mov_b32 s7, 0xf000 1237; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1238; GFX6-NEXT: s_mov_b32 s6, -1 1239; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1240; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 1241; GFX6-NEXT: s_endpgm 1242; 1243; GFX8-LABEL: sub_i32_constant: 1244; GFX8: ; %bb.0: ; %entry 1245; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1246; GFX8-NEXT: s_mov_b64 s[6:7], exec 1247; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1248; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1249; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1250; GFX8-NEXT: ; implicit-def: $vgpr1 1251; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1252; GFX8-NEXT: s_cbranch_execz .LBB5_2 1253; GFX8-NEXT: ; %bb.1: 1254; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1255; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 1256; GFX8-NEXT: s_mul_i32 s0, s0, 5 1257; GFX8-NEXT: v_mov_b32_e32 v1, s0 1258; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1259; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1260; GFX8-NEXT: .LBB5_2: 1261; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1262; GFX8-NEXT: s_waitcnt vmcnt(0) 1263; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1264; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1265; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 1266; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1267; GFX8-NEXT: v_mov_b32_e32 v0, s2 1268; GFX8-NEXT: v_mov_b32_e32 v1, s3 1269; GFX8-NEXT: flat_store_dword v[0:1], v2 1270; GFX8-NEXT: s_endpgm 1271; 1272; GFX9-LABEL: sub_i32_constant: 1273; GFX9: ; %bb.0: ; %entry 1274; GFX9-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 1275; GFX9-NEXT: s_mov_b64 s[6:7], exec 1276; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1277; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1278; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1279; GFX9-NEXT: ; implicit-def: $vgpr1 1280; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1281; GFX9-NEXT: s_cbranch_execz .LBB5_2 1282; GFX9-NEXT: ; %bb.1: 1283; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1284; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 1285; GFX9-NEXT: s_mul_i32 s0, s0, 5 1286; GFX9-NEXT: v_mov_b32_e32 v1, s0 1287; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1288; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1289; GFX9-NEXT: .LBB5_2: 1290; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1291; GFX9-NEXT: s_waitcnt vmcnt(0) 1292; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1293; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1294; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1295; GFX9-NEXT: v_mov_b32_e32 v1, 0 1296; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1297; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 1298; GFX9-NEXT: s_endpgm 1299; 1300; GFX10W64-LABEL: sub_i32_constant: 1301; GFX10W64: ; %bb.0: ; %entry 1302; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1303; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 1304; GFX10W64-NEXT: ; implicit-def: $vgpr1 1305; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1306; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1307; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1308; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1309; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 1310; GFX10W64-NEXT: ; %bb.1: 1311; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1312; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 1313; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 1314; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 1315; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1316; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1317; GFX10W64-NEXT: .LBB5_2: 1318; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1319; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 1320; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1321; 
GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 1322; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1323; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1324; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1325; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1326; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 1327; GFX10W64-NEXT: s_endpgm 1328; 1329; GFX10W32-LABEL: sub_i32_constant: 1330; GFX10W32: ; %bb.0: ; %entry 1331; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1332; GFX10W32-NEXT: s_mov_b32 s5, exec_lo 1333; GFX10W32-NEXT: ; implicit-def: $vgpr1 1334; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 1335; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1336; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1337; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 1338; GFX10W32-NEXT: ; %bb.1: 1339; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1340; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 1341; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 1342; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 1343; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1344; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1345; GFX10W32-NEXT: .LBB5_2: 1346; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1347; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1348; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1349; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 1350; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1351; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1352; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1353; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1354; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 1355; GFX10W32-NEXT: s_endpgm 1356; 1357; GFX11W64-LABEL: sub_i32_constant: 1358; GFX11W64: ; %bb.0: ; %entry 1359; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1360; GFX11W64-NEXT: s_mov_b64 s[6:7], exec 1361; GFX11W64-NEXT: s_mov_b64 s[4:5], exec 1362; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1363; GFX11W64-NEXT: ; implicit-def: $vgpr1 1364; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1365; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1366; 
GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 1367; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 1368; GFX11W64-NEXT: ; %bb.1: 1369; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 1370; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 1371; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1372; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 1373; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 1374; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1375; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1376; GFX11W64-NEXT: .LBB5_2: 1377; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] 1378; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1379; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 1380; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1381; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1382; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1383; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1384; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1385; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] 1386; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1387; GFX11W64-NEXT: s_endpgm 1388; 1389; GFX11W32-LABEL: sub_i32_constant: 1390; GFX11W32: ; %bb.0: ; %entry 1391; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1392; GFX11W32-NEXT: s_mov_b32 s5, exec_lo 1393; GFX11W32-NEXT: s_mov_b32 s4, exec_lo 1394; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 1395; GFX11W32-NEXT: ; implicit-def: $vgpr1 1396; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1397; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 1398; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 1399; GFX11W32-NEXT: ; %bb.1: 1400; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 1401; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 1402; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1403; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 1404; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 1405; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1406; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1407; GFX11W32-NEXT: .LBB5_2: 1408; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, 
s4 1409; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1410; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 1411; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1412; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 1413; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1414; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1415; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1416; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] 1417; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1418; GFX11W32-NEXT: s_endpgm 1419entry: 1420 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) 1421 store i32 %old, i32 addrspace(1)* %out 1422 ret void 1423} 1424 1425define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) { 1426; GFX6-LABEL: sub_i32_uniform: 1427; GFX6: ; %bb.0: ; %entry 1428; GFX6-NEXT: s_mov_b64 s[2:3], exec 1429; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1430; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 1431; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1432; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1433; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1434; GFX6-NEXT: ; implicit-def: $vgpr1 1435; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 1436; GFX6-NEXT: s_cbranch_execz .LBB6_2 1437; GFX6-NEXT: ; %bb.1: 1438; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd 1439; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 1440; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1441; GFX6-NEXT: s_mul_i32 s0, s8, s0 1442; GFX6-NEXT: v_mov_b32_e32 v1, s0 1443; GFX6-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1444; GFX6-NEXT: .LBB6_2: 1445; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 1446; GFX6-NEXT: s_waitcnt vmcnt(0) 1447; GFX6-NEXT: v_readfirstlane_b32 s0, v1 1448; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1449; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 1450; GFX6-NEXT: s_mov_b32 s7, 0xf000 1451; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1452; GFX6-NEXT: s_mov_b32 s6, -1 1453; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 1454; GFX6-NEXT: s_endpgm 1455; 1456; GFX8-LABEL: 
sub_i32_uniform: 1457; GFX8: ; %bb.0: ; %entry 1458; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1459; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 1460; GFX8-NEXT: s_mov_b64 s[4:5], exec 1461; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1462; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1463; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1464; GFX8-NEXT: ; implicit-def: $vgpr1 1465; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 1466; GFX8-NEXT: s_cbranch_execz .LBB6_2 1467; GFX8-NEXT: ; %bb.1: 1468; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 1469; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 1470; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1471; GFX8-NEXT: s_mul_i32 s0, s8, s0 1472; GFX8-NEXT: v_mov_b32_e32 v1, s0 1473; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1474; GFX8-NEXT: .LBB6_2: 1475; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 1476; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1477; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 1478; GFX8-NEXT: s_waitcnt vmcnt(0) 1479; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1480; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 1481; GFX8-NEXT: v_mov_b32_e32 v0, s2 1482; GFX8-NEXT: v_mov_b32_e32 v1, s3 1483; GFX8-NEXT: flat_store_dword v[0:1], v2 1484; GFX8-NEXT: s_endpgm 1485; 1486; GFX9-LABEL: sub_i32_uniform: 1487; GFX9: ; %bb.0: ; %entry 1488; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1489; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 1490; GFX9-NEXT: s_mov_b64 s[4:5], exec 1491; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1492; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1493; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1494; GFX9-NEXT: ; implicit-def: $vgpr1 1495; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 1496; GFX9-NEXT: s_cbranch_execz .LBB6_2 1497; GFX9-NEXT: ; %bb.1: 1498; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 1499; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 1500; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1501; GFX9-NEXT: s_mul_i32 s0, s8, s0 1502; GFX9-NEXT: v_mov_b32_e32 v1, s0 1503; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1504; GFX9-NEXT: .LBB6_2: 1505; GFX9-NEXT: 
s_or_b64 exec, exec, s[6:7] 1506; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1507; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 1508; GFX9-NEXT: s_waitcnt vmcnt(0) 1509; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1510; GFX9-NEXT: v_mov_b32_e32 v1, 0 1511; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1512; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 1513; GFX9-NEXT: s_endpgm 1514; 1515; GFX10W64-LABEL: sub_i32_uniform: 1516; GFX10W64: ; %bb.0: ; %entry 1517; GFX10W64-NEXT: s_clause 0x1 1518; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1519; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 1520; GFX10W64-NEXT: s_mov_b64 s[4:5], exec 1521; GFX10W64-NEXT: ; implicit-def: $vgpr1 1522; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1523; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1524; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1525; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc 1526; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 1527; GFX10W64-NEXT: ; %bb.1: 1528; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 1529; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 1530; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1531; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 1532; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 1533; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1534; GFX10W64-NEXT: .LBB6_2: 1535; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1536; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] 1537; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1538; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 1539; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1540; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 1541; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1542; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1543; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 1544; GFX10W64-NEXT: s_endpgm 1545; 1546; GFX10W32-LABEL: sub_i32_uniform: 1547; GFX10W32: ; %bb.0: ; %entry 1548; GFX10W32-NEXT: s_clause 0x1 1549; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1550; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 1551; GFX10W32-NEXT: s_mov_b32 s6, exec_lo 1552; GFX10W32-NEXT: ; 
implicit-def: $vgpr1 1553; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1554; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1555; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo 1556; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 1557; GFX10W32-NEXT: ; %bb.1: 1558; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1559; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 1560; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1561; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 1562; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 1563; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1564; GFX10W32-NEXT: .LBB6_2: 1565; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1566; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 1567; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1568; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 1569; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1570; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 1571; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1572; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1573; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 1574; GFX10W32-NEXT: s_endpgm 1575; 1576; GFX11W64-LABEL: sub_i32_uniform: 1577; GFX11W64: ; %bb.0: ; %entry 1578; GFX11W64-NEXT: s_clause 0x1 1579; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1580; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 1581; GFX11W64-NEXT: s_mov_b64 s[4:5], exec 1582; GFX11W64-NEXT: s_mov_b64 s[6:7], exec 1583; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1584; GFX11W64-NEXT: ; implicit-def: $vgpr1 1585; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1586; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1587; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 1588; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 1589; GFX11W64-NEXT: ; %bb.1: 1590; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 1591; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 1592; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1593; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 1594; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1595; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 1596; GFX11W64-NEXT: 
buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc 1597; GFX11W64-NEXT: .LBB6_2: 1598; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7] 1599; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1600; GFX11W64-NEXT: v_mul_lo_u32 v0, s8, v0 1601; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1602; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 1603; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1604; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1605; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1606; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] 1607; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1608; GFX11W64-NEXT: s_endpgm 1609; 1610; GFX11W32-LABEL: sub_i32_uniform: 1611; GFX11W32: ; %bb.0: ; %entry 1612; GFX11W32-NEXT: s_clause 0x1 1613; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1614; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 1615; GFX11W32-NEXT: s_mov_b32 s6, exec_lo 1616; GFX11W32-NEXT: s_mov_b32 s5, exec_lo 1617; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1618; GFX11W32-NEXT: ; implicit-def: $vgpr1 1619; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1620; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 1621; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 1622; GFX11W32-NEXT: ; %bb.1: 1623; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 1624; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6 1625; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1626; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 1627; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1628; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 1629; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1630; GFX11W32-NEXT: .LBB6_2: 1631; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 1632; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1633; GFX11W32-NEXT: v_mul_lo_u32 v0, s4, v0 1634; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1635; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 1636; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 1637; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1638; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1639; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] 1640; GFX11W32-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) 1641; GFX11W32-NEXT: s_endpgm 1642entry: 1643 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0) 1644 store i32 %old, i32 addrspace(1)* %out 1645 ret void 1646} 1647
; Raw-buffer atomic sub whose data operand differs per lane (the workitem id);
; both offset operands are 0. The GFX6 checks expect the atomic to be emitted
; as-is with no lane-0 branch. The GFX8 and newer prefixes expect a sequence of
; DPP adds over the operand, a v_readlane of the last lane (63 for the 64-wide
; prefixes, 31 for the W32 prefixes), and then one buffer_atomic_sub inside an
; "mbcnt == 0" guarded branch; the stored result is readfirstlane(atomic
; result) minus the per-lane value produced by the DPP sequence.
1648define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { 1649; GFX6-LABEL: sub_i32_varying_vdata: 1650; GFX6: ; %bb.0: ; %entry 1651; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1652; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1653; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1654; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 1655; GFX6-NEXT: s_mov_b32 s3, 0xf000 1656; GFX6-NEXT: s_mov_b32 s2, -1 1657; GFX6-NEXT: s_waitcnt vmcnt(0) 1658; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 1659; GFX6-NEXT: s_endpgm 1660; 1661; GFX8-LABEL: sub_i32_varying_vdata: 1662; GFX8: ; %bb.0: ; %entry 1663; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1664; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1665; GFX8-NEXT: v_mov_b32_e32 v1, 0 1666; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1667; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1668; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1669; GFX8-NEXT: v_mov_b32_e32 v2, v0 1670; GFX8-NEXT: s_not_b64 exec, exec 1671; GFX8-NEXT: v_mov_b32_e32 v2, 0 1672; GFX8-NEXT: s_not_b64 exec, exec 1673; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1674; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1675; GFX8-NEXT: s_nop 1 1676; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1677; GFX8-NEXT: s_nop 1 1678; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1679; GFX8-NEXT: s_nop 1 1680; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1681; GFX8-NEXT: s_nop 1 1682; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1683; GFX8-NEXT: s_nop 1 1684; GFX8-NEXT: v_add_u32_dpp v2, vcc, 
v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1685; GFX8-NEXT: v_readlane_b32 s6, v2, 63 1686; GFX8-NEXT: s_nop 0 1687; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1688; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1689; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1690; GFX8-NEXT: ; implicit-def: $vgpr0 1691; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1692; GFX8-NEXT: s_cbranch_execz .LBB7_2 1693; GFX8-NEXT: ; %bb.1: 1694; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1695; GFX8-NEXT: v_mov_b32_e32 v0, s6 1696; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1697; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1698; GFX8-NEXT: .LBB7_2: 1699; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1700; GFX8-NEXT: s_waitcnt vmcnt(0) 1701; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1702; GFX8-NEXT: v_mov_b32_e32 v0, v1 1703; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1704; GFX8-NEXT: v_mov_b32_e32 v4, s3 1705; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1706; GFX8-NEXT: v_mov_b32_e32 v3, s2 1707; GFX8-NEXT: flat_store_dword v[3:4], v0 1708; GFX8-NEXT: s_endpgm 1709; 1710; GFX9-LABEL: sub_i32_varying_vdata: 1711; GFX9: ; %bb.0: ; %entry 1712; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1713; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1714; GFX9-NEXT: v_mov_b32_e32 v1, 0 1715; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1716; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1717; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1718; GFX9-NEXT: v_mov_b32_e32 v2, v0 1719; GFX9-NEXT: s_not_b64 exec, exec 1720; GFX9-NEXT: v_mov_b32_e32 v2, 0 1721; GFX9-NEXT: s_not_b64 exec, exec 1722; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1723; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1724; GFX9-NEXT: s_nop 1 1725; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1726; GFX9-NEXT: s_nop 1 1727; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1728; GFX9-NEXT: s_nop 1 1729; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 
row_mask:0xf bank_mask:0xf bound_ctrl:1 1730; GFX9-NEXT: s_nop 1 1731; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1732; GFX9-NEXT: s_nop 1 1733; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1734; GFX9-NEXT: v_readlane_b32 s6, v2, 63 1735; GFX9-NEXT: s_nop 0 1736; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1737; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1738; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1739; GFX9-NEXT: ; implicit-def: $vgpr0 1740; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1741; GFX9-NEXT: s_cbranch_execz .LBB7_2 1742; GFX9-NEXT: ; %bb.1: 1743; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1744; GFX9-NEXT: v_mov_b32_e32 v0, s6 1745; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1746; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1747; GFX9-NEXT: .LBB7_2: 1748; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1749; GFX9-NEXT: s_waitcnt vmcnt(0) 1750; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1751; GFX9-NEXT: v_mov_b32_e32 v0, v1 1752; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1753; GFX9-NEXT: v_mov_b32_e32 v3, 0 1754; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1755; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 1756; GFX9-NEXT: s_endpgm 1757; 1758; GFX10W64-LABEL: sub_i32_varying_vdata: 1759; GFX10W64: ; %bb.0: ; %entry 1760; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 1761; GFX10W64-NEXT: s_not_b64 exec, exec 1762; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1763; GFX10W64-NEXT: s_not_b64 exec, exec 1764; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 1765; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1766; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 1767; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1768; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1769; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1770; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 1771; 
GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1772; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1773; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 1774; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 1775; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1776; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 1777; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1778; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 1779; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1780; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1781; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 1782; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 1783; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1784; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1785; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1786; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 1787; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 1788; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 1789; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1790; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1791; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1792; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 1793; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1794; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1795; GFX10W64-NEXT: ; implicit-def: $vgpr0 1796; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1797; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 1798; GFX10W64-NEXT: ; %bb.1: 1799; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1800; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 1801; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1802; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1803; GFX10W64-NEXT: .LBB7_2: 1804; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1805; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 1806; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1807; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 1808; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 1809; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 1810; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, 
v0 1811; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1812; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 1813; GFX10W64-NEXT: s_endpgm 1814; 1815; GFX10W32-LABEL: sub_i32_varying_vdata: 1816; GFX10W32: ; %bb.0: ; %entry 1817; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 1818; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 1819; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1820; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 1821; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 1822; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1823; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1824; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1825; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1826; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 1827; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1828; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 1829; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1830; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 1831; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1832; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 1833; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 1834; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1835; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 1836; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 1837; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1838; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 1839; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 1840; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 1841; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1842; GFX10W32-NEXT: ; implicit-def: $vgpr0 1843; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1844; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 1845; GFX10W32-NEXT: ; %bb.1: 1846; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1847; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 1848; GFX10W32-NEXT: s_mov_b32 s5, s6 1849; 
GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1850; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1851; GFX10W32-NEXT: .LBB7_2: 1852; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1853; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1854; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1855; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 1856; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 1857; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 1858; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1859; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1860; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 1861; GFX10W32-NEXT: s_endpgm 1862; 1863; GFX11W64-LABEL: sub_i32_varying_vdata: 1864; GFX11W64: ; %bb.0: ; %entry 1865; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 1866; GFX11W64-NEXT: s_not_b64 exec, exec 1867; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1868; GFX11W64-NEXT: s_not_b64 exec, exec 1869; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 1870; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1871; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1872; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 1873; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1874; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1875; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1876; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1877; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1878; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 1879; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1880; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1881; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1882; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 1883; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) 1884; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 1885; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1886; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) 1887; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 1888; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1889; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] 1890; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1891; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1892; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 1893; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 1894; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 1895; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1896; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1897; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1898; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 1899; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 1900; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 1901; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 1902; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 1903; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1904; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1905; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 1906; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 1907; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1908; GFX11W64-NEXT: ; implicit-def: $vgpr0 1909; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1910; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 1911; GFX11W64-NEXT: ; %bb.1: 1912; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 1913; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 1914; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1915; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc 1916; GFX11W64-NEXT: .LBB7_2: 1917; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] 1918; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1919; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 1920; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 1921; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 1922; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1923; 
GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1924; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1925; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] 1926; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1927; GFX11W64-NEXT: s_endpgm 1928; 1929; GFX11W32-LABEL: sub_i32_varying_vdata: 1930; GFX11W32: ; %bb.0: ; %entry 1931; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 1932; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo 1933; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 1934; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo 1935; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 1936; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1937; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1938; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1939; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1940; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1941; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1942; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1943; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 1944; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1945; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 1946; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1947; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 1948; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1949; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1950; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 1951; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 1952; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) 1953; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1954; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 1955; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 1956; 
GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1957; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 1958; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1959; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 1960; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 1961; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1962; GFX11W32-NEXT: ; implicit-def: $vgpr0 1963; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1964; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 1965; GFX11W32-NEXT: ; %bb.1: 1966; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 1967; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 1968; GFX11W32-NEXT: s_mov_b32 s5, s6 1969; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1970; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc 1971; GFX11W32-NEXT: .LBB7_2: 1972; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1973; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1974; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 1975; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 1976; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 1977; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1978; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1979; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1980; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] 1981; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1982; GFX11W32-NEXT: s_endpgm 1983entry: 1984 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1985 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) 1986 store i32 %old, i32 addrspace(1)* %out 1987 ret void 1988} 1989
; Raw-buffer atomic sub with a constant data operand (1) but a per-lane offset
; operand (the workitem id). Every check prefix for this function expects the
; atomic to be emitted directly, using "offen" addressing, with no mbcnt test
; and no guarded branch around it. The gfx1010 and gfx1100 runs are checked
; through the shared GFX10 / GFX11 prefixes rather than per-wave-size ones.
1990define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 1991; GFX6-LABEL: sub_i32_varying_offset: 1992; GFX6: ; %bb.0: ; %entry 1993; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1994; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1995; GFX6-NEXT: v_mov_b32_e32 v1, 1 1996; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1997; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 1998; GFX6-NEXT: s_mov_b32 s3, 0xf000 1999; GFX6-NEXT: s_mov_b32 
s2, -1 2000; GFX6-NEXT: s_waitcnt vmcnt(0) 2001; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 2002; GFX6-NEXT: s_endpgm 2003; 2004; GFX8-LABEL: sub_i32_varying_offset: 2005; GFX8: ; %bb.0: ; %entry 2006; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 2007; GFX8-NEXT: v_mov_b32_e32 v2, 1 2008; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2009; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2010; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc 2011; GFX8-NEXT: v_mov_b32_e32 v0, s0 2012; GFX8-NEXT: v_mov_b32_e32 v1, s1 2013; GFX8-NEXT: s_waitcnt vmcnt(0) 2014; GFX8-NEXT: flat_store_dword v[0:1], v2 2015; GFX8-NEXT: s_endpgm 2016; 2017; GFX9-LABEL: sub_i32_varying_offset: 2018; GFX9: ; %bb.0: ; %entry 2019; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 2020; GFX9-NEXT: v_mov_b32_e32 v1, 1 2021; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2022; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2023; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 2024; GFX9-NEXT: v_mov_b32_e32 v0, 0 2025; GFX9-NEXT: s_waitcnt vmcnt(0) 2026; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2027; GFX9-NEXT: s_endpgm 2028; 2029; GFX10-LABEL: sub_i32_varying_offset: 2030; GFX10: ; %bb.0: ; %entry 2031; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 2032; GFX10-NEXT: v_mov_b32_e32 v1, 1 2033; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2034; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2035; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 2036; GFX10-NEXT: v_mov_b32_e32 v0, 0 2037; GFX10-NEXT: s_waitcnt vmcnt(0) 2038; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 2039; GFX10-NEXT: s_endpgm 2040; 2041; GFX11-LABEL: sub_i32_varying_offset: 2042; GFX11: ; %bb.0: ; %entry 2043; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 2044; GFX11-NEXT: v_mov_b32_e32 v1, 1 2045; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2046; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2047; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc 2048; GFX11-NEXT: v_mov_b32_e32 v0, 0 2049; GFX11-NEXT: s_waitcnt vmcnt(0) 2050; 
GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2051; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2052; GFX11-NEXT: s_endpgm 2053entry: 2054 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2055 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) 2056 store i32 %old, i32 addrspace(1)* %out 2057 ret void 2058} 2059