; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s

; Every invocation above runs llc with -amdgpu-atomic-optimizations=true and
; -verify-machineinstrs, and pipes the result to FileCheck under its own check
; prefix. The first invocation passes no -mcpu; the others select tonga,
; gfx900, gfx1010 and gfx1100. gfx1010 and gfx1100 are each compiled twice,
; once with +wavefrontsize64 and once with +wavefrontsize32; the two runs of a
; subtarget share one common prefix in addition to a per-wavefront-size prefix.
; The assertions in this file are generated output: regenerate them with the
; script named in the first line instead of editing them by hand.

; AMDGPU intrinsics declared for the kernels in this file. The kernels call
; raw.buffer.atomic.add with the addend as the first operand, the <4 x i32>
; kernel argument as the second, and constant zero for the remaining three
; i32 operands.
; NOTE(review): the last three operands are presumably voffset, soffset and
; the cache-policy immediate; confirm against the AMDGPU intrinsic definitions.
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32)

; Show what the atomic optimization pass will do for raw buffers.
15 16define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { 17; GFX6-LABEL: add_i32_constant: 18; GFX6: ; %bb.0: ; %entry 19; GFX6-NEXT: s_mov_b64 s[2:3], exec 20; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 21; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 22; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 23; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 24; GFX6-NEXT: ; implicit-def: $vgpr1 25; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 26; GFX6-NEXT: s_cbranch_execz .LBB0_2 27; GFX6-NEXT: ; %bb.1: 28; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 29; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 30; GFX6-NEXT: s_mul_i32 s0, s0, 5 31; GFX6-NEXT: v_mov_b32_e32 v1, s0 32; GFX6-NEXT: s_waitcnt lgkmcnt(0) 33; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 34; GFX6-NEXT: .LBB0_2: 35; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 36; GFX6-NEXT: s_waitcnt vmcnt(0) 37; GFX6-NEXT: v_readfirstlane_b32 s0, v1 38; GFX6-NEXT: s_mov_b32 s7, 0xf000 39; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0 40; GFX6-NEXT: s_mov_b32 s6, -1 41; GFX6-NEXT: s_waitcnt lgkmcnt(0) 42; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 43; GFX6-NEXT: s_endpgm 44; 45; GFX8-LABEL: add_i32_constant: 46; GFX8: ; %bb.0: ; %entry 47; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 48; GFX8-NEXT: s_mov_b64 s[6:7], exec 49; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 50; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 51; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 52; GFX8-NEXT: ; implicit-def: $vgpr1 53; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 54; GFX8-NEXT: s_cbranch_execz .LBB0_2 55; GFX8-NEXT: ; %bb.1: 56; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 57; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 58; GFX8-NEXT: s_mul_i32 s0, s0, 5 59; GFX8-NEXT: v_mov_b32_e32 v1, s0 60; GFX8-NEXT: s_waitcnt lgkmcnt(0) 61; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 62; GFX8-NEXT: .LBB0_2: 63; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 64; GFX8-NEXT: s_waitcnt vmcnt(0) 65; GFX8-NEXT: v_readfirstlane_b32 s0, v1 66; 
GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0 67; GFX8-NEXT: s_waitcnt lgkmcnt(0) 68; GFX8-NEXT: v_mov_b32_e32 v0, s2 69; GFX8-NEXT: v_mov_b32_e32 v1, s3 70; GFX8-NEXT: flat_store_dword v[0:1], v2 71; GFX8-NEXT: s_endpgm 72; 73; GFX9-LABEL: add_i32_constant: 74; GFX9: ; %bb.0: ; %entry 75; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 76; GFX9-NEXT: s_mov_b64 s[6:7], exec 77; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 78; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 79; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 80; GFX9-NEXT: ; implicit-def: $vgpr1 81; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 82; GFX9-NEXT: s_cbranch_execz .LBB0_2 83; GFX9-NEXT: ; %bb.1: 84; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 85; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 86; GFX9-NEXT: s_mul_i32 s0, s0, 5 87; GFX9-NEXT: v_mov_b32_e32 v1, s0 88; GFX9-NEXT: s_waitcnt lgkmcnt(0) 89; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 90; GFX9-NEXT: .LBB0_2: 91; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 92; GFX9-NEXT: s_waitcnt vmcnt(0) 93; GFX9-NEXT: v_readfirstlane_b32 s0, v1 94; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 95; GFX9-NEXT: v_mov_b32_e32 v1, 0 96; GFX9-NEXT: s_waitcnt lgkmcnt(0) 97; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 98; GFX9-NEXT: s_endpgm 99; 100; GFX10W64-LABEL: add_i32_constant: 101; GFX10W64: ; %bb.0: ; %entry 102; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 103; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 104; GFX10W64-NEXT: ; implicit-def: $vgpr1 105; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 106; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 107; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 108; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 109; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 110; GFX10W64-NEXT: ; %bb.1: 111; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 112; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 113; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 114; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 115; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 116; GFX10W64-NEXT: buffer_atomic_add v1, off, 
s[8:11], 0 glc 117; GFX10W64-NEXT: .LBB0_2: 118; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 119; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 120; GFX10W64-NEXT: s_waitcnt vmcnt(0) 121; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 122; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 123; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 124; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 125; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 126; GFX10W64-NEXT: s_endpgm 127; 128; GFX10W32-LABEL: add_i32_constant: 129; GFX10W32: ; %bb.0: ; %entry 130; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 131; GFX10W32-NEXT: s_mov_b32 s5, exec_lo 132; GFX10W32-NEXT: ; implicit-def: $vgpr1 133; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 134; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 135; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 136; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 137; GFX10W32-NEXT: ; %bb.1: 138; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 139; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 140; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 141; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 142; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 143; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 144; GFX10W32-NEXT: .LBB0_2: 145; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 146; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 147; GFX10W32-NEXT: s_waitcnt vmcnt(0) 148; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 149; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 150; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 151; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 152; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 153; GFX10W32-NEXT: s_endpgm 154; 155; GFX11W64-LABEL: add_i32_constant: 156; GFX11W64: ; %bb.0: ; %entry 157; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 158; GFX11W64-NEXT: s_mov_b64 s[6:7], exec 159; GFX11W64-NEXT: s_mov_b64 s[4:5], exec 160; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 161; GFX11W64-NEXT: ; implicit-def: $vgpr1 162; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 163; GFX11W64-NEXT: 
v_mbcnt_hi_u32_b32 v0, s7, v0 164; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 165; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 166; GFX11W64-NEXT: ; %bb.1: 167; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 168; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 169; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 170; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 171; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 172; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 173; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 174; GFX11W64-NEXT: .LBB0_2: 175; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] 176; GFX11W64-NEXT: s_waitcnt vmcnt(0) 177; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 178; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 179; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 180; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 181; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 182; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] 183; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 184; GFX11W64-NEXT: s_endpgm 185; 186; GFX11W32-LABEL: add_i32_constant: 187; GFX11W32: ; %bb.0: ; %entry 188; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 189; GFX11W32-NEXT: s_mov_b32 s5, exec_lo 190; GFX11W32-NEXT: s_mov_b32 s4, exec_lo 191; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 192; GFX11W32-NEXT: ; implicit-def: $vgpr1 193; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 194; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 195; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 196; GFX11W32-NEXT: ; %bb.1: 197; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 198; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 199; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 200; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 201; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 202; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 203; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 204; GFX11W32-NEXT: .LBB0_2: 205; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 206; GFX11W32-NEXT: s_waitcnt vmcnt(0) 207; 
GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 208; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 209; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 210; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 211; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 212; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] 213; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 214; GFX11W32-NEXT: s_endpgm 215entry: 216 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) 217 store i32 %old, i32 addrspace(1)* %out 218 ret void 219} 220 221define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) { 222; GFX6-LABEL: add_i32_uniform: 223; GFX6: ; %bb.0: ; %entry 224; GFX6-NEXT: s_mov_b64 s[2:3], exec 225; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 226; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 227; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 228; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 229; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 230; GFX6-NEXT: ; implicit-def: $vgpr1 231; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 232; GFX6-NEXT: s_cbranch_execz .LBB1_2 233; GFX6-NEXT: ; %bb.1: 234; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd 235; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 236; GFX6-NEXT: s_waitcnt lgkmcnt(0) 237; GFX6-NEXT: s_mul_i32 s0, s8, s0 238; GFX6-NEXT: v_mov_b32_e32 v1, s0 239; GFX6-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 240; GFX6-NEXT: .LBB1_2: 241; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 242; GFX6-NEXT: s_waitcnt vmcnt(0) 243; GFX6-NEXT: v_readfirstlane_b32 s0, v1 244; GFX6-NEXT: s_waitcnt lgkmcnt(0) 245; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 246; GFX6-NEXT: s_mov_b32 s7, 0xf000 247; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 248; GFX6-NEXT: s_mov_b32 s6, -1 249; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 250; GFX6-NEXT: s_endpgm 251; 252; GFX8-LABEL: add_i32_uniform: 253; GFX8: ; %bb.0: ; %entry 254; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 255; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 256; 
GFX8-NEXT: s_mov_b64 s[4:5], exec 257; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 258; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 259; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 260; GFX8-NEXT: ; implicit-def: $vgpr1 261; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 262; GFX8-NEXT: s_cbranch_execz .LBB1_2 263; GFX8-NEXT: ; %bb.1: 264; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 265; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 266; GFX8-NEXT: s_waitcnt lgkmcnt(0) 267; GFX8-NEXT: s_mul_i32 s0, s8, s0 268; GFX8-NEXT: v_mov_b32_e32 v1, s0 269; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 270; GFX8-NEXT: .LBB1_2: 271; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 272; GFX8-NEXT: s_waitcnt lgkmcnt(0) 273; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 274; GFX8-NEXT: s_waitcnt vmcnt(0) 275; GFX8-NEXT: v_readfirstlane_b32 s0, v1 276; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 277; GFX8-NEXT: v_mov_b32_e32 v0, s2 278; GFX8-NEXT: v_mov_b32_e32 v1, s3 279; GFX8-NEXT: flat_store_dword v[0:1], v2 280; GFX8-NEXT: s_endpgm 281; 282; GFX9-LABEL: add_i32_uniform: 283; GFX9: ; %bb.0: ; %entry 284; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 285; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 286; GFX9-NEXT: s_mov_b64 s[4:5], exec 287; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 288; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 289; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 290; GFX9-NEXT: ; implicit-def: $vgpr1 291; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 292; GFX9-NEXT: s_cbranch_execz .LBB1_2 293; GFX9-NEXT: ; %bb.1: 294; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 295; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 296; GFX9-NEXT: s_waitcnt lgkmcnt(0) 297; GFX9-NEXT: s_mul_i32 s0, s8, s0 298; GFX9-NEXT: v_mov_b32_e32 v1, s0 299; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 300; GFX9-NEXT: .LBB1_2: 301; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] 302; GFX9-NEXT: s_waitcnt lgkmcnt(0) 303; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 304; GFX9-NEXT: s_waitcnt vmcnt(0) 305; GFX9-NEXT: v_readfirstlane_b32 s0, v1 306; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 307; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 308; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 309; GFX9-NEXT: s_endpgm 310; 311; GFX10W64-LABEL: add_i32_uniform: 312; GFX10W64: ; %bb.0: ; %entry 313; GFX10W64-NEXT: s_clause 0x1 314; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 315; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 316; GFX10W64-NEXT: s_mov_b64 s[4:5], exec 317; GFX10W64-NEXT: ; implicit-def: $vgpr1 318; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 319; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 320; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 321; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc 322; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 323; GFX10W64-NEXT: ; %bb.1: 324; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 325; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 326; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 327; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 328; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 329; GFX10W64-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 330; GFX10W64-NEXT: .LBB1_2: 331; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 332; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] 333; GFX10W64-NEXT: s_waitcnt vmcnt(0) 334; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 335; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 336; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1] 337; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 338; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 339; GFX10W64-NEXT: s_endpgm 340; 341; GFX10W32-LABEL: add_i32_uniform: 342; GFX10W32: ; %bb.0: ; %entry 343; GFX10W32-NEXT: s_clause 0x1 344; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 345; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 346; GFX10W32-NEXT: s_mov_b32 s6, exec_lo 347; GFX10W32-NEXT: ; implicit-def: $vgpr1 348; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 349; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 350; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo 351; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 352; GFX10W32-NEXT: ; %bb.1: 353; GFX10W32-NEXT: s_load_dwordx4 
s[8:11], s[0:1], 0x34 354; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 355; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 356; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 357; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 358; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 359; GFX10W32-NEXT: .LBB1_2: 360; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 361; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 362; GFX10W32-NEXT: s_waitcnt vmcnt(0) 363; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 364; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 365; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1] 366; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 367; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 368; GFX10W32-NEXT: s_endpgm 369; 370; GFX11W64-LABEL: add_i32_uniform: 371; GFX11W64: ; %bb.0: ; %entry 372; GFX11W64-NEXT: s_clause 0x1 373; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 374; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 375; GFX11W64-NEXT: s_mov_b64 s[4:5], exec 376; GFX11W64-NEXT: s_mov_b64 s[6:7], exec 377; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 378; GFX11W64-NEXT: ; implicit-def: $vgpr1 379; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 380; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 381; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 382; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 383; GFX11W64-NEXT: ; %bb.1: 384; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 385; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 386; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 387; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 388; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 389; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 390; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc 391; GFX11W64-NEXT: .LBB1_2: 392; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7] 393; GFX11W64-NEXT: s_waitcnt vmcnt(0) 394; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 395; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 396; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) 397; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1] 398; 
GFX11W64-NEXT: v_mov_b32_e32 v0, 0 399; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3] 400; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 401; GFX11W64-NEXT: s_endpgm 402; 403; GFX11W32-LABEL: add_i32_uniform: 404; GFX11W32: ; %bb.0: ; %entry 405; GFX11W32-NEXT: s_clause 0x1 406; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 407; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 408; GFX11W32-NEXT: s_mov_b32 s6, exec_lo 409; GFX11W32-NEXT: s_mov_b32 s5, exec_lo 410; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 411; GFX11W32-NEXT: ; implicit-def: $vgpr1 412; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 413; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 414; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 415; GFX11W32-NEXT: ; %bb.1: 416; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 417; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6 418; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 419; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 420; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 421; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 422; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 423; GFX11W32-NEXT: .LBB1_2: 424; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 425; GFX11W32-NEXT: s_waitcnt vmcnt(0) 426; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 427; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 428; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 429; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1] 430; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 431; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] 432; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 433; GFX11W32-NEXT: s_endpgm 434entry: 435 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0) 436 store i32 %old, i32 addrspace(1)* %out 437 ret void 438} 439 440define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { 441; GFX6-LABEL: add_i32_varying_vdata: 442; GFX6: ; %bb.0: ; %entry 443; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 444; GFX6-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 445; GFX6-NEXT: s_waitcnt lgkmcnt(0) 446; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 447; GFX6-NEXT: s_mov_b32 s3, 0xf000 448; GFX6-NEXT: s_mov_b32 s2, -1 449; GFX6-NEXT: s_waitcnt vmcnt(0) 450; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 451; GFX6-NEXT: s_endpgm 452; 453; GFX8-LABEL: add_i32_varying_vdata: 454; GFX8: ; %bb.0: ; %entry 455; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 456; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 457; GFX8-NEXT: v_mov_b32_e32 v1, 0 458; GFX8-NEXT: s_mov_b64 exec, s[4:5] 459; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 460; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 461; GFX8-NEXT: v_mov_b32_e32 v2, v0 462; GFX8-NEXT: s_not_b64 exec, exec 463; GFX8-NEXT: v_mov_b32_e32 v2, 0 464; GFX8-NEXT: s_not_b64 exec, exec 465; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 466; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 467; GFX8-NEXT: s_nop 1 468; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 469; GFX8-NEXT: s_nop 1 470; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 471; GFX8-NEXT: s_nop 1 472; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 473; GFX8-NEXT: s_nop 1 474; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 475; GFX8-NEXT: s_nop 1 476; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 477; GFX8-NEXT: v_readlane_b32 s6, v2, 63 478; GFX8-NEXT: s_nop 0 479; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 480; GFX8-NEXT: s_mov_b64 exec, s[4:5] 481; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 482; GFX8-NEXT: ; implicit-def: $vgpr0 483; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 484; GFX8-NEXT: s_cbranch_execz .LBB2_2 485; GFX8-NEXT: ; %bb.1: 486; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 487; GFX8-NEXT: v_mov_b32_e32 v0, s6 488; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 489; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 490; GFX8-NEXT: .LBB2_2: 491; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 492; GFX8-NEXT: s_waitcnt vmcnt(0) 493; GFX8-NEXT: v_readfirstlane_b32 s0, v0 494; GFX8-NEXT: v_mov_b32_e32 v0, v1 495; GFX8-NEXT: s_waitcnt lgkmcnt(0) 496; GFX8-NEXT: v_mov_b32_e32 v4, s3 497; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 498; GFX8-NEXT: v_mov_b32_e32 v3, s2 499; GFX8-NEXT: flat_store_dword v[3:4], v0 500; GFX8-NEXT: s_endpgm 501; 502; GFX9-LABEL: add_i32_varying_vdata: 503; GFX9: ; %bb.0: ; %entry 504; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 505; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 506; GFX9-NEXT: v_mov_b32_e32 v1, 0 507; GFX9-NEXT: s_mov_b64 exec, s[4:5] 508; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 509; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 510; GFX9-NEXT: v_mov_b32_e32 v2, v0 511; GFX9-NEXT: s_not_b64 exec, exec 512; GFX9-NEXT: v_mov_b32_e32 v2, 0 513; GFX9-NEXT: s_not_b64 exec, exec 514; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 515; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 516; GFX9-NEXT: s_nop 1 517; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 518; GFX9-NEXT: s_nop 1 519; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 520; GFX9-NEXT: s_nop 1 521; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 522; GFX9-NEXT: s_nop 1 523; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 524; GFX9-NEXT: s_nop 1 525; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 526; GFX9-NEXT: v_readlane_b32 s6, v2, 63 527; GFX9-NEXT: s_nop 0 528; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 529; GFX9-NEXT: s_mov_b64 exec, s[4:5] 530; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 531; GFX9-NEXT: ; implicit-def: $vgpr0 532; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 533; GFX9-NEXT: 
s_cbranch_execz .LBB2_2 534; GFX9-NEXT: ; %bb.1: 535; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 536; GFX9-NEXT: v_mov_b32_e32 v0, s6 537; GFX9-NEXT: s_waitcnt lgkmcnt(0) 538; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 539; GFX9-NEXT: .LBB2_2: 540; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 541; GFX9-NEXT: s_waitcnt vmcnt(0) 542; GFX9-NEXT: v_readfirstlane_b32 s0, v0 543; GFX9-NEXT: v_mov_b32_e32 v0, v1 544; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 545; GFX9-NEXT: v_mov_b32_e32 v3, 0 546; GFX9-NEXT: s_waitcnt lgkmcnt(0) 547; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 548; GFX9-NEXT: s_endpgm 549; 550; GFX10W64-LABEL: add_i32_varying_vdata: 551; GFX10W64: ; %bb.0: ; %entry 552; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 553; GFX10W64-NEXT: s_not_b64 exec, exec 554; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 555; GFX10W64-NEXT: s_not_b64 exec, exec 556; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 557; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 558; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 559; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 560; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 561; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 562; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 563; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 564; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 565; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 566; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 567; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 568; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 569; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 570; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 571; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 572; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 573; GFX10W64-NEXT: 
v_readlane_b32 s7, v1, 31 574; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 575; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 576; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 577; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 578; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 579; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 580; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 581; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 582; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 583; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 584; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 585; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 586; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 587; GFX10W64-NEXT: ; implicit-def: $vgpr0 588; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 589; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 590; GFX10W64-NEXT: ; %bb.1: 591; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 592; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 593; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 594; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 595; GFX10W64-NEXT: .LBB2_2: 596; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 597; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 598; GFX10W64-NEXT: s_waitcnt vmcnt(0) 599; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 600; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 601; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 602; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 603; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 604; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 605; GFX10W64-NEXT: s_endpgm 606; 607; GFX10W32-LABEL: add_i32_varying_vdata: 608; GFX10W32: ; %bb.0: ; %entry 609; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 610; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 611; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 612; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 613; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 614; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 615; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 616; GFX10W32-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 617; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 618; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 619; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 620; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 621; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 622; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 623; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 624; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 625; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 626; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 627; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 628; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 629; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 630; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 631; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 632; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 633; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 634; GFX10W32-NEXT: ; implicit-def: $vgpr0 635; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 636; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 637; GFX10W32-NEXT: ; %bb.1: 638; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 639; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 640; GFX10W32-NEXT: s_mov_b32 s5, s6 641; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 642; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 643; GFX10W32-NEXT: .LBB2_2: 644; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 645; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 646; GFX10W32-NEXT: s_waitcnt vmcnt(0) 647; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 648; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 649; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 650; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 651; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 652; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 653; GFX10W32-NEXT: s_endpgm 654; 655; GFX11W64-LABEL: add_i32_varying_vdata: 656; GFX11W64: ; %bb.0: ; %entry 657; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 658; 
GFX11W64-NEXT: s_not_b64 exec, exec 659; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 660; GFX11W64-NEXT: s_not_b64 exec, exec 661; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 662; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 663; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 664; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 665; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 666; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 667; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 668; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 669; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 670; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 671; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 672; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 673; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 674; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 675; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 676; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 677; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 678; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) 679; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 680; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 681; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] 682; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 683; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 684; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 685; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 686; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 687; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 688; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 689; 
GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 690; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 691; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 692; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 693; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 694; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 695; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 696; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 697; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 698; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 699; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 700; GFX11W64-NEXT: ; implicit-def: $vgpr0 701; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 702; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 703; GFX11W64-NEXT: ; %bb.1: 704; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 705; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 706; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 707; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc 708; GFX11W64-NEXT: .LBB2_2: 709; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] 710; GFX11W64-NEXT: s_waitcnt vmcnt(0) 711; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 712; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 713; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 714; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 715; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 716; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 717; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] 718; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 719; GFX11W64-NEXT: s_endpgm 720; 721; GFX11W32-LABEL: add_i32_varying_vdata: 722; GFX11W32: ; %bb.0: ; %entry 723; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 724; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo 725; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 726; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo 727; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 728; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 729; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 730; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 731; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 732; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 733; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 734; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 735; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 736; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 737; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 738; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 739; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 740; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 741; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 742; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 743; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 744; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) 745; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 746; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 747; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 748; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 749; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 750; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 751; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 752; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 753; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 754; GFX11W32-NEXT: ; implicit-def: $vgpr0 755; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 756; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 757; GFX11W32-NEXT: ; %bb.1: 758; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 759; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 760; GFX11W32-NEXT: s_mov_b32 s5, s6 761; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 762; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc 763; GFX11W32-NEXT: .LBB2_2: 764; GFX11W32-NEXT: s_or_b32 exec_lo, 
exec_lo, s4
; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT:    v_mov_b32_e32 v0, v3
; GFX11W32-NEXT:    v_mov_b32_e32 v4, 0
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    global_store_b32 v4, v0, s[2:3]
; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Varying offset case: the value added is the constant 1, but the third operand
; of the intrinsic (the one that ends up as the "offen" VGPR in the expected
; output) is the per-lane workitem id, so lanes may address different locations.
; The expected output on every target is a single ordinary per-lane
; buffer_atomic_add; none of the mbcnt / exec-masking / readfirstlane sequence
; checked in the constant and uniform tests of this file appears here.
; The wave32 and wave64 runs share the common GFX10 and GFX11 check bodies.
define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: add_i32_varying_offset:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    v_mov_b32_e32 v1, 1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_varying_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX8-NEXT:    v_mov_b32_e32 v2, 1
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_atomic_add v2, v0, s[4:7], 0 offen glc
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_varying_offset:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_i32_varying_offset:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_i32_varying_offset:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT:    v_mov_b32_e32 v1, 1
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Constant operand case: every active lane subtracts the immediate 5 with all
; offset operands zero.
; Expected output: v_mbcnt counts the active lanes below each lane, and only the
; lane whose count is 0 falls into %bb.1, where it issues one buffer_atomic_sub
; of popcount(exec copy) * 5 (s_bcnt1 followed by s_mul_i32). Once exec is
; restored, the value returned by the atomic is read with v_readfirstlane and
; each lane subtracts 5 * its own mbcnt result to form the value it stores.
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: sub_i32_constant:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b64 s[2:3], exec
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT:    ; implicit-def: $vgpr1
; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX6-NEXT:    s_cbranch_execz .LBB4_2
; GFX6-NEXT:  ; %bb.1:
; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
; GFX6-NEXT:    s_mul_i32 s0, s0, 5
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX6-NEXT:  .LBB4_2:
; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i32_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT:    s_mov_b64 s[6:7], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB4_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX8-NEXT:    s_mul_i32 s0, s0, 5
; GFX8-NEXT:    v_mov_b32_e32 v1, s0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT:  .LBB4_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i32_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b64 s[6:7], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB4_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX9-NEXT:    s_mul_i32 s0, s0, 5
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT:  .LBB4_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
; GFX10W64:       ; %bb.0: ; %entry
; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
; GFX10W64-NEXT:    ; implicit-def: $vgpr1
; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX10W64-NEXT:    s_cbranch_execz .LBB4_2
; GFX10W64-NEXT:  ; %bb.1:
; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX10W64-NEXT:  .LBB4_2:
; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT:    s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
; GFX10W32:       ; %bb.0: ; %entry
; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
; GFX10W32-NEXT:    ; implicit-def: $vgpr1
; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX10W32-NEXT:    s_cbranch_execz .LBB4_2
; GFX10W32-NEXT:  ; %bb.1:
; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX10W32-NEXT:  .LBB4_2:
; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT:    s_endpgm
;
; GFX11W64-LABEL: sub_i32_constant:
; GFX11W64:       ; %bb.0: ; %entry
; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX11W64-NEXT:    ; implicit-def: $vgpr1
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT:    s_cbranch_execz .LBB4_2
; GFX11W64-NEXT:  ; %bb.1:
; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT:    s_mul_i32 s0, s0, 5
; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT:  .LBB4_2:
; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT:    global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT:    s_endpgm
;
; GFX11W32-LABEL: sub_i32_constant:
; GFX11W32:       ; %bb.0: ; %entry
; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
; GFX11W32-NEXT:    s_mov_b32 s4, exec_lo
; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX11W32-NEXT:    ; implicit-def: $vgpr1
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT:    s_cbranch_execz .LBB4_2
; GFX11W32-NEXT:  ; %bb.1:
; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s5
; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT:    s_mul_i32 s0, s0, 5
; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT:  .LBB4_2:
; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT:    s_endpgm
entry:
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Uniform operand case: the value subtracted is the kernel argument %subitive,
; which the expected output loads into an SGPR with a scalar load.
; The expected output has the same single-lane shape as sub_i32_constant: one
; lane issues a buffer_atomic_sub of (loaded argument * popcount(exec copy)),
; computed with s_mul_i32, and afterwards each lane corrects the
; v_readfirstlane result by subtracting argument * its mbcnt count using
; v_mul_lo_u32 instead of a multiply by an immediate.
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
; GFX6-LABEL: sub_i32_uniform:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b64 s[2:3], exec
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT:    ; implicit-def: $vgpr1
; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX6-NEXT:    s_cbranch_execz .LBB5_2
; GFX6-NEXT:  ; %bb.1:
; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mul_i32 s0, s8, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
; GFX6-NEXT:  .LBB5_2:
; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i32_uniform:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
; GFX8-NEXT:    s_mov_b64 s[4:5], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB5_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mul_i32 s0, s8, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s0
; GFX8-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
; GFX8-NEXT:  .LBB5_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i32_uniform:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
; GFX9-NEXT:    s_mov_b64 s[4:5], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB5_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s0, s8, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
; GFX9-NEXT:  .LBB5_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
; GFX10W64:       ; %bb.0: ; %entry
; GFX10W64-NEXT:    s_clause 0x1
; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
; GFX10W64-NEXT:    s_mov_b64 s[4:5], exec
; GFX10W64-NEXT:    ; implicit-def: $vgpr1
; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
; GFX10W64-NEXT:  ; %bb.1:
; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
; GFX10W64-NEXT:  .LBB5_2:
; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT:    s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
; GFX10W32:       ; %bb.0: ; %entry
; GFX10W32-NEXT:    s_clause 0x1
; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
; GFX10W32-NEXT:    ; implicit-def: $vgpr1
; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
; GFX10W32-NEXT:  ; %bb.1:
; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX10W32-NEXT:  .LBB5_2:
; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT:    s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
; GFX11W64:       ; %bb.0: ; %entry
; GFX11W64-NEXT:    s_clause 0x1
; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT:    s_load_b32 s8, s[0:1], 0x44
; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W64-NEXT:    ; implicit-def: $vgpr1
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT:    s_cbranch_execz .LBB5_2
; GFX11W64-NEXT:  ; %bb.1:
; GFX11W64-NEXT:    s_load_b128 s[12:15], s[0:1], 0x34
; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT:    s_mul_i32 s0, s8, s0
; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc
; GFX11W64-NEXT:  .LBB5_2:
; GFX11W64-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX11W64-NEXT:    global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT:    s_endpgm
;
; GFX11W32-LABEL: sub_i32_uniform:
; GFX11W32:       ; %bb.0: ; %entry
; GFX11W32-NEXT:    s_clause 0x1
; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT:    s_load_b32 s4, s[0:1], 0x44
; GFX11W32-NEXT:    s_mov_b32 s6, exec_lo
; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX11W32-NEXT:    ; implicit-def: $vgpr1
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT:    s_cbranch_execz .LBB5_2
; GFX11W32-NEXT:  ; %bb.1:
; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s6
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    s_mul_i32 s0, s4, s0
; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT:  .LBB5_2:
; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    v_mul_lo_u32 v0, s4, v0
; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT:    s_endpgm
entry:
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: sub_i32_varying_vdata:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i32_varying_vdata:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec,
s[4:5] 1328; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1329; GFX8-NEXT: ; implicit-def: $vgpr0 1330; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1331; GFX8-NEXT: s_cbranch_execz .LBB6_2 1332; GFX8-NEXT: ; %bb.1: 1333; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1334; GFX8-NEXT: v_mov_b32_e32 v0, s6 1335; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1336; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1337; GFX8-NEXT: .LBB6_2: 1338; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1339; GFX8-NEXT: s_waitcnt vmcnt(0) 1340; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1341; GFX8-NEXT: v_mov_b32_e32 v0, v1 1342; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1343; GFX8-NEXT: v_mov_b32_e32 v4, s3 1344; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1345; GFX8-NEXT: v_mov_b32_e32 v3, s2 1346; GFX8-NEXT: flat_store_dword v[3:4], v0 1347; GFX8-NEXT: s_endpgm 1348; 1349; GFX9-LABEL: sub_i32_varying_vdata: 1350; GFX9: ; %bb.0: ; %entry 1351; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1352; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1353; GFX9-NEXT: v_mov_b32_e32 v1, 0 1354; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1355; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1356; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1357; GFX9-NEXT: v_mov_b32_e32 v2, v0 1358; GFX9-NEXT: s_not_b64 exec, exec 1359; GFX9-NEXT: v_mov_b32_e32 v2, 0 1360; GFX9-NEXT: s_not_b64 exec, exec 1361; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1362; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1363; GFX9-NEXT: s_nop 1 1364; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1365; GFX9-NEXT: s_nop 1 1366; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1367; GFX9-NEXT: s_nop 1 1368; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1369; GFX9-NEXT: s_nop 1 1370; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1371; GFX9-NEXT: s_nop 1 1372; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 
row_mask:0xc bank_mask:0xf 1373; GFX9-NEXT: v_readlane_b32 s6, v2, 63 1374; GFX9-NEXT: s_nop 0 1375; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1376; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1377; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1378; GFX9-NEXT: ; implicit-def: $vgpr0 1379; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1380; GFX9-NEXT: s_cbranch_execz .LBB6_2 1381; GFX9-NEXT: ; %bb.1: 1382; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1383; GFX9-NEXT: v_mov_b32_e32 v0, s6 1384; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1385; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1386; GFX9-NEXT: .LBB6_2: 1387; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1388; GFX9-NEXT: s_waitcnt vmcnt(0) 1389; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1390; GFX9-NEXT: v_mov_b32_e32 v0, v1 1391; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1392; GFX9-NEXT: v_mov_b32_e32 v3, 0 1393; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1394; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 1395; GFX9-NEXT: s_endpgm 1396; 1397; GFX10W64-LABEL: sub_i32_varying_vdata: 1398; GFX10W64: ; %bb.0: ; %entry 1399; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 1400; GFX10W64-NEXT: s_not_b64 exec, exec 1401; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1402; GFX10W64-NEXT: s_not_b64 exec, exec 1403; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 1404; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1405; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 1406; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1407; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1408; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1409; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 1410; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1411; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1412; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 1413; GFX10W64-NEXT: v_mov_b32_e32 v2, 
s4 1414; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1415; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 1416; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1417; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 1418; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1419; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1420; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 1421; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 1422; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1423; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1424; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1425; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 1426; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 1427; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 1428; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1429; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1430; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1431; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 1432; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1433; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1434; GFX10W64-NEXT: ; implicit-def: $vgpr0 1435; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1436; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 1437; GFX10W64-NEXT: ; %bb.1: 1438; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1439; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 1440; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1441; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1442; GFX10W64-NEXT: .LBB6_2: 1443; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1444; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 1445; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1446; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 1447; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 1448; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 1449; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1450; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1451; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 1452; GFX10W64-NEXT: s_endpgm 1453; 1454; GFX10W32-LABEL: sub_i32_varying_vdata: 1455; GFX10W32: ; %bb.0: ; %entry 1456; 
GFX10W32-NEXT: v_mov_b32_e32 v1, v0 1457; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 1458; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1459; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 1460; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 1461; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1462; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1463; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1464; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1465; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 1466; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1467; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 1468; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1469; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 1470; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1471; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 1472; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 1473; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1474; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 1475; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 1476; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1477; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 1478; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 1479; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 1480; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1481; GFX10W32-NEXT: ; implicit-def: $vgpr0 1482; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1483; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 1484; GFX10W32-NEXT: ; %bb.1: 1485; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1486; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 1487; GFX10W32-NEXT: s_mov_b32 s5, s6 1488; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1489; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1490; GFX10W32-NEXT: .LBB6_2: 1491; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1492; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1493; 
GFX10W32-NEXT: s_waitcnt vmcnt(0) 1494; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 1495; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 1496; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 1497; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1498; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1499; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 1500; GFX10W32-NEXT: s_endpgm 1501; 1502; GFX11W64-LABEL: sub_i32_varying_vdata: 1503; GFX11W64: ; %bb.0: ; %entry 1504; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 1505; GFX11W64-NEXT: s_not_b64 exec, exec 1506; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1507; GFX11W64-NEXT: s_not_b64 exec, exec 1508; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 1509; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1510; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1511; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 1512; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1513; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1514; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1515; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1516; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1517; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 1518; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1519; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1520; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1521; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 1522; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1523; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 1524; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1525; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) 1526; GFX11W64-NEXT: 
v_readlane_b32 s6, v1, 15 1527; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1528; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] 1529; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1530; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1531; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 1532; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 1533; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 1534; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1535; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1536; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1537; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 1538; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 1539; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 1540; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 1541; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 1542; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1543; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1544; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 1545; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] 1546; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1547; GFX11W64-NEXT: ; implicit-def: $vgpr0 1548; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1549; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 1550; GFX11W64-NEXT: ; %bb.1: 1551; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 1552; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 1553; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1554; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc 1555; GFX11W64-NEXT: .LBB6_2: 1556; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] 1557; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1558; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 1559; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 1560; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 1561; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1562; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1563; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1564; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] 1565; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1566; GFX11W64-NEXT: s_endpgm 1567; 
1568; GFX11W32-LABEL: sub_i32_varying_vdata: 1569; GFX11W32: ; %bb.0: ; %entry 1570; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 1571; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo 1572; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 1573; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo 1574; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 1575; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1576; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1577; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1578; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1579; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1580; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1581; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1582; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 1583; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1584; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 1585; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 1586; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 1587; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1588; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1589; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 1590; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 1591; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) 1592; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1593; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 1594; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 1595; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1596; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 1597; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1598; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 
1599; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 1600; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1601; GFX11W32-NEXT: ; implicit-def: $vgpr0 1602; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1603; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 1604; GFX11W32-NEXT: ; %bb.1: 1605; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 1606; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 1607; GFX11W32-NEXT: s_mov_b32 s5, s6 1608; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1609; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc 1610; GFX11W32-NEXT: .LBB6_2: 1611; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1612; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1613; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 1614; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 1615; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 1616; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1617; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1618; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1619; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] 1620; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1621; GFX11W32-NEXT: s_endpgm 1622entry: 1623 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1624 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) 1625 store i32 %old, i32 addrspace(1)* %out 1626 ret void 1627} 1628 1629define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 1630; GFX6-LABEL: sub_i32_varying_offset: 1631; GFX6: ; %bb.0: ; %entry 1632; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1633; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1634; GFX6-NEXT: v_mov_b32_e32 v1, 1 1635; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1636; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 1637; GFX6-NEXT: s_mov_b32 s3, 0xf000 1638; GFX6-NEXT: s_mov_b32 s2, -1 1639; GFX6-NEXT: s_waitcnt vmcnt(0) 1640; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 1641; GFX6-NEXT: s_endpgm 1642; 1643; GFX8-LABEL: sub_i32_varying_offset: 1644; GFX8: ; %bb.0: ; %entry 1645; GFX8-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x34 1646; GFX8-NEXT: v_mov_b32_e32 v2, 1 1647; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1648; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1649; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc 1650; GFX8-NEXT: v_mov_b32_e32 v0, s0 1651; GFX8-NEXT: v_mov_b32_e32 v1, s1 1652; GFX8-NEXT: s_waitcnt vmcnt(0) 1653; GFX8-NEXT: flat_store_dword v[0:1], v2 1654; GFX8-NEXT: s_endpgm 1655; 1656; GFX9-LABEL: sub_i32_varying_offset: 1657; GFX9: ; %bb.0: ; %entry 1658; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1659; GFX9-NEXT: v_mov_b32_e32 v1, 1 1660; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1661; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1662; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 1663; GFX9-NEXT: v_mov_b32_e32 v0, 0 1664; GFX9-NEXT: s_waitcnt vmcnt(0) 1665; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1666; GFX9-NEXT: s_endpgm 1667; 1668; GFX10-LABEL: sub_i32_varying_offset: 1669; GFX10: ; %bb.0: ; %entry 1670; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1671; GFX10-NEXT: v_mov_b32_e32 v1, 1 1672; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1673; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1674; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 1675; GFX10-NEXT: v_mov_b32_e32 v0, 0 1676; GFX10-NEXT: s_waitcnt vmcnt(0) 1677; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1678; GFX10-NEXT: s_endpgm 1679; 1680; GFX11-LABEL: sub_i32_varying_offset: 1681; GFX11: ; %bb.0: ; %entry 1682; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 1683; GFX11-NEXT: v_mov_b32_e32 v1, 1 1684; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1685; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1686; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc 1687; GFX11-NEXT: v_mov_b32_e32 v0, 0 1688; GFX11-NEXT: s_waitcnt vmcnt(0) 1689; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1690; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1691; GFX11-NEXT: s_endpgm 1692entry: 1693 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1694 %old = call i32 
@llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) 1695 store i32 %old, i32 addrspace(1)* %out 1696 ret void 1697} 1698
; NOTE(review) on @sub_i32_varying_offset above -- the data operand of the
; atomic is the constant 1 and only the third intrinsic operand (%lane, the
; workitem id) differs between lanes. Every check prefix for this function
; (GFX6 through GFX11) expects one buffer_atomic_sub (buffer_atomic_sub_u32 on
; GFX11) using v0 with "offen glc", followed by a plain store of the returned
; value, and no v_mbcnt / s_and_saveexec / v_readfirstlane sequence around it.
; In other words the expected output leaves this atomic as a per-lane
; operation, in contrast to the DPP scan and single-lane atomic expected for
; @sub_i32_varying_vdata. Presumably this is because each lane may address a
; different buffer location -- confirm against the atomic optimizer pass.
; The assertions are autogenerated (see the NOTE on the first line of the
; file) and should be regenerated with utils/update_llc_test_checks.py rather
; than edited by hand.