1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s 3; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32) 10declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32) 11 12; Show what the atomic optimization pass will do for struct buffers. 
13 14define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { 15; GFX6-LABEL: add_i32_constant: 16; GFX6: ; %bb.0: ; %entry 17; GFX6-NEXT: s_mov_b64 s[6:7], exec 18; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 19; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 20; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 21; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 22; GFX6-NEXT: ; implicit-def: $vgpr1 23; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc 24; GFX6-NEXT: s_cbranch_execz .LBB0_2 25; GFX6-NEXT: ; %bb.1: 26; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 27; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 28; GFX6-NEXT: s_mul_i32 s0, s0, 5 29; GFX6-NEXT: v_mov_b32_e32 v1, s0 30; GFX6-NEXT: v_mov_b32_e32 v2, 0 31; GFX6-NEXT: s_waitcnt lgkmcnt(0) 32; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 33; GFX6-NEXT: .LBB0_2: 34; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] 35; GFX6-NEXT: s_waitcnt vmcnt(0) 36; GFX6-NEXT: v_readfirstlane_b32 s0, v1 37; GFX6-NEXT: s_mov_b32 s7, 0xf000 38; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0 39; GFX6-NEXT: s_mov_b32 s6, -1 40; GFX6-NEXT: s_waitcnt lgkmcnt(0) 41; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 42; GFX6-NEXT: s_endpgm 43; 44; GFX8-LABEL: add_i32_constant: 45; GFX8: ; %bb.0: ; %entry 46; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 47; GFX8-NEXT: s_mov_b64 s[6:7], exec 48; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 49; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 50; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 51; GFX8-NEXT: ; implicit-def: $vgpr1 52; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 53; GFX8-NEXT: s_cbranch_execz .LBB0_2 54; GFX8-NEXT: ; %bb.1: 55; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 56; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 57; GFX8-NEXT: s_mul_i32 s0, s0, 5 58; GFX8-NEXT: v_mov_b32_e32 v1, s0 59; GFX8-NEXT: v_mov_b32_e32 v2, 0 60; GFX8-NEXT: s_waitcnt lgkmcnt(0) 61; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 62; GFX8-NEXT: .LBB0_2: 63; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 64; 
GFX8-NEXT: s_waitcnt vmcnt(0) 65; GFX8-NEXT: v_readfirstlane_b32 s0, v1 66; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0 67; GFX8-NEXT: s_waitcnt lgkmcnt(0) 68; GFX8-NEXT: v_mov_b32_e32 v0, s2 69; GFX8-NEXT: v_mov_b32_e32 v1, s3 70; GFX8-NEXT: flat_store_dword v[0:1], v2 71; GFX8-NEXT: s_endpgm 72; 73; GFX9-LABEL: add_i32_constant: 74; GFX9: ; %bb.0: ; %entry 75; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 76; GFX9-NEXT: s_mov_b64 s[6:7], exec 77; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 78; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 79; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 80; GFX9-NEXT: ; implicit-def: $vgpr1 81; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 82; GFX9-NEXT: s_cbranch_execz .LBB0_2 83; GFX9-NEXT: ; %bb.1: 84; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 85; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 86; GFX9-NEXT: s_mul_i32 s0, s0, 5 87; GFX9-NEXT: v_mov_b32_e32 v1, s0 88; GFX9-NEXT: v_mov_b32_e32 v2, 0 89; GFX9-NEXT: s_waitcnt lgkmcnt(0) 90; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 91; GFX9-NEXT: .LBB0_2: 92; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 93; GFX9-NEXT: s_waitcnt vmcnt(0) 94; GFX9-NEXT: v_readfirstlane_b32 s0, v1 95; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 96; GFX9-NEXT: v_mov_b32_e32 v1, 0 97; GFX9-NEXT: s_waitcnt lgkmcnt(0) 98; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 99; GFX9-NEXT: s_endpgm 100; 101; GFX10W64-LABEL: add_i32_constant: 102; GFX10W64: ; %bb.0: ; %entry 103; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 104; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 105; GFX10W64-NEXT: ; implicit-def: $vgpr1 106; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 107; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 108; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 109; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 110; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 111; GFX10W64-NEXT: ; %bb.1: 112; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 113; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 114; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 115; 
GFX10W64-NEXT: s_mul_i32 s0, s0, 5 116; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 117; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 118; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 119; GFX10W64-NEXT: .LBB0_2: 120; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 121; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 122; GFX10W64-NEXT: s_waitcnt vmcnt(0) 123; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 124; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 125; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 126; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 127; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 128; GFX10W64-NEXT: s_endpgm 129; 130; GFX10W32-LABEL: add_i32_constant: 131; GFX10W32: ; %bb.0: ; %entry 132; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 133; GFX10W32-NEXT: s_mov_b32 s5, exec_lo 134; GFX10W32-NEXT: ; implicit-def: $vgpr1 135; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 136; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 137; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 138; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 139; GFX10W32-NEXT: ; %bb.1: 140; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 141; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 142; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 143; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 144; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 145; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 146; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 147; GFX10W32-NEXT: .LBB0_2: 148; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 149; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 150; GFX10W32-NEXT: s_waitcnt vmcnt(0) 151; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 152; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 153; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 154; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 155; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 156; GFX10W32-NEXT: s_endpgm 157entry: 158 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) 159 store i32 %old, i32 addrspace(1)* %out 160 ret void 161} 162 163define 
amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) { 164; GFX6-LABEL: add_i32_uniform: 165; GFX6: ; %bb.0: ; %entry 166; GFX6-NEXT: s_mov_b64 s[2:3], exec 167; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 168; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 169; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 170; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 171; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 172; GFX6-NEXT: ; implicit-def: $vgpr1 173; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 174; GFX6-NEXT: s_cbranch_execz .LBB1_2 175; GFX6-NEXT: ; %bb.1: 176; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd 177; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 178; GFX6-NEXT: s_waitcnt lgkmcnt(0) 179; GFX6-NEXT: s_mul_i32 s0, s8, s0 180; GFX6-NEXT: v_mov_b32_e32 v1, s0 181; GFX6-NEXT: v_mov_b32_e32 v2, 0 182; GFX6-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc 183; GFX6-NEXT: .LBB1_2: 184; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 185; GFX6-NEXT: s_waitcnt vmcnt(0) 186; GFX6-NEXT: v_readfirstlane_b32 s0, v1 187; GFX6-NEXT: s_waitcnt lgkmcnt(0) 188; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 189; GFX6-NEXT: s_mov_b32 s7, 0xf000 190; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 191; GFX6-NEXT: s_mov_b32 s6, -1 192; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 193; GFX6-NEXT: s_endpgm 194; 195; GFX8-LABEL: add_i32_uniform: 196; GFX8: ; %bb.0: ; %entry 197; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 198; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 199; GFX8-NEXT: s_mov_b64 s[6:7], exec 200; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 201; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 202; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 203; GFX8-NEXT: ; implicit-def: $vgpr1 204; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 205; GFX8-NEXT: s_cbranch_execz .LBB1_2 206; GFX8-NEXT: ; %bb.1: 207; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 208; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 209; GFX8-NEXT: s_waitcnt lgkmcnt(0) 210; GFX8-NEXT: s_mul_i32 s0, s8, s0 211; GFX8-NEXT: 
v_mov_b32_e32 v1, s0 212; GFX8-NEXT: v_mov_b32_e32 v2, 0 213; GFX8-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc 214; GFX8-NEXT: .LBB1_2: 215; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 216; GFX8-NEXT: s_waitcnt lgkmcnt(0) 217; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 218; GFX8-NEXT: s_waitcnt vmcnt(0) 219; GFX8-NEXT: v_readfirstlane_b32 s0, v1 220; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 221; GFX8-NEXT: v_mov_b32_e32 v0, s2 222; GFX8-NEXT: v_mov_b32_e32 v1, s3 223; GFX8-NEXT: flat_store_dword v[0:1], v2 224; GFX8-NEXT: s_endpgm 225; 226; GFX9-LABEL: add_i32_uniform: 227; GFX9: ; %bb.0: ; %entry 228; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 229; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 230; GFX9-NEXT: s_mov_b64 s[6:7], exec 231; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 232; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 233; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 234; GFX9-NEXT: ; implicit-def: $vgpr1 235; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 236; GFX9-NEXT: s_cbranch_execz .LBB1_2 237; GFX9-NEXT: ; %bb.1: 238; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 239; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 240; GFX9-NEXT: s_waitcnt lgkmcnt(0) 241; GFX9-NEXT: s_mul_i32 s0, s8, s0 242; GFX9-NEXT: v_mov_b32_e32 v1, s0 243; GFX9-NEXT: v_mov_b32_e32 v2, 0 244; GFX9-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc 245; GFX9-NEXT: .LBB1_2: 246; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 247; GFX9-NEXT: s_waitcnt lgkmcnt(0) 248; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 249; GFX9-NEXT: s_waitcnt vmcnt(0) 250; GFX9-NEXT: v_readfirstlane_b32 s0, v1 251; GFX9-NEXT: v_mov_b32_e32 v1, 0 252; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 253; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 254; GFX9-NEXT: s_endpgm 255; 256; GFX10W64-LABEL: add_i32_uniform: 257; GFX10W64: ; %bb.0: ; %entry 258; GFX10W64-NEXT: s_clause 0x1 259; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 260; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 261; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 262; GFX10W64-NEXT: ; 
implicit-def: $vgpr1 263; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 264; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 265; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 266; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 267; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 268; GFX10W64-NEXT: ; %bb.1: 269; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 270; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 271; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 272; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 273; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 274; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 275; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc 276; GFX10W64-NEXT: .LBB1_2: 277; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 278; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 279; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 280; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 281; GFX10W64-NEXT: s_waitcnt vmcnt(0) 282; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 283; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 284; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 285; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 286; GFX10W64-NEXT: s_endpgm 287; 288; GFX10W32-LABEL: add_i32_uniform: 289; GFX10W32: ; %bb.0: ; %entry 290; GFX10W32-NEXT: s_clause 0x1 291; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 292; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 293; GFX10W32-NEXT: s_mov_b32 s6, exec_lo 294; GFX10W32-NEXT: ; implicit-def: $vgpr1 295; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 296; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 297; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo 298; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 299; GFX10W32-NEXT: ; %bb.1: 300; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 301; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 302; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 303; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 304; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 305; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 306; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 307; GFX10W32-NEXT: .LBB1_2: 308; GFX10W32-NEXT: 
s_waitcnt_depctr 0xffe3 309; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 310; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 311; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 312; GFX10W32-NEXT: s_waitcnt vmcnt(0) 313; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 314; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 315; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 316; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 317; GFX10W32-NEXT: s_endpgm 318entry: 319 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) 320 store i32 %old, i32 addrspace(1)* %out 321 ret void 322} 323 324define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { 325; GFX6-LABEL: add_i32_varying_vdata: 326; GFX6: ; %bb.0: ; %entry 327; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 328; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 329; GFX6-NEXT: v_mov_b32_e32 v1, 0 330; GFX6-NEXT: s_waitcnt lgkmcnt(0) 331; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc 332; GFX6-NEXT: s_mov_b32 s3, 0xf000 333; GFX6-NEXT: s_mov_b32 s2, -1 334; GFX6-NEXT: s_waitcnt vmcnt(0) 335; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 336; GFX6-NEXT: s_endpgm 337; 338; GFX8-LABEL: add_i32_varying_vdata: 339; GFX8: ; %bb.0: ; %entry 340; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 341; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 342; GFX8-NEXT: v_mov_b32_e32 v1, 0 343; GFX8-NEXT: s_mov_b64 exec, s[4:5] 344; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 345; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 346; GFX8-NEXT: v_mov_b32_e32 v2, v0 347; GFX8-NEXT: s_not_b64 exec, exec 348; GFX8-NEXT: v_mov_b32_e32 v2, 0 349; GFX8-NEXT: s_not_b64 exec, exec 350; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 351; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 352; GFX8-NEXT: s_nop 1 353; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 354; GFX8-NEXT: s_nop 1 355; GFX8-NEXT: 
v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 356; GFX8-NEXT: s_nop 1 357; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 358; GFX8-NEXT: s_nop 1 359; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 360; GFX8-NEXT: s_nop 1 361; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 362; GFX8-NEXT: v_readlane_b32 s6, v2, 63 363; GFX8-NEXT: s_nop 0 364; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 365; GFX8-NEXT: s_mov_b64 exec, s[4:5] 366; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 367; GFX8-NEXT: ; implicit-def: $vgpr0 368; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 369; GFX8-NEXT: s_cbranch_execz .LBB2_2 370; GFX8-NEXT: ; %bb.1: 371; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 372; GFX8-NEXT: v_mov_b32_e32 v0, s6 373; GFX8-NEXT: v_mov_b32_e32 v3, 0 374; GFX8-NEXT: s_waitcnt lgkmcnt(0) 375; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc 376; GFX8-NEXT: .LBB2_2: 377; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 378; GFX8-NEXT: s_waitcnt vmcnt(0) 379; GFX8-NEXT: v_readfirstlane_b32 s0, v0 380; GFX8-NEXT: v_mov_b32_e32 v0, v1 381; GFX8-NEXT: s_waitcnt lgkmcnt(0) 382; GFX8-NEXT: v_mov_b32_e32 v4, s3 383; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 384; GFX8-NEXT: v_mov_b32_e32 v3, s2 385; GFX8-NEXT: flat_store_dword v[3:4], v0 386; GFX8-NEXT: s_endpgm 387; 388; GFX9-LABEL: add_i32_varying_vdata: 389; GFX9: ; %bb.0: ; %entry 390; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 391; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 392; GFX9-NEXT: v_mov_b32_e32 v1, 0 393; GFX9-NEXT: s_mov_b64 exec, s[4:5] 394; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 395; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 396; GFX9-NEXT: v_mov_b32_e32 v2, v0 397; GFX9-NEXT: s_not_b64 exec, exec 398; GFX9-NEXT: v_mov_b32_e32 v2, 0 399; GFX9-NEXT: s_not_b64 exec, exec 400; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 401; GFX9-NEXT: v_add_u32_dpp 
v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 402; GFX9-NEXT: s_nop 1 403; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 404; GFX9-NEXT: s_nop 1 405; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 406; GFX9-NEXT: s_nop 1 407; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 408; GFX9-NEXT: s_nop 1 409; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 410; GFX9-NEXT: s_nop 1 411; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 412; GFX9-NEXT: v_readlane_b32 s6, v2, 63 413; GFX9-NEXT: s_nop 0 414; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 415; GFX9-NEXT: s_mov_b64 exec, s[4:5] 416; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 417; GFX9-NEXT: ; implicit-def: $vgpr0 418; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 419; GFX9-NEXT: s_cbranch_execz .LBB2_2 420; GFX9-NEXT: ; %bb.1: 421; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 422; GFX9-NEXT: v_mov_b32_e32 v0, s6 423; GFX9-NEXT: v_mov_b32_e32 v3, 0 424; GFX9-NEXT: s_waitcnt lgkmcnt(0) 425; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc 426; GFX9-NEXT: .LBB2_2: 427; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 428; GFX9-NEXT: s_waitcnt vmcnt(0) 429; GFX9-NEXT: v_readfirstlane_b32 s0, v0 430; GFX9-NEXT: v_mov_b32_e32 v0, v1 431; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 432; GFX9-NEXT: v_mov_b32_e32 v3, 0 433; GFX9-NEXT: s_waitcnt lgkmcnt(0) 434; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 435; GFX9-NEXT: s_endpgm 436; 437; GFX10W64-LABEL: add_i32_varying_vdata: 438; GFX10W64: ; %bb.0: ; %entry 439; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 440; GFX10W64-NEXT: s_not_b64 exec, exec 441; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 442; GFX10W64-NEXT: s_not_b64 exec, exec 443; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 444; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 445; 
GFX10W64-NEXT: v_mov_b32_e32 v3, 0 446; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 447; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 448; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 449; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 450; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 451; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 452; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 453; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 454; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 455; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 456; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 457; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 458; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 459; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 460; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 461; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 462; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 463; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 464; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 465; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 466; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 467; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 468; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 469; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 470; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 471; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 472; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 473; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 474; GFX10W64-NEXT: ; implicit-def: $vgpr0 475; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 476; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 477; GFX10W64-NEXT: ; %bb.1: 478; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 479; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 480; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 481; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 482; 
GFX10W64-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc 483; GFX10W64-NEXT: .LBB2_2: 484; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 485; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 486; GFX10W64-NEXT: s_waitcnt vmcnt(0) 487; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 488; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 489; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 490; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 491; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 492; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 493; GFX10W64-NEXT: s_endpgm 494; 495; GFX10W32-LABEL: add_i32_varying_vdata: 496; GFX10W32: ; %bb.0: ; %entry 497; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 498; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 499; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 500; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 501; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 502; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 503; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 504; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 505; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 506; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 507; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 508; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 509; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 510; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 511; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 512; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 513; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 514; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 515; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 516; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 517; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 518; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 519; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 520; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 521; 
GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 522; GFX10W32-NEXT: ; implicit-def: $vgpr0 523; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 524; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 525; GFX10W32-NEXT: ; %bb.1: 526; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 527; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 528; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 529; GFX10W32-NEXT: s_mov_b32 s5, s6 530; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 531; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc 532; GFX10W32-NEXT: .LBB2_2: 533; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 534; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 535; GFX10W32-NEXT: s_waitcnt vmcnt(0) 536; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 537; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 538; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 539; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 540; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 541; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 542; GFX10W32-NEXT: s_endpgm 543entry: 544 %lane = call i32 @llvm.amdgcn.workitem.id.x() 545 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) 546 store i32 %old, i32 addrspace(1)* %out 547 ret void 548} 549 550define amdgpu_kernel void @add_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) { 551; GFX6-LABEL: add_i32_varying_vindex: 552; GFX6: ; %bb.0: ; %entry 553; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 554; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 555; GFX6-NEXT: v_mov_b32_e32 v1, 1 556; GFX6-NEXT: s_waitcnt lgkmcnt(0) 557; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc 558; GFX6-NEXT: s_mov_b32 s3, 0xf000 559; GFX6-NEXT: s_mov_b32 s2, -1 560; GFX6-NEXT: s_waitcnt vmcnt(0) 561; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 562; GFX6-NEXT: s_endpgm 563; 564; GFX8-LABEL: add_i32_varying_vindex: 565; GFX8: ; %bb.0: ; %entry 566; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 567; GFX8-NEXT: v_mov_b32_e32 v2, 1 568; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
569; GFX8-NEXT: s_waitcnt lgkmcnt(0) 570; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 idxen glc 571; GFX8-NEXT: v_mov_b32_e32 v0, s0 572; GFX8-NEXT: v_mov_b32_e32 v1, s1 573; GFX8-NEXT: s_waitcnt vmcnt(0) 574; GFX8-NEXT: flat_store_dword v[0:1], v2 575; GFX8-NEXT: s_endpgm 576; 577; GFX9-LABEL: add_i32_varying_vindex: 578; GFX9: ; %bb.0: ; %entry 579; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 580; GFX9-NEXT: v_mov_b32_e32 v1, 1 581; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 582; GFX9-NEXT: s_waitcnt lgkmcnt(0) 583; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc 584; GFX9-NEXT: v_mov_b32_e32 v0, 0 585; GFX9-NEXT: s_waitcnt vmcnt(0) 586; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 587; GFX9-NEXT: s_endpgm 588; 589; GFX10-LABEL: add_i32_varying_vindex: 590; GFX10: ; %bb.0: ; %entry 591; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 592; GFX10-NEXT: v_mov_b32_e32 v1, 1 593; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 594; GFX10-NEXT: s_waitcnt lgkmcnt(0) 595; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc 596; GFX10-NEXT: v_mov_b32_e32 v0, 0 597; GFX10-NEXT: s_waitcnt vmcnt(0) 598; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 599; GFX10-NEXT: s_endpgm 600entry: 601 %lane = call i32 @llvm.amdgcn.workitem.id.x() 602 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0) 603 store i32 %old, i32 addrspace(1)* %out 604 ret void 605} 606 607define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 608; GFX6-LABEL: add_i32_varying_offset: 609; GFX6: ; %bb.0: ; %entry 610; GFX6-NEXT: v_mov_b32_e32 v1, v0 611; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 612; GFX6-NEXT: s_mov_b32 s2, 0 613; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 614; GFX6-NEXT: v_mov_b32_e32 v0, s2 615; GFX6-NEXT: v_mov_b32_e32 v2, 1 616; GFX6-NEXT: s_waitcnt lgkmcnt(0) 617; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc 618; GFX6-NEXT: s_mov_b32 s3, 
0xf000 619; GFX6-NEXT: s_mov_b32 s2, -1 620; GFX6-NEXT: s_waitcnt vmcnt(0) 621; GFX6-NEXT: buffer_store_dword v2, off, s[0:3], 0 622; GFX6-NEXT: s_endpgm 623; 624; GFX8-LABEL: add_i32_varying_offset: 625; GFX8: ; %bb.0: ; %entry 626; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 627; GFX8-NEXT: s_mov_b32 s2, 0 628; GFX8-NEXT: v_mov_b32_e32 v1, v0 629; GFX8-NEXT: v_mov_b32_e32 v0, s2 630; GFX8-NEXT: v_mov_b32_e32 v2, 1 631; GFX8-NEXT: s_waitcnt lgkmcnt(0) 632; GFX8-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc 633; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 634; GFX8-NEXT: s_waitcnt lgkmcnt(0) 635; GFX8-NEXT: v_mov_b32_e32 v0, s0 636; GFX8-NEXT: v_mov_b32_e32 v1, s1 637; GFX8-NEXT: s_waitcnt vmcnt(0) 638; GFX8-NEXT: flat_store_dword v[0:1], v2 639; GFX8-NEXT: s_endpgm 640; 641; GFX9-LABEL: add_i32_varying_offset: 642; GFX9: ; %bb.0: ; %entry 643; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 644; GFX9-NEXT: s_mov_b32 s2, 0 645; GFX9-NEXT: v_mov_b32_e32 v1, v0 646; GFX9-NEXT: v_mov_b32_e32 v0, s2 647; GFX9-NEXT: v_mov_b32_e32 v2, 1 648; GFX9-NEXT: s_waitcnt lgkmcnt(0) 649; GFX9-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc 650; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 651; GFX9-NEXT: v_mov_b32_e32 v0, 0 652; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 653; GFX9-NEXT: global_store_dword v0, v2, s[0:1] 654; GFX9-NEXT: s_endpgm 655; 656; GFX10-LABEL: add_i32_varying_offset: 657; GFX10: ; %bb.0: ; %entry 658; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 659; GFX10-NEXT: s_mov_b32 s2, 0 660; GFX10-NEXT: v_mov_b32_e32 v1, v0 661; GFX10-NEXT: v_mov_b32_e32 v0, s2 662; GFX10-NEXT: v_mov_b32_e32 v2, 1 663; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 664; GFX10-NEXT: s_waitcnt lgkmcnt(0) 665; GFX10-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc 666; GFX10-NEXT: v_mov_b32_e32 v0, 0 667; GFX10-NEXT: s_waitcnt vmcnt(0) 668; GFX10-NEXT: global_store_dword v0, v2, s[0:1] 669; GFX10-NEXT: s_endpgm 670entry: 671 %lane = 
call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Struct-buffer atomic sub whose data operand is the constant 5 and whose
; remaining integer operands are all zero. The checks below show a single
; buffer_atomic_sub guarded by an mbcnt == 0 test, with the constant scaled
; by the active-lane count (s_bcnt1 followed by s_mul_i32 by 5). Afterwards
; each lane derives its own result from the readfirstlane value minus
; 5 * mbcnt.
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: sub_i32_constant:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b64 s[6:7], exec
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT:    ; implicit-def: $vgpr1
; GFX6-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX6-NEXT:    s_cbranch_execz .LBB5_2
; GFX6-NEXT:  ; %bb.1:
; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX6-NEXT:    s_mul_i32 s0, s0, 5
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    v_mov_b32_e32 v2, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX6-NEXT:  .LBB5_2:
; GFX6-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i32_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT:    s_mov_b64 s[6:7], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB5_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX8-NEXT:    s_mul_i32 s0, s0, 5
; GFX8-NEXT:    v_mov_b32_e32 v1, s0
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT:  .LBB5_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i32_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b64 s[6:7], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB5_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX9-NEXT:    s_mul_i32 s0, s0, 5
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT:  .LBB5_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
; GFX10W64:       ; %bb.0: ; %entry
; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
; GFX10W64-NEXT:    ; implicit-def: $vgpr1
; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
; GFX10W64-NEXT:  ; %bb.1:
; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX10W64-NEXT:  .LBB5_2:
; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT:    s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
; GFX10W32:       ; %bb.0: ; %entry
; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
; GFX10W32-NEXT:    ; implicit-def: $vgpr1
; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
; GFX10W32-NEXT:  ; %bb.1:
; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX10W32-NEXT:  .LBB5_2:
; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT:    s_endpgm
entry:
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Struct-buffer atomic sub whose data operand is the kernel argument
; %subitive, with the remaining integer operands all zero. The checks have
; the same shape as the constant case, except that the loaded argument is
; multiplied by the active-lane count (s_mul_i32) before the atomic and by
; the per-lane mbcnt (v_mul_lo_u32) when rebuilding each lane's result.
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
; GFX6-LABEL: sub_i32_uniform:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b64 s[2:3], exec
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT:    ; implicit-def: $vgpr1
; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX6-NEXT:    s_cbranch_execz .LBB6_2
; GFX6-NEXT:  ; %bb.1:
; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mul_i32 s0, s8, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    v_mov_b32_e32 v2, 0
; GFX6-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
; GFX6-NEXT:  .LBB6_2:
; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i32_uniform:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
; GFX8-NEXT:    s_mov_b64 s[6:7], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB6_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mul_i32 s0, s8, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s0
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
; GFX8-NEXT:  .LBB6_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i32_uniform:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
; GFX9-NEXT:    s_mov_b64 s[6:7], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB6_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s0, s8, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
; GFX9-NEXT:  .LBB6_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
; GFX10W64:       ; %bb.0: ; %entry
; GFX10W64-NEXT:    s_clause 0x1
; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
; GFX10W64-NEXT:    ; implicit-def: $vgpr1
; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX10W64-NEXT:    s_cbranch_execz .LBB6_2
; GFX10W64-NEXT:  ; %bb.1:
; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX10W64-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
; GFX10W64-NEXT:  .LBB6_2:
; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT:    s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
; GFX10W32:       ; %bb.0: ; %entry
; GFX10W32-NEXT:    s_clause 0x1
; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
; GFX10W32-NEXT:    ; implicit-def: $vgpr1
; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
; GFX10W32-NEXT:    s_cbranch_execz .LBB6_2
; GFX10W32-NEXT:  ; %bb.1:
; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10W32-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX10W32-NEXT:  .LBB6_2:
; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT:    s_endpgm
entry:
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Struct-buffer atomic sub whose data operand is the workitem id, so it
; differs per lane. The GFX6 checks show the atomic emitted directly with no
; lane reduction. The GFX8, GFX9 and GFX10 checks show a DPP add scan whose
; total feeds a single buffer_atomic_sub guarded by an mbcnt == 0 test, after
; which each lane subtracts its shifted scan value from the readfirstlane
; result.
define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: sub_i32_varying_vdata:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_atomic_sub v0, v1, s[4:7], 0 idxen glc
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i32_varying_vdata:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB7_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mov_b32_e32 v3, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc
; GFX8-NEXT:  .LBB7_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v4, s3
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v3, s2
; GFX8-NEXT:    flat_store_dword v[3:4], v0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i32_varying_vdata:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB7_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc
; GFX9-NEXT:  .LBB7_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10W64-LABEL: sub_i32_varying_vdata:
; GFX10W64:       ; %bb.0: ; %entry
; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
; GFX10W64-NEXT:    s_not_b64 exec, exec
; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W64-NEXT:    s_not_b64 exec, exec
; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT:    ; implicit-def: $vgpr0
; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX10W64-NEXT:    s_cbranch_execz .LBB7_2
; GFX10W64-NEXT:  ; %bb.1:
; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc
; GFX10W64-NEXT:  .LBB7_2:
; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
; GFX10W64-NEXT:    s_endpgm
;
; GFX10W32-LABEL: sub_i32_varying_vdata:
; GFX10W32:       ; %bb.0: ; %entry
; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT:    ; implicit-def: $vgpr0
; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX10W32-NEXT:    s_cbranch_execz .LBB7_2
; GFX10W32-NEXT:  ; %bb.1:
; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
; GFX10W32-NEXT:    s_mov_b32 s5, s6
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc
; GFX10W32-NEXT:  .LBB7_2:
; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
; GFX10W32-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Struct-buffer atomic sub with constant data 1 where the workitem id is
; passed in the third operand (the varying vindex of the test name). The
; checks for every target show one plain buffer_atomic_sub using the lane's
; v0 as the index register, with no lane-count scaling and no exec masking.
define amdgpu_kernel void @sub_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: sub_i32_varying_vindex:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    v_mov_b32_e32 v1, 1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i32_varying_vindex:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX8-NEXT:    v_mov_b32_e32 v2, 1
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_atomic_sub v2, v0, s[4:7], 0 idxen glc
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i32_varying_vindex:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: sub_i32_varying_vindex:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Struct-buffer atomic sub with constant data 1 where the workitem id is
; passed in the fourth operand (the varying offset of the test name). The
; checks for every target show one plain buffer_atomic_sub using idxen offen
; addressing on a v[0:1] pair, with no lane-count scaling and no exec
; masking.
define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: sub_i32_varying_offset:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    v_mov_b32_e32 v1, v0
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 s2, 0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v2, 1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i32_varying_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX8-NEXT:    s_mov_b32 s2, 0
; GFX8-NEXT:    v_mov_b32_e32 v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v2, 1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i32_varying_offset:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s2, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v2, 1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: sub_i32_varying_offset:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX10-NEXT:    s_mov_b32 s2, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v2, 1
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
; GFX10-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}