1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s 3; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32) 10declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32) 11 12; Show what the atomic optimization pass will do for struct buffers. 
; add_i32_constant - struct buffer atomic add whose first (data) operand is
; the constant 5; every other i32 operand of the intrinsic call is 0.
; What the generated checks below show, for every RUN configuration:
;   * a lane count is formed with v_mbcnt from a saved copy of exec, and the
;     atomic is issued from a block entered only when that count equals 0
;     (v_cmp_eq_u32 against 0 followed by s_and_saveexec);
;   * the data sent to buffer_atomic_add is s_bcnt1 of the saved exec mask
;     multiplied by 5 (s_mul_i32);
;   * after the branch rejoins, the stored value is the v_readfirstlane of the
;     atomic result plus 5 times the lane count (v_mad_u32_u24).
; The wave32 gfx1010 run uses the 32-bit forms (exec_lo, s_bcnt1_i32_b32 and
; only v_mbcnt_lo).
; The assertions are autogenerated - regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
13 14define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { 15; GFX6-LABEL: add_i32_constant: 16; GFX6: ; %bb.0: ; %entry 17; GFX6-NEXT: s_mov_b64 s[6:7], exec 18; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 19; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 20; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 21; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 22; GFX6-NEXT: ; implicit-def: $vgpr1 23; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc 24; GFX6-NEXT: s_cbranch_execz .LBB0_2 25; GFX6-NEXT: ; %bb.1: 26; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 27; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 28; GFX6-NEXT: s_mul_i32 s0, s0, 5 29; GFX6-NEXT: v_mov_b32_e32 v1, s0 30; GFX6-NEXT: v_mov_b32_e32 v2, 0 31; GFX6-NEXT: s_waitcnt lgkmcnt(0) 32; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 33; GFX6-NEXT: .LBB0_2: 34; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] 35; GFX6-NEXT: s_waitcnt vmcnt(0) 36; GFX6-NEXT: v_readfirstlane_b32 s0, v1 37; GFX6-NEXT: s_mov_b32 s7, 0xf000 38; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0 39; GFX6-NEXT: s_mov_b32 s6, -1 40; GFX6-NEXT: s_waitcnt lgkmcnt(0) 41; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 42; GFX6-NEXT: s_endpgm 43; 44; GFX8-LABEL: add_i32_constant: 45; GFX8: ; %bb.0: ; %entry 46; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 47; GFX8-NEXT: s_mov_b64 s[6:7], exec 48; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 49; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 50; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 51; GFX8-NEXT: ; implicit-def: $vgpr1 52; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 53; GFX8-NEXT: s_cbranch_execz .LBB0_2 54; GFX8-NEXT: ; %bb.1: 55; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 56; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 57; GFX8-NEXT: s_mul_i32 s0, s0, 5 58; GFX8-NEXT: v_mov_b32_e32 v1, s0 59; GFX8-NEXT: v_mov_b32_e32 v2, 0 60; GFX8-NEXT: s_waitcnt lgkmcnt(0) 61; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 62; GFX8-NEXT: .LBB0_2: 63; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 64; 
GFX8-NEXT: s_waitcnt vmcnt(0) 65; GFX8-NEXT: v_readfirstlane_b32 s0, v1 66; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0 67; GFX8-NEXT: s_waitcnt lgkmcnt(0) 68; GFX8-NEXT: v_mov_b32_e32 v0, s2 69; GFX8-NEXT: v_mov_b32_e32 v1, s3 70; GFX8-NEXT: flat_store_dword v[0:1], v2 71; GFX8-NEXT: s_endpgm 72; 73; GFX9-LABEL: add_i32_constant: 74; GFX9: ; %bb.0: ; %entry 75; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 76; GFX9-NEXT: s_mov_b64 s[6:7], exec 77; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 78; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 79; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 80; GFX9-NEXT: ; implicit-def: $vgpr1 81; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 82; GFX9-NEXT: s_cbranch_execz .LBB0_2 83; GFX9-NEXT: ; %bb.1: 84; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 85; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 86; GFX9-NEXT: s_mul_i32 s0, s0, 5 87; GFX9-NEXT: v_mov_b32_e32 v1, s0 88; GFX9-NEXT: v_mov_b32_e32 v2, 0 89; GFX9-NEXT: s_waitcnt lgkmcnt(0) 90; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 91; GFX9-NEXT: .LBB0_2: 92; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 93; GFX9-NEXT: s_waitcnt vmcnt(0) 94; GFX9-NEXT: v_readfirstlane_b32 s0, v1 95; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 96; GFX9-NEXT: v_mov_b32_e32 v1, 0 97; GFX9-NEXT: s_waitcnt lgkmcnt(0) 98; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 99; GFX9-NEXT: s_endpgm 100; 101; GFX10W64-LABEL: add_i32_constant: 102; GFX10W64: ; %bb.0: ; %entry 103; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 104; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 105; GFX10W64-NEXT: ; implicit-def: $vgpr1 106; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 107; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 108; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 109; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 110; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 111; GFX10W64-NEXT: ; %bb.1: 112; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 113; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 114; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 115; 
GFX10W64-NEXT: s_mul_i32 s0, s0, 5 116; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 117; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 118; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 119; GFX10W64-NEXT: .LBB0_2: 120; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 121; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 122; GFX10W64-NEXT: s_waitcnt vmcnt(0) 123; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 124; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 125; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 126; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 127; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 128; GFX10W64-NEXT: s_endpgm 129; 130; GFX10W32-LABEL: add_i32_constant: 131; GFX10W32: ; %bb.0: ; %entry 132; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 133; GFX10W32-NEXT: s_mov_b32 s5, exec_lo 134; GFX10W32-NEXT: ; implicit-def: $vgpr1 135; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 136; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 137; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 138; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 139; GFX10W32-NEXT: ; %bb.1: 140; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 141; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 142; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 143; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 144; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 145; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 146; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 147; GFX10W32-NEXT: .LBB0_2: 148; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 149; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 150; GFX10W32-NEXT: s_waitcnt vmcnt(0) 151; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 152; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 153; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 154; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 155; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 156; GFX10W32-NEXT: s_endpgm 157entry: 158 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) 159 store i32 %old, i32 addrspace(1)* %out 160 ret void 161} 162 163define 
; add_i32_uniform - struct buffer atomic add whose first (data) operand is
; the kernel argument %additive; every other i32 operand of the call is 0.
; What the generated checks below show, for every RUN configuration:
;   * an extra scalar is fetched with s_load_dword (presumably %additive -
;     it is the only additional kernel argument of this function);
;   * the atomic is again issued only from the block guarded by the
;     "v_mbcnt lane count equals 0" compare, and its data is that scalar
;     multiplied by s_bcnt1 of the saved exec mask (s_mul_i32);
;   * the stored value is the v_readfirstlane of the atomic result plus the
;     scalar multiplied by the lane count - v_mul_lo_u32 followed by an add
;     on the first three runs, a single v_mad_u64_u32 on the gfx1010 runs.
; The assertions are autogenerated - regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) { 164; GFX6-LABEL: add_i32_uniform: 165; GFX6: ; %bb.0: ; %entry 166; GFX6-NEXT: s_mov_b64 s[2:3], exec 167; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 168; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 169; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 170; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 171; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 172; GFX6-NEXT: ; implicit-def: $vgpr1 173; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 174; GFX6-NEXT: s_cbranch_execz .LBB1_2 175; GFX6-NEXT: ; %bb.1: 176; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd 177; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 178; GFX6-NEXT: s_waitcnt lgkmcnt(0) 179; GFX6-NEXT: s_mul_i32 s0, s8, s0 180; GFX6-NEXT: v_mov_b32_e32 v1, s0 181; GFX6-NEXT: v_mov_b32_e32 v2, 0 182; GFX6-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc 183; GFX6-NEXT: .LBB1_2: 184; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 185; GFX6-NEXT: s_waitcnt vmcnt(0) 186; GFX6-NEXT: v_readfirstlane_b32 s0, v1 187; GFX6-NEXT: s_waitcnt lgkmcnt(0) 188; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 189; GFX6-NEXT: s_mov_b32 s7, 0xf000 190; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 191; GFX6-NEXT: s_mov_b32 s6, -1 192; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 193; GFX6-NEXT: s_endpgm 194; 195; GFX8-LABEL: add_i32_uniform: 196; GFX8: ; %bb.0: ; %entry 197; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 198; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 199; GFX8-NEXT: s_mov_b64 s[6:7], exec 200; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 201; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 202; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 203; GFX8-NEXT: ; implicit-def: $vgpr1 204; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 205; GFX8-NEXT: s_cbranch_execz .LBB1_2 206; GFX8-NEXT: ; %bb.1: 207; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 208; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 209; GFX8-NEXT: s_waitcnt lgkmcnt(0) 210; GFX8-NEXT: s_mul_i32 s0, s8, s0 211; GFX8-NEXT: 
v_mov_b32_e32 v1, s0 212; GFX8-NEXT: v_mov_b32_e32 v2, 0 213; GFX8-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc 214; GFX8-NEXT: .LBB1_2: 215; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 216; GFX8-NEXT: s_waitcnt lgkmcnt(0) 217; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 218; GFX8-NEXT: s_waitcnt vmcnt(0) 219; GFX8-NEXT: v_readfirstlane_b32 s0, v1 220; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 221; GFX8-NEXT: v_mov_b32_e32 v0, s2 222; GFX8-NEXT: v_mov_b32_e32 v1, s3 223; GFX8-NEXT: flat_store_dword v[0:1], v2 224; GFX8-NEXT: s_endpgm 225; 226; GFX9-LABEL: add_i32_uniform: 227; GFX9: ; %bb.0: ; %entry 228; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 229; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 230; GFX9-NEXT: s_mov_b64 s[6:7], exec 231; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 232; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 233; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 234; GFX9-NEXT: ; implicit-def: $vgpr1 235; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 236; GFX9-NEXT: s_cbranch_execz .LBB1_2 237; GFX9-NEXT: ; %bb.1: 238; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 239; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 240; GFX9-NEXT: s_waitcnt lgkmcnt(0) 241; GFX9-NEXT: s_mul_i32 s0, s8, s0 242; GFX9-NEXT: v_mov_b32_e32 v1, s0 243; GFX9-NEXT: v_mov_b32_e32 v2, 0 244; GFX9-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc 245; GFX9-NEXT: .LBB1_2: 246; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 247; GFX9-NEXT: s_waitcnt lgkmcnt(0) 248; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 249; GFX9-NEXT: s_waitcnt vmcnt(0) 250; GFX9-NEXT: v_readfirstlane_b32 s0, v1 251; GFX9-NEXT: v_mov_b32_e32 v1, 0 252; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 253; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 254; GFX9-NEXT: s_endpgm 255; 256; GFX10W64-LABEL: add_i32_uniform: 257; GFX10W64: ; %bb.0: ; %entry 258; GFX10W64-NEXT: s_clause 0x1 259; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 260; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 261; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 262; GFX10W64-NEXT: ; 
implicit-def: $vgpr1 263; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 264; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 265; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 266; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 267; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 268; GFX10W64-NEXT: ; %bb.1: 269; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 270; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 271; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 272; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 273; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 274; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 275; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc 276; GFX10W64-NEXT: .LBB1_2: 277; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 278; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 279; GFX10W64-NEXT: s_waitcnt vmcnt(0) 280; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 281; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 282; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], null, s8, v0, s[0:1] 283; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 284; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 285; GFX10W64-NEXT: s_endpgm 286; 287; GFX10W32-LABEL: add_i32_uniform: 288; GFX10W32: ; %bb.0: ; %entry 289; GFX10W32-NEXT: s_clause 0x1 290; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 291; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 292; GFX10W32-NEXT: s_mov_b32 s6, exec_lo 293; GFX10W32-NEXT: ; implicit-def: $vgpr1 294; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 295; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 296; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo 297; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 298; GFX10W32-NEXT: ; %bb.1: 299; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 300; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 301; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 302; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 303; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 304; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 305; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 306; GFX10W32-NEXT: .LBB1_2: 307; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 308; 
GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 309; GFX10W32-NEXT: s_waitcnt vmcnt(0) 310; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 311; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 312; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], null, s4, v0, s[0:1] 313; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 314; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 315; GFX10W32-NEXT: s_endpgm 316entry: 317 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) 318 store i32 %old, i32 addrspace(1)* %out 319 ret void 320} 321 322define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { 323; GFX6-LABEL: add_i32_varying_vdata: 324; GFX6: ; %bb.0: ; %entry 325; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 326; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 327; GFX6-NEXT: v_mov_b32_e32 v1, 0 328; GFX6-NEXT: s_waitcnt lgkmcnt(0) 329; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc 330; GFX6-NEXT: s_mov_b32 s3, 0xf000 331; GFX6-NEXT: s_mov_b32 s2, -1 332; GFX6-NEXT: s_waitcnt vmcnt(0) 333; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 334; GFX6-NEXT: s_endpgm 335; 336; GFX8-LABEL: add_i32_varying_vdata: 337; GFX8: ; %bb.0: ; %entry 338; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 339; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 340; GFX8-NEXT: v_mov_b32_e32 v1, 0 341; GFX8-NEXT: s_mov_b64 exec, s[4:5] 342; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 343; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 344; GFX8-NEXT: v_mov_b32_e32 v2, v0 345; GFX8-NEXT: s_not_b64 exec, exec 346; GFX8-NEXT: v_mov_b32_e32 v2, 0 347; GFX8-NEXT: s_not_b64 exec, exec 348; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 349; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 350; GFX8-NEXT: s_nop 1 351; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 352; GFX8-NEXT: s_nop 1 353; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 354; GFX8-NEXT: s_nop 1 355; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 356; GFX8-NEXT: s_nop 1 357; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 358; GFX8-NEXT: s_nop 1 359; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 360; GFX8-NEXT: v_readlane_b32 s6, v2, 63 361; GFX8-NEXT: s_nop 0 362; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 363; GFX8-NEXT: s_mov_b64 exec, s[4:5] 364; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 365; GFX8-NEXT: ; implicit-def: $vgpr0 366; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 367; GFX8-NEXT: s_cbranch_execz .LBB2_2 368; GFX8-NEXT: ; %bb.1: 369; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 370; GFX8-NEXT: v_mov_b32_e32 v0, s6 371; GFX8-NEXT: v_mov_b32_e32 v3, 0 372; GFX8-NEXT: s_waitcnt lgkmcnt(0) 373; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc 374; GFX8-NEXT: .LBB2_2: 375; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 376; GFX8-NEXT: s_waitcnt vmcnt(0) 377; GFX8-NEXT: v_readfirstlane_b32 s0, v0 378; GFX8-NEXT: v_mov_b32_e32 v0, v1 379; GFX8-NEXT: s_waitcnt lgkmcnt(0) 380; GFX8-NEXT: v_mov_b32_e32 v4, s3 381; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 382; GFX8-NEXT: v_mov_b32_e32 v3, s2 383; GFX8-NEXT: flat_store_dword v[3:4], v0 384; GFX8-NEXT: s_endpgm 385; 386; GFX9-LABEL: add_i32_varying_vdata: 387; GFX9: ; %bb.0: ; %entry 388; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 389; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 390; GFX9-NEXT: v_mov_b32_e32 v1, 0 391; GFX9-NEXT: s_mov_b64 exec, s[4:5] 392; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 393; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 394; GFX9-NEXT: v_mov_b32_e32 v2, v0 395; GFX9-NEXT: s_not_b64 exec, exec 396; GFX9-NEXT: v_mov_b32_e32 v2, 0 397; GFX9-NEXT: s_not_b64 exec, exec 398; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 399; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 400; 
GFX9-NEXT: s_nop 1 401; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 402; GFX9-NEXT: s_nop 1 403; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 404; GFX9-NEXT: s_nop 1 405; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 406; GFX9-NEXT: s_nop 1 407; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 408; GFX9-NEXT: s_nop 1 409; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 410; GFX9-NEXT: v_readlane_b32 s6, v2, 63 411; GFX9-NEXT: s_nop 0 412; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 413; GFX9-NEXT: s_mov_b64 exec, s[4:5] 414; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 415; GFX9-NEXT: ; implicit-def: $vgpr0 416; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 417; GFX9-NEXT: s_cbranch_execz .LBB2_2 418; GFX9-NEXT: ; %bb.1: 419; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 420; GFX9-NEXT: v_mov_b32_e32 v0, s6 421; GFX9-NEXT: v_mov_b32_e32 v3, 0 422; GFX9-NEXT: s_waitcnt lgkmcnt(0) 423; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc 424; GFX9-NEXT: .LBB2_2: 425; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 426; GFX9-NEXT: s_waitcnt vmcnt(0) 427; GFX9-NEXT: v_readfirstlane_b32 s0, v0 428; GFX9-NEXT: v_mov_b32_e32 v0, v1 429; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 430; GFX9-NEXT: v_mov_b32_e32 v3, 0 431; GFX9-NEXT: s_waitcnt lgkmcnt(0) 432; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 433; GFX9-NEXT: s_endpgm 434; 435; GFX10W64-LABEL: add_i32_varying_vdata: 436; GFX10W64: ; %bb.0: ; %entry 437; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 438; GFX10W64-NEXT: s_not_b64 exec, exec 439; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 440; GFX10W64-NEXT: s_not_b64 exec, exec 441; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 442; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 443; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 444; GFX10W64-NEXT: v_add_nc_u32_dpp v1, 
v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 445; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 446; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 447; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 448; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 449; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 450; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 451; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 452; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 453; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 454; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 455; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 456; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 457; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 458; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 459; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 460; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 461; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 462; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 463; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 464; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 465; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 466; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 467; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 468; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 469; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 470; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 471; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 472; GFX10W64-NEXT: ; implicit-def: $vgpr0 473; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 474; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 475; GFX10W64-NEXT: ; %bb.1: 476; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 477; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 478; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 479; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 480; GFX10W64-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc 481; GFX10W64-NEXT: 
.LBB2_2: 482; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 483; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 484; GFX10W64-NEXT: s_waitcnt vmcnt(0) 485; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 486; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 487; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 488; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 489; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 490; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 491; GFX10W64-NEXT: s_endpgm 492; 493; GFX10W32-LABEL: add_i32_varying_vdata: 494; GFX10W32: ; %bb.0: ; %entry 495; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 496; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 497; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 498; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 499; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 500; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 501; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 502; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 503; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 504; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 505; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 506; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 507; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 508; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 509; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 510; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 511; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 512; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 513; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 514; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 515; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 516; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 517; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 518; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 519; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 520; GFX10W32-NEXT: ; implicit-def: $vgpr0 
521; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 522; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 523; GFX10W32-NEXT: ; %bb.1: 524; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 525; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 526; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 527; GFX10W32-NEXT: s_mov_b32 s5, s6 528; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 529; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc 530; GFX10W32-NEXT: .LBB2_2: 531; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 532; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 533; GFX10W32-NEXT: s_waitcnt vmcnt(0) 534; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 535; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 536; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 537; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 538; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 539; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 540; GFX10W32-NEXT: s_endpgm 541entry: 542 %lane = call i32 @llvm.amdgcn.workitem.id.x() 543 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) 544 store i32 %old, i32 addrspace(1)* %out 545 ret void 546} 547 548define amdgpu_kernel void @add_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) { 549; GFX6-LABEL: add_i32_varying_vindex: 550; GFX6: ; %bb.0: ; %entry 551; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 552; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 553; GFX6-NEXT: v_mov_b32_e32 v1, 1 554; GFX6-NEXT: s_waitcnt lgkmcnt(0) 555; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc 556; GFX6-NEXT: s_mov_b32 s3, 0xf000 557; GFX6-NEXT: s_mov_b32 s2, -1 558; GFX6-NEXT: s_waitcnt vmcnt(0) 559; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 560; GFX6-NEXT: s_endpgm 561; 562; GFX8-LABEL: add_i32_varying_vindex: 563; GFX8: ; %bb.0: ; %entry 564; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 565; GFX8-NEXT: v_mov_b32_e32 v2, 1 566; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 567; GFX8-NEXT: s_waitcnt lgkmcnt(0) 568; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 
idxen glc 569; GFX8-NEXT: v_mov_b32_e32 v0, s0 570; GFX8-NEXT: v_mov_b32_e32 v1, s1 571; GFX8-NEXT: s_waitcnt vmcnt(0) 572; GFX8-NEXT: flat_store_dword v[0:1], v2 573; GFX8-NEXT: s_endpgm 574; 575; GFX9-LABEL: add_i32_varying_vindex: 576; GFX9: ; %bb.0: ; %entry 577; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 578; GFX9-NEXT: v_mov_b32_e32 v1, 1 579; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 580; GFX9-NEXT: s_waitcnt lgkmcnt(0) 581; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc 582; GFX9-NEXT: v_mov_b32_e32 v0, 0 583; GFX9-NEXT: s_waitcnt vmcnt(0) 584; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 585; GFX9-NEXT: s_endpgm 586; 587; GFX10-LABEL: add_i32_varying_vindex: 588; GFX10: ; %bb.0: ; %entry 589; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 590; GFX10-NEXT: v_mov_b32_e32 v1, 1 591; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 592; GFX10-NEXT: s_waitcnt lgkmcnt(0) 593; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc 594; GFX10-NEXT: v_mov_b32_e32 v0, 0 595; GFX10-NEXT: s_waitcnt vmcnt(0) 596; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 597; GFX10-NEXT: s_endpgm 598entry: 599 %lane = call i32 @llvm.amdgcn.workitem.id.x() 600 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0) 601 store i32 %old, i32 addrspace(1)* %out 602 ret void 603} 604 605define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 606; GFX6-LABEL: add_i32_varying_offset: 607; GFX6: ; %bb.0: ; %entry 608; GFX6-NEXT: v_mov_b32_e32 v1, v0 609; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 610; GFX6-NEXT: s_mov_b32 s2, 0 611; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 612; GFX6-NEXT: v_mov_b32_e32 v0, s2 613; GFX6-NEXT: v_mov_b32_e32 v2, 1 614; GFX6-NEXT: s_waitcnt lgkmcnt(0) 615; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc 616; GFX6-NEXT: s_mov_b32 s3, 0xf000 617; GFX6-NEXT: s_mov_b32 s2, -1 618; GFX6-NEXT: s_waitcnt vmcnt(0) 619; GFX6-NEXT: 
buffer_store_dword v2, off, s[0:3], 0 620; GFX6-NEXT: s_endpgm 621; 622; GFX8-LABEL: add_i32_varying_offset: 623; GFX8: ; %bb.0: ; %entry 624; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 625; GFX8-NEXT: s_mov_b32 s2, 0 626; GFX8-NEXT: v_mov_b32_e32 v1, v0 627; GFX8-NEXT: v_mov_b32_e32 v0, s2 628; GFX8-NEXT: v_mov_b32_e32 v2, 1 629; GFX8-NEXT: s_waitcnt lgkmcnt(0) 630; GFX8-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc 631; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 632; GFX8-NEXT: s_waitcnt lgkmcnt(0) 633; GFX8-NEXT: v_mov_b32_e32 v0, s0 634; GFX8-NEXT: v_mov_b32_e32 v1, s1 635; GFX8-NEXT: s_waitcnt vmcnt(0) 636; GFX8-NEXT: flat_store_dword v[0:1], v2 637; GFX8-NEXT: s_endpgm 638; 639; GFX9-LABEL: add_i32_varying_offset: 640; GFX9: ; %bb.0: ; %entry 641; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 642; GFX9-NEXT: s_mov_b32 s2, 0 643; GFX9-NEXT: v_mov_b32_e32 v1, v0 644; GFX9-NEXT: v_mov_b32_e32 v0, s2 645; GFX9-NEXT: v_mov_b32_e32 v2, 1 646; GFX9-NEXT: s_waitcnt lgkmcnt(0) 647; GFX9-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc 648; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 649; GFX9-NEXT: v_mov_b32_e32 v0, 0 650; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 651; GFX9-NEXT: global_store_dword v0, v2, s[0:1] 652; GFX9-NEXT: s_endpgm 653; 654; GFX10-LABEL: add_i32_varying_offset: 655; GFX10: ; %bb.0: ; %entry 656; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 657; GFX10-NEXT: s_mov_b32 s2, 0 658; GFX10-NEXT: v_mov_b32_e32 v1, v0 659; GFX10-NEXT: v_mov_b32_e32 v0, s2 660; GFX10-NEXT: v_mov_b32_e32 v2, 1 661; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 662; GFX10-NEXT: s_waitcnt lgkmcnt(0) 663; GFX10-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc 664; GFX10-NEXT: v_mov_b32_e32 v0, 0 665; GFX10-NEXT: s_waitcnt vmcnt(0) 666; GFX10-NEXT: global_store_dword v0, v2, s[0:1] 667; GFX10-NEXT: s_endpgm 668entry: 669 %lane = call i32 @llvm.amdgcn.workitem.id.x() 670 %old = call i32 
@llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0) 671 store i32 %old, i32 addrspace(1)* %out 672 ret void 673} 674 675define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { 676; GFX6-LABEL: sub_i32_constant: 677; GFX6: ; %bb.0: ; %entry 678; GFX6-NEXT: s_mov_b64 s[6:7], exec 679; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 680; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 681; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 682; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 683; GFX6-NEXT: ; implicit-def: $vgpr1 684; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc 685; GFX6-NEXT: s_cbranch_execz .LBB5_2 686; GFX6-NEXT: ; %bb.1: 687; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 688; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 689; GFX6-NEXT: s_mul_i32 s0, s0, 5 690; GFX6-NEXT: v_mov_b32_e32 v1, s0 691; GFX6-NEXT: v_mov_b32_e32 v2, 0 692; GFX6-NEXT: s_waitcnt lgkmcnt(0) 693; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 694; GFX6-NEXT: .LBB5_2: 695; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] 696; GFX6-NEXT: s_waitcnt vmcnt(0) 697; GFX6-NEXT: v_readfirstlane_b32 s0, v1 698; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0 699; GFX6-NEXT: s_mov_b32 s7, 0xf000 700; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 701; GFX6-NEXT: s_mov_b32 s6, -1 702; GFX6-NEXT: s_waitcnt lgkmcnt(0) 703; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 704; GFX6-NEXT: s_endpgm 705; 706; GFX8-LABEL: sub_i32_constant: 707; GFX8: ; %bb.0: ; %entry 708; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 709; GFX8-NEXT: s_mov_b64 s[6:7], exec 710; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 711; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 712; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 713; GFX8-NEXT: ; implicit-def: $vgpr1 714; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 715; GFX8-NEXT: s_cbranch_execz .LBB5_2 716; GFX8-NEXT: ; %bb.1: 717; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 718; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 719; GFX8-NEXT: s_mul_i32 s0, 
s0, 5 720; GFX8-NEXT: v_mov_b32_e32 v1, s0 721; GFX8-NEXT: v_mov_b32_e32 v2, 0 722; GFX8-NEXT: s_waitcnt lgkmcnt(0) 723; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 724; GFX8-NEXT: .LBB5_2: 725; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 726; GFX8-NEXT: s_waitcnt vmcnt(0) 727; GFX8-NEXT: v_readfirstlane_b32 s0, v1 728; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 729; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 730; GFX8-NEXT: s_waitcnt lgkmcnt(0) 731; GFX8-NEXT: v_mov_b32_e32 v0, s2 732; GFX8-NEXT: v_mov_b32_e32 v1, s3 733; GFX8-NEXT: flat_store_dword v[0:1], v2 734; GFX8-NEXT: s_endpgm 735; 736; GFX9-LABEL: sub_i32_constant: 737; GFX9: ; %bb.0: ; %entry 738; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 739; GFX9-NEXT: s_mov_b64 s[6:7], exec 740; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 741; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 742; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 743; GFX9-NEXT: ; implicit-def: $vgpr1 744; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 745; GFX9-NEXT: s_cbranch_execz .LBB5_2 746; GFX9-NEXT: ; %bb.1: 747; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 748; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 749; GFX9-NEXT: s_mul_i32 s0, s0, 5 750; GFX9-NEXT: v_mov_b32_e32 v1, s0 751; GFX9-NEXT: v_mov_b32_e32 v2, 0 752; GFX9-NEXT: s_waitcnt lgkmcnt(0) 753; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 754; GFX9-NEXT: .LBB5_2: 755; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 756; GFX9-NEXT: s_waitcnt vmcnt(0) 757; GFX9-NEXT: v_readfirstlane_b32 s0, v1 758; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 759; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 760; GFX9-NEXT: v_mov_b32_e32 v1, 0 761; GFX9-NEXT: s_waitcnt lgkmcnt(0) 762; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 763; GFX9-NEXT: s_endpgm 764; 765; GFX10W64-LABEL: sub_i32_constant: 766; GFX10W64: ; %bb.0: ; %entry 767; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 768; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 769; GFX10W64-NEXT: ; implicit-def: $vgpr1 770; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 771; 
GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 772; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 773; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 774; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 775; GFX10W64-NEXT: ; %bb.1: 776; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 777; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 778; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 779; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 780; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 781; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 782; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 783; GFX10W64-NEXT: .LBB5_2: 784; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 785; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 786; GFX10W64-NEXT: s_waitcnt vmcnt(0) 787; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 788; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 789; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 790; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 791; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 792; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 793; GFX10W64-NEXT: s_endpgm 794; 795; GFX10W32-LABEL: sub_i32_constant: 796; GFX10W32: ; %bb.0: ; %entry 797; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 798; GFX10W32-NEXT: s_mov_b32 s5, exec_lo 799; GFX10W32-NEXT: ; implicit-def: $vgpr1 800; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 801; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 802; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 803; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 804; GFX10W32-NEXT: ; %bb.1: 805; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 806; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 807; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 808; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 809; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 810; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 811; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 812; GFX10W32-NEXT: .LBB5_2: 813; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 814; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 815; GFX10W32-NEXT: s_waitcnt vmcnt(0) 816; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 
817; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 818; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 819; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 820; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 821; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 822; GFX10W32-NEXT: s_endpgm 823entry: 824 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) 825 store i32 %old, i32 addrspace(1)* %out 826 ret void 827} 828 829define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) { 830; GFX6-LABEL: sub_i32_uniform: 831; GFX6: ; %bb.0: ; %entry 832; GFX6-NEXT: s_mov_b64 s[2:3], exec 833; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 834; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 835; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 836; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 837; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 838; GFX6-NEXT: ; implicit-def: $vgpr1 839; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 840; GFX6-NEXT: s_cbranch_execz .LBB6_2 841; GFX6-NEXT: ; %bb.1: 842; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd 843; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 844; GFX6-NEXT: s_waitcnt lgkmcnt(0) 845; GFX6-NEXT: s_mul_i32 s0, s8, s0 846; GFX6-NEXT: v_mov_b32_e32 v1, s0 847; GFX6-NEXT: v_mov_b32_e32 v2, 0 848; GFX6-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc 849; GFX6-NEXT: .LBB6_2: 850; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 851; GFX6-NEXT: s_waitcnt vmcnt(0) 852; GFX6-NEXT: v_readfirstlane_b32 s0, v1 853; GFX6-NEXT: s_waitcnt lgkmcnt(0) 854; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 855; GFX6-NEXT: s_mov_b32 s7, 0xf000 856; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 857; GFX6-NEXT: s_mov_b32 s6, -1 858; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 859; GFX6-NEXT: s_endpgm 860; 861; GFX8-LABEL: sub_i32_uniform: 862; GFX8: ; %bb.0: ; %entry 863; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 864; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 865; GFX8-NEXT: s_mov_b64 s[6:7], exec 866; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v0, s6, 0 867; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 868; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 869; GFX8-NEXT: ; implicit-def: $vgpr1 870; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 871; GFX8-NEXT: s_cbranch_execz .LBB6_2 872; GFX8-NEXT: ; %bb.1: 873; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 874; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 875; GFX8-NEXT: s_waitcnt lgkmcnt(0) 876; GFX8-NEXT: s_mul_i32 s0, s8, s0 877; GFX8-NEXT: v_mov_b32_e32 v1, s0 878; GFX8-NEXT: v_mov_b32_e32 v2, 0 879; GFX8-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc 880; GFX8-NEXT: .LBB6_2: 881; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 882; GFX8-NEXT: s_waitcnt lgkmcnt(0) 883; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 884; GFX8-NEXT: s_waitcnt vmcnt(0) 885; GFX8-NEXT: v_readfirstlane_b32 s0, v1 886; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 887; GFX8-NEXT: v_mov_b32_e32 v0, s2 888; GFX8-NEXT: v_mov_b32_e32 v1, s3 889; GFX8-NEXT: flat_store_dword v[0:1], v2 890; GFX8-NEXT: s_endpgm 891; 892; GFX9-LABEL: sub_i32_uniform: 893; GFX9: ; %bb.0: ; %entry 894; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 895; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 896; GFX9-NEXT: s_mov_b64 s[6:7], exec 897; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 898; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 899; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 900; GFX9-NEXT: ; implicit-def: $vgpr1 901; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 902; GFX9-NEXT: s_cbranch_execz .LBB6_2 903; GFX9-NEXT: ; %bb.1: 904; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 905; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 906; GFX9-NEXT: s_waitcnt lgkmcnt(0) 907; GFX9-NEXT: s_mul_i32 s0, s8, s0 908; GFX9-NEXT: v_mov_b32_e32 v1, s0 909; GFX9-NEXT: v_mov_b32_e32 v2, 0 910; GFX9-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc 911; GFX9-NEXT: .LBB6_2: 912; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 913; GFX9-NEXT: s_waitcnt lgkmcnt(0) 914; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 915; GFX9-NEXT: s_waitcnt vmcnt(0) 916; GFX9-NEXT: 
v_readfirstlane_b32 s0, v1 917; GFX9-NEXT: v_mov_b32_e32 v1, 0 918; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 919; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 920; GFX9-NEXT: s_endpgm 921; 922; GFX10W64-LABEL: sub_i32_uniform: 923; GFX10W64: ; %bb.0: ; %entry 924; GFX10W64-NEXT: s_clause 0x1 925; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 926; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 927; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 928; GFX10W64-NEXT: ; implicit-def: $vgpr1 929; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 930; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 931; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 932; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 933; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 934; GFX10W64-NEXT: ; %bb.1: 935; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 936; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 937; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 938; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 939; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 940; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 941; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc 942; GFX10W64-NEXT: .LBB6_2: 943; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 944; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 945; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 946; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 947; GFX10W64-NEXT: s_waitcnt vmcnt(0) 948; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 949; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 950; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 951; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 952; GFX10W64-NEXT: s_endpgm 953; 954; GFX10W32-LABEL: sub_i32_uniform: 955; GFX10W32: ; %bb.0: ; %entry 956; GFX10W32-NEXT: s_clause 0x1 957; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 958; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 959; GFX10W32-NEXT: s_mov_b32 s6, exec_lo 960; GFX10W32-NEXT: ; implicit-def: $vgpr1 961; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 962; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 963; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo 
964; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 965; GFX10W32-NEXT: ; %bb.1: 966; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 967; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 968; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 969; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 970; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 971; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 972; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 973; GFX10W32-NEXT: .LBB6_2: 974; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 975; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 976; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 977; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 978; GFX10W32-NEXT: s_waitcnt vmcnt(0) 979; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 980; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 981; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 982; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 983; GFX10W32-NEXT: s_endpgm 984entry: 985 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) 986 store i32 %old, i32 addrspace(1)* %out 987 ret void 988} 989 990define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { 991; GFX6-LABEL: sub_i32_varying_vdata: 992; GFX6: ; %bb.0: ; %entry 993; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 994; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 995; GFX6-NEXT: v_mov_b32_e32 v1, 0 996; GFX6-NEXT: s_waitcnt lgkmcnt(0) 997; GFX6-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 idxen glc 998; GFX6-NEXT: s_mov_b32 s3, 0xf000 999; GFX6-NEXT: s_mov_b32 s2, -1 1000; GFX6-NEXT: s_waitcnt vmcnt(0) 1001; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 1002; GFX6-NEXT: s_endpgm 1003; 1004; GFX8-LABEL: sub_i32_varying_vdata: 1005; GFX8: ; %bb.0: ; %entry 1006; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1007; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1008; GFX8-NEXT: v_mov_b32_e32 v1, 0 1009; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1010; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1011; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, 
exec_hi, v3 1012; GFX8-NEXT: v_mov_b32_e32 v2, v0 1013; GFX8-NEXT: s_not_b64 exec, exec 1014; GFX8-NEXT: v_mov_b32_e32 v2, 0 1015; GFX8-NEXT: s_not_b64 exec, exec 1016; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1017; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1018; GFX8-NEXT: s_nop 1 1019; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1020; GFX8-NEXT: s_nop 1 1021; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1022; GFX8-NEXT: s_nop 1 1023; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1024; GFX8-NEXT: s_nop 1 1025; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1026; GFX8-NEXT: s_nop 1 1027; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1028; GFX8-NEXT: v_readlane_b32 s6, v2, 63 1029; GFX8-NEXT: s_nop 0 1030; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1031; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1032; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1033; GFX8-NEXT: ; implicit-def: $vgpr0 1034; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1035; GFX8-NEXT: s_cbranch_execz .LBB7_2 1036; GFX8-NEXT: ; %bb.1: 1037; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1038; GFX8-NEXT: v_mov_b32_e32 v0, s6 1039; GFX8-NEXT: v_mov_b32_e32 v3, 0 1040; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1041; GFX8-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc 1042; GFX8-NEXT: .LBB7_2: 1043; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1044; GFX8-NEXT: s_waitcnt vmcnt(0) 1045; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1046; GFX8-NEXT: v_mov_b32_e32 v0, v1 1047; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1048; GFX8-NEXT: v_mov_b32_e32 v4, s3 1049; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1050; GFX8-NEXT: v_mov_b32_e32 v3, s2 1051; GFX8-NEXT: flat_store_dword v[3:4], v0 1052; GFX8-NEXT: s_endpgm 1053; 1054; GFX9-LABEL: sub_i32_varying_vdata: 1055; GFX9: ; %bb.0: 
; %entry 1056; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1057; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1058; GFX9-NEXT: v_mov_b32_e32 v1, 0 1059; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1060; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1061; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1062; GFX9-NEXT: v_mov_b32_e32 v2, v0 1063; GFX9-NEXT: s_not_b64 exec, exec 1064; GFX9-NEXT: v_mov_b32_e32 v2, 0 1065; GFX9-NEXT: s_not_b64 exec, exec 1066; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1067; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1068; GFX9-NEXT: s_nop 1 1069; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1070; GFX9-NEXT: s_nop 1 1071; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1072; GFX9-NEXT: s_nop 1 1073; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1074; GFX9-NEXT: s_nop 1 1075; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1076; GFX9-NEXT: s_nop 1 1077; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1078; GFX9-NEXT: v_readlane_b32 s6, v2, 63 1079; GFX9-NEXT: s_nop 0 1080; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1081; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1082; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1083; GFX9-NEXT: ; implicit-def: $vgpr0 1084; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1085; GFX9-NEXT: s_cbranch_execz .LBB7_2 1086; GFX9-NEXT: ; %bb.1: 1087; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1088; GFX9-NEXT: v_mov_b32_e32 v0, s6 1089; GFX9-NEXT: v_mov_b32_e32 v3, 0 1090; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1091; GFX9-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc 1092; GFX9-NEXT: .LBB7_2: 1093; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1094; GFX9-NEXT: s_waitcnt vmcnt(0) 1095; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1096; GFX9-NEXT: v_mov_b32_e32 v0, v1 1097; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1098; 
GFX9-NEXT: v_mov_b32_e32 v3, 0 1099; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1100; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 1101; GFX9-NEXT: s_endpgm 1102; 1103; GFX10W64-LABEL: sub_i32_varying_vdata: 1104; GFX10W64: ; %bb.0: ; %entry 1105; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 1106; GFX10W64-NEXT: s_not_b64 exec, exec 1107; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1108; GFX10W64-NEXT: s_not_b64 exec, exec 1109; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 1110; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1111; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 1112; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1113; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1114; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1115; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 1116; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1117; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1118; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 1119; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 1120; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1121; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 1122; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1123; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 1124; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1125; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1126; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 1127; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 1128; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1129; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1130; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1131; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 1132; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 1133; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 1134; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1135; GFX10W64-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1136; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1137; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 1138; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1139; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1140; GFX10W64-NEXT: ; implicit-def: $vgpr0 1141; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1142; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 1143; GFX10W64-NEXT: ; %bb.1: 1144; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1145; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 1146; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 1147; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1148; GFX10W64-NEXT: buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc 1149; GFX10W64-NEXT: .LBB7_2: 1150; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1151; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 1152; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1153; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 1154; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 1155; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 1156; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1157; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1158; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 1159; GFX10W64-NEXT: s_endpgm 1160; 1161; GFX10W32-LABEL: sub_i32_varying_vdata: 1162; GFX10W32: ; %bb.0: ; %entry 1163; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 1164; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 1165; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1166; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 1167; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 1168; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1169; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1170; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1171; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1172; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 1173; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1174; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 1175; GFX10W32-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x24 1176; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 1177; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1178; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 1179; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 1180; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1181; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 1182; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 1183; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1184; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 1185; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 1186; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 1187; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1188; GFX10W32-NEXT: ; implicit-def: $vgpr0 1189; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1190; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 1191; GFX10W32-NEXT: ; %bb.1: 1192; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1193; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 1194; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 1195; GFX10W32-NEXT: s_mov_b32 s5, s6 1196; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1197; GFX10W32-NEXT: buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc 1198; GFX10W32-NEXT: .LBB7_2: 1199; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1200; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1201; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1202; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 1203; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 1204; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 1205; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1206; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1207; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 1208; GFX10W32-NEXT: s_endpgm 1209entry: 1210 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1211 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) 1212 store i32 %old, i32 addrspace(1)* %out 1213 ret void 1214} 1215 1216define amdgpu_kernel void @sub_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) { 1217; GFX6-LABEL: sub_i32_varying_vindex: 1218; GFX6: ; 
%bb.0: ; %entry 1219; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1220; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1221; GFX6-NEXT: v_mov_b32_e32 v1, 1 1222; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1223; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc 1224; GFX6-NEXT: s_mov_b32 s3, 0xf000 1225; GFX6-NEXT: s_mov_b32 s2, -1 1226; GFX6-NEXT: s_waitcnt vmcnt(0) 1227; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 1228; GFX6-NEXT: s_endpgm 1229; 1230; GFX8-LABEL: sub_i32_varying_vindex: 1231; GFX8: ; %bb.0: ; %entry 1232; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1233; GFX8-NEXT: v_mov_b32_e32 v2, 1 1234; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1235; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1236; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 idxen glc 1237; GFX8-NEXT: v_mov_b32_e32 v0, s0 1238; GFX8-NEXT: v_mov_b32_e32 v1, s1 1239; GFX8-NEXT: s_waitcnt vmcnt(0) 1240; GFX8-NEXT: flat_store_dword v[0:1], v2 1241; GFX8-NEXT: s_endpgm 1242; 1243; GFX9-LABEL: sub_i32_varying_vindex: 1244; GFX9: ; %bb.0: ; %entry 1245; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1246; GFX9-NEXT: v_mov_b32_e32 v1, 1 1247; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1248; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1249; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc 1250; GFX9-NEXT: v_mov_b32_e32 v0, 0 1251; GFX9-NEXT: s_waitcnt vmcnt(0) 1252; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1253; GFX9-NEXT: s_endpgm 1254; 1255; GFX10-LABEL: sub_i32_varying_vindex: 1256; GFX10: ; %bb.0: ; %entry 1257; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1258; GFX10-NEXT: v_mov_b32_e32 v1, 1 1259; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1260; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1261; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc 1262; GFX10-NEXT: v_mov_b32_e32 v0, 0 1263; GFX10-NEXT: s_waitcnt vmcnt(0) 1264; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1265; GFX10-NEXT: s_endpgm 1266entry: 1267 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1268 %old = call i32 
@llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0) 1269 store i32 %old, i32 addrspace(1)* %out 1270 ret void 1271} 1272 1273define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 1274; GFX6-LABEL: sub_i32_varying_offset: 1275; GFX6: ; %bb.0: ; %entry 1276; GFX6-NEXT: v_mov_b32_e32 v1, v0 1277; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1278; GFX6-NEXT: s_mov_b32 s2, 0 1279; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1280; GFX6-NEXT: v_mov_b32_e32 v0, s2 1281; GFX6-NEXT: v_mov_b32_e32 v2, 1 1282; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1283; GFX6-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc 1284; GFX6-NEXT: s_mov_b32 s3, 0xf000 1285; GFX6-NEXT: s_mov_b32 s2, -1 1286; GFX6-NEXT: s_waitcnt vmcnt(0) 1287; GFX6-NEXT: buffer_store_dword v2, off, s[0:3], 0 1288; GFX6-NEXT: s_endpgm 1289; 1290; GFX8-LABEL: sub_i32_varying_offset: 1291; GFX8: ; %bb.0: ; %entry 1292; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1293; GFX8-NEXT: s_mov_b32 s2, 0 1294; GFX8-NEXT: v_mov_b32_e32 v1, v0 1295; GFX8-NEXT: v_mov_b32_e32 v0, s2 1296; GFX8-NEXT: v_mov_b32_e32 v2, 1 1297; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1298; GFX8-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc 1299; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1300; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1301; GFX8-NEXT: v_mov_b32_e32 v0, s0 1302; GFX8-NEXT: v_mov_b32_e32 v1, s1 1303; GFX8-NEXT: s_waitcnt vmcnt(0) 1304; GFX8-NEXT: flat_store_dword v[0:1], v2 1305; GFX8-NEXT: s_endpgm 1306; 1307; GFX9-LABEL: sub_i32_varying_offset: 1308; GFX9: ; %bb.0: ; %entry 1309; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1310; GFX9-NEXT: s_mov_b32 s2, 0 1311; GFX9-NEXT: v_mov_b32_e32 v1, v0 1312; GFX9-NEXT: v_mov_b32_e32 v0, s2 1313; GFX9-NEXT: v_mov_b32_e32 v2, 1 1314; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1315; GFX9-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc 1316; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1317; 
GFX9-NEXT: v_mov_b32_e32 v0, 0 1318; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1319; GFX9-NEXT: global_store_dword v0, v2, s[0:1] 1320; GFX9-NEXT: s_endpgm 1321; 1322; GFX10-LABEL: sub_i32_varying_offset: 1323; GFX10: ; %bb.0: ; %entry 1324; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1325; GFX10-NEXT: s_mov_b32 s2, 0 1326; GFX10-NEXT: v_mov_b32_e32 v1, v0 1327; GFX10-NEXT: v_mov_b32_e32 v0, s2 1328; GFX10-NEXT: v_mov_b32_e32 v2, 1 1329; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1330; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1331; GFX10-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc 1332; GFX10-NEXT: v_mov_b32_e32 v0, 0 1333; GFX10-NEXT: s_waitcnt vmcnt(0) 1334; GFX10-NEXT: global_store_dword v0, v2, s[0:1] 1335; GFX10-NEXT: s_endpgm 1336entry: 1337 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1338 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0) 1339 store i32 %old, i32 addrspace(1)* %out 1340 ret void 1341} 1342