1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s 3; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg) 10declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32 immarg) 11declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32 immarg) 12 13; Show what the atomic optimization pass will do for raw and struct buffers. 
; add_i32_constant - every lane adds the constant 5 to the buffer at offset 0
; through llvm.amdgcn.raw.buffer.atomic.add and stores the returned old value
; to %out. The expected code that follows issues one buffer_atomic_add from the
; lane whose mbcnt is 0, using (s_bcnt1 of the saved exec mask) * 5 as the
; operand, and then rebuilds each lane's result with v_readfirstlane_b32 and
; v_mad_u32_u24 (lane mbcnt * 5 + old value).
; The check lines are autogenerated - regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { 16; GFX6-LABEL: add_i32_constant: 17; GFX6: ; %bb.0: ; %entry 18; GFX6-NEXT: s_mov_b64 s[2:3], exec 19; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 20; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 21; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 22; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 23; GFX6-NEXT: ; implicit-def: $vgpr1 24; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 25; GFX6-NEXT: s_cbranch_execz .LBB0_2 26; GFX6-NEXT: ; %bb.1: 27; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 28; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 29; GFX6-NEXT: s_mul_i32 s0, s0, 5 30; GFX6-NEXT: v_mov_b32_e32 v1, s0 31; GFX6-NEXT: s_waitcnt lgkmcnt(0) 32; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 33; GFX6-NEXT: .LBB0_2: 34; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 35; GFX6-NEXT: s_waitcnt vmcnt(0) 36; GFX6-NEXT: v_readfirstlane_b32 s0, v1 37; GFX6-NEXT: s_mov_b32 s7, 0xf000 38; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0 39; GFX6-NEXT: s_mov_b32 s6, -1 40; GFX6-NEXT: s_waitcnt lgkmcnt(0) 41; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 42; GFX6-NEXT: s_endpgm 43; 44; GFX8-LABEL: add_i32_constant: 45; GFX8: ; %bb.0: ; %entry 46; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 47; GFX8-NEXT: s_mov_b64 s[6:7], exec 48; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 49; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 50; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 51; GFX8-NEXT: ; implicit-def: $vgpr1 52; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 53; GFX8-NEXT: s_cbranch_execz .LBB0_2 54; GFX8-NEXT: ; %bb.1: 55; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 56; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 57; GFX8-NEXT: s_mul_i32 s0, s0, 5 58; GFX8-NEXT: v_mov_b32_e32 v1, s0 59; GFX8-NEXT: s_waitcnt lgkmcnt(0) 60; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 61; GFX8-NEXT: .LBB0_2: 62; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 63; GFX8-NEXT: s_waitcnt vmcnt(0) 64; GFX8-NEXT: v_readfirstlane_b32 s0, v1 65; 
GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0 66; GFX8-NEXT: s_waitcnt lgkmcnt(0) 67; GFX8-NEXT: v_mov_b32_e32 v0, s2 68; GFX8-NEXT: v_mov_b32_e32 v1, s3 69; GFX8-NEXT: flat_store_dword v[0:1], v2 70; GFX8-NEXT: s_endpgm 71; 72; GFX9-LABEL: add_i32_constant: 73; GFX9: ; %bb.0: ; %entry 74; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 75; GFX9-NEXT: s_mov_b64 s[6:7], exec 76; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 77; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 78; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 79; GFX9-NEXT: ; implicit-def: $vgpr1 80; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 81; GFX9-NEXT: s_cbranch_execz .LBB0_2 82; GFX9-NEXT: ; %bb.1: 83; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 84; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 85; GFX9-NEXT: s_mul_i32 s0, s0, 5 86; GFX9-NEXT: v_mov_b32_e32 v1, s0 87; GFX9-NEXT: s_waitcnt lgkmcnt(0) 88; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 89; GFX9-NEXT: .LBB0_2: 90; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 91; GFX9-NEXT: s_waitcnt vmcnt(0) 92; GFX9-NEXT: v_readfirstlane_b32 s0, v1 93; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 94; GFX9-NEXT: v_mov_b32_e32 v1, 0 95; GFX9-NEXT: s_waitcnt lgkmcnt(0) 96; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 97; GFX9-NEXT: s_endpgm 98; 99; GFX10W64-LABEL: add_i32_constant: 100; GFX10W64: ; %bb.0: ; %entry 101; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 102; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 103; GFX10W64-NEXT: ; implicit-def: $vgpr1 104; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 105; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 106; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 107; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 108; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 109; GFX10W64-NEXT: ; %bb.1: 110; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 111; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 112; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 113; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 114; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 115; GFX10W64-NEXT: buffer_atomic_add v1, off, 
s[8:11], 0 glc 116; GFX10W64-NEXT: .LBB0_2: 117; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 118; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 119; GFX10W64-NEXT: s_waitcnt vmcnt(0) 120; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 121; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 122; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 123; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 124; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 125; GFX10W64-NEXT: s_endpgm 126; 127; GFX10W32-LABEL: add_i32_constant: 128; GFX10W32: ; %bb.0: ; %entry 129; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 130; GFX10W32-NEXT: s_mov_b32 s5, exec_lo 131; GFX10W32-NEXT: ; implicit-def: $vgpr1 132; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 133; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 134; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 135; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 136; GFX10W32-NEXT: ; %bb.1: 137; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 138; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 139; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 140; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 141; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 142; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 143; GFX10W32-NEXT: .LBB0_2: 144; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 145; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 146; GFX10W32-NEXT: s_waitcnt vmcnt(0) 147; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 148; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 149; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 150; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 151; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 152; GFX10W32-NEXT: s_endpgm 153entry: 154 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) 155 store i32 %old, i32 addrspace(1)* %out 156 ret void 157} 158 159define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) { 160; GFX6-LABEL: add_i32_uniform: 161; GFX6: ; %bb.0: ; %entry 162; GFX6-NEXT: s_mov_b64 s[2:3], exec 163; GFX6-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x9 164; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 165; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 166; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 167; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 168; GFX6-NEXT: ; implicit-def: $vgpr1 169; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 170; GFX6-NEXT: s_cbranch_execz .LBB1_2 171; GFX6-NEXT: ; %bb.1: 172; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd 173; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 174; GFX6-NEXT: s_waitcnt lgkmcnt(0) 175; GFX6-NEXT: s_mul_i32 s0, s8, s0 176; GFX6-NEXT: v_mov_b32_e32 v1, s0 177; GFX6-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 178; GFX6-NEXT: .LBB1_2: 179; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 180; GFX6-NEXT: s_waitcnt vmcnt(0) 181; GFX6-NEXT: v_readfirstlane_b32 s0, v1 182; GFX6-NEXT: s_waitcnt lgkmcnt(0) 183; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 184; GFX6-NEXT: s_mov_b32 s7, 0xf000 185; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 186; GFX6-NEXT: s_mov_b32 s6, -1 187; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 188; GFX6-NEXT: s_endpgm 189; 190; GFX8-LABEL: add_i32_uniform: 191; GFX8: ; %bb.0: ; %entry 192; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 193; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 194; GFX8-NEXT: s_mov_b64 s[4:5], exec 195; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 196; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 197; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 198; GFX8-NEXT: ; implicit-def: $vgpr1 199; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 200; GFX8-NEXT: s_cbranch_execz .LBB1_2 201; GFX8-NEXT: ; %bb.1: 202; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 203; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 204; GFX8-NEXT: s_waitcnt lgkmcnt(0) 205; GFX8-NEXT: s_mul_i32 s0, s8, s0 206; GFX8-NEXT: v_mov_b32_e32 v1, s0 207; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 208; GFX8-NEXT: .LBB1_2: 209; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 210; GFX8-NEXT: s_waitcnt lgkmcnt(0) 211; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 212; GFX8-NEXT: s_waitcnt vmcnt(0) 213; GFX8-NEXT: 
v_readfirstlane_b32 s0, v1 214; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 215; GFX8-NEXT: v_mov_b32_e32 v0, s2 216; GFX8-NEXT: v_mov_b32_e32 v1, s3 217; GFX8-NEXT: flat_store_dword v[0:1], v2 218; GFX8-NEXT: s_endpgm 219; 220; GFX9-LABEL: add_i32_uniform: 221; GFX9: ; %bb.0: ; %entry 222; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 223; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 224; GFX9-NEXT: s_mov_b64 s[4:5], exec 225; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 226; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 227; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 228; GFX9-NEXT: ; implicit-def: $vgpr1 229; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 230; GFX9-NEXT: s_cbranch_execz .LBB1_2 231; GFX9-NEXT: ; %bb.1: 232; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 233; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 234; GFX9-NEXT: s_waitcnt lgkmcnt(0) 235; GFX9-NEXT: s_mul_i32 s0, s8, s0 236; GFX9-NEXT: v_mov_b32_e32 v1, s0 237; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 238; GFX9-NEXT: .LBB1_2: 239; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] 240; GFX9-NEXT: s_waitcnt lgkmcnt(0) 241; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 242; GFX9-NEXT: s_waitcnt vmcnt(0) 243; GFX9-NEXT: v_readfirstlane_b32 s0, v1 244; GFX9-NEXT: v_mov_b32_e32 v1, 0 245; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 246; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 247; GFX9-NEXT: s_endpgm 248; 249; GFX10W64-LABEL: add_i32_uniform: 250; GFX10W64: ; %bb.0: ; %entry 251; GFX10W64-NEXT: s_clause 0x1 252; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 253; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 254; GFX10W64-NEXT: s_mov_b64 s[4:5], exec 255; GFX10W64-NEXT: ; implicit-def: $vgpr1 256; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 257; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 258; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 259; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc 260; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 261; GFX10W64-NEXT: ; %bb.1: 262; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 263; GFX10W64-NEXT: 
s_bcnt1_i32_b64 s0, s[4:5] 264; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 265; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 266; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 267; GFX10W64-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 268; GFX10W64-NEXT: .LBB1_2: 269; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 270; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] 271; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 272; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 273; GFX10W64-NEXT: s_waitcnt vmcnt(0) 274; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 275; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 276; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 277; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 278; GFX10W64-NEXT: s_endpgm 279; 280; GFX10W32-LABEL: add_i32_uniform: 281; GFX10W32: ; %bb.0: ; %entry 282; GFX10W32-NEXT: s_clause 0x1 283; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 284; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 285; GFX10W32-NEXT: s_mov_b32 s6, exec_lo 286; GFX10W32-NEXT: ; implicit-def: $vgpr1 287; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 288; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 289; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo 290; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 291; GFX10W32-NEXT: ; %bb.1: 292; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 293; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 294; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 295; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 296; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 297; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 298; GFX10W32-NEXT: .LBB1_2: 299; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 300; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 301; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 302; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 303; GFX10W32-NEXT: s_waitcnt vmcnt(0) 304; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 305; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 306; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 307; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 308; GFX10W32-NEXT: s_endpgm 309entry: 310 %old = call i32 
@llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0) 311 store i32 %old, i32 addrspace(1)* %out 312 ret void 313} 314 315define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { 316; GFX6-LABEL: add_i32_varying_vdata: 317; GFX6: ; %bb.0: ; %entry 318; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 319; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 320; GFX6-NEXT: s_waitcnt lgkmcnt(0) 321; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 322; GFX6-NEXT: s_mov_b32 s3, 0xf000 323; GFX6-NEXT: s_mov_b32 s2, -1 324; GFX6-NEXT: s_waitcnt vmcnt(0) 325; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 326; GFX6-NEXT: s_endpgm 327; 328; GFX8-LABEL: add_i32_varying_vdata: 329; GFX8: ; %bb.0: ; %entry 330; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 331; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 332; GFX8-NEXT: v_mov_b32_e32 v1, 0 333; GFX8-NEXT: s_mov_b64 exec, s[4:5] 334; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 335; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 336; GFX8-NEXT: v_mov_b32_e32 v2, v0 337; GFX8-NEXT: s_not_b64 exec, exec 338; GFX8-NEXT: v_mov_b32_e32 v2, 0 339; GFX8-NEXT: s_not_b64 exec, exec 340; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 341; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 342; GFX8-NEXT: s_nop 1 343; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 344; GFX8-NEXT: s_nop 1 345; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 346; GFX8-NEXT: s_nop 1 347; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 348; GFX8-NEXT: s_nop 1 349; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 350; GFX8-NEXT: s_nop 1 351; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 352; GFX8-NEXT: v_readlane_b32 s6, v2, 63 353; GFX8-NEXT: s_nop 0 354; GFX8-NEXT: 
v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 355; GFX8-NEXT: s_mov_b64 exec, s[4:5] 356; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 357; GFX8-NEXT: ; implicit-def: $vgpr0 358; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 359; GFX8-NEXT: s_cbranch_execz .LBB2_2 360; GFX8-NEXT: ; %bb.1: 361; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 362; GFX8-NEXT: v_mov_b32_e32 v0, s6 363; GFX8-NEXT: s_waitcnt lgkmcnt(0) 364; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 365; GFX8-NEXT: .LBB2_2: 366; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 367; GFX8-NEXT: s_waitcnt vmcnt(0) 368; GFX8-NEXT: v_readfirstlane_b32 s0, v0 369; GFX8-NEXT: v_mov_b32_e32 v0, v1 370; GFX8-NEXT: s_waitcnt lgkmcnt(0) 371; GFX8-NEXT: v_mov_b32_e32 v4, s3 372; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 373; GFX8-NEXT: v_mov_b32_e32 v3, s2 374; GFX8-NEXT: flat_store_dword v[3:4], v0 375; GFX8-NEXT: s_endpgm 376; 377; GFX9-LABEL: add_i32_varying_vdata: 378; GFX9: ; %bb.0: ; %entry 379; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 380; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 381; GFX9-NEXT: v_mov_b32_e32 v1, 0 382; GFX9-NEXT: s_mov_b64 exec, s[4:5] 383; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 384; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 385; GFX9-NEXT: v_mov_b32_e32 v2, v0 386; GFX9-NEXT: s_not_b64 exec, exec 387; GFX9-NEXT: v_mov_b32_e32 v2, 0 388; GFX9-NEXT: s_not_b64 exec, exec 389; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 390; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 391; GFX9-NEXT: s_nop 1 392; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 393; GFX9-NEXT: s_nop 1 394; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 395; GFX9-NEXT: s_nop 1 396; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 397; GFX9-NEXT: s_nop 1 398; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 399; GFX9-NEXT: s_nop 1 400; 
GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 401; GFX9-NEXT: v_readlane_b32 s6, v2, 63 402; GFX9-NEXT: s_nop 0 403; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 404; GFX9-NEXT: s_mov_b64 exec, s[4:5] 405; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 406; GFX9-NEXT: ; implicit-def: $vgpr0 407; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 408; GFX9-NEXT: s_cbranch_execz .LBB2_2 409; GFX9-NEXT: ; %bb.1: 410; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 411; GFX9-NEXT: v_mov_b32_e32 v0, s6 412; GFX9-NEXT: s_waitcnt lgkmcnt(0) 413; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 414; GFX9-NEXT: .LBB2_2: 415; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 416; GFX9-NEXT: s_waitcnt vmcnt(0) 417; GFX9-NEXT: v_readfirstlane_b32 s0, v0 418; GFX9-NEXT: v_mov_b32_e32 v0, v1 419; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 420; GFX9-NEXT: v_mov_b32_e32 v3, 0 421; GFX9-NEXT: s_waitcnt lgkmcnt(0) 422; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 423; GFX9-NEXT: s_endpgm 424; 425; GFX10W64-LABEL: add_i32_varying_vdata: 426; GFX10W64: ; %bb.0: ; %entry 427; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 428; GFX10W64-NEXT: s_not_b64 exec, exec 429; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 430; GFX10W64-NEXT: s_not_b64 exec, exec 431; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 432; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 433; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 434; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 435; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 436; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 437; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 438; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 439; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 440; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 441; GFX10W64-NEXT: 
v_mov_b32_e32 v2, s4 442; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 443; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 444; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 445; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 446; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 447; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 448; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 449; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 450; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 451; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 452; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 453; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 454; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 455; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 456; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 457; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 458; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 459; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 460; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 461; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 462; GFX10W64-NEXT: ; implicit-def: $vgpr0 463; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 464; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 465; GFX10W64-NEXT: ; %bb.1: 466; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 467; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 468; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 469; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 470; GFX10W64-NEXT: .LBB2_2: 471; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 472; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 473; GFX10W64-NEXT: s_waitcnt vmcnt(0) 474; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 475; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 476; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 477; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 478; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 479; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 480; GFX10W64-NEXT: s_endpgm 481; 482; GFX10W32-LABEL: add_i32_varying_vdata: 483; GFX10W32: ; %bb.0: ; %entry 484; GFX10W32-NEXT: v_mov_b32_e32 v1, 
v0 485; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 486; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 487; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 488; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 489; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 490; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 491; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 492; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 493; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 494; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 495; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 496; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 497; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 498; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 499; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 500; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 501; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 502; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 503; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 504; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 505; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 506; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 507; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 508; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 509; GFX10W32-NEXT: ; implicit-def: $vgpr0 510; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 511; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 512; GFX10W32-NEXT: ; %bb.1: 513; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 514; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 515; GFX10W32-NEXT: s_mov_b32 s5, s6 516; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 517; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 518; GFX10W32-NEXT: .LBB2_2: 519; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 520; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 521; GFX10W32-NEXT: s_waitcnt vmcnt(0) 522; GFX10W32-NEXT: 
v_readfirstlane_b32 s0, v0 523; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 524; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 525; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 526; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 527; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 528; GFX10W32-NEXT: s_endpgm 529entry: 530 %lane = call i32 @llvm.amdgcn.workitem.id.x() 531 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) 532 store i32 %old, i32 addrspace(1)* %out 533 ret void 534} 535 536define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %vindex) { 537; GFX6-LABEL: struct_add_i32_varying_vdata: 538; GFX6: ; %bb.0: ; %entry 539; GFX6-NEXT: s_load_dword s2, s[0:1], 0x11 540; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 541; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 542; GFX6-NEXT: s_waitcnt lgkmcnt(0) 543; GFX6-NEXT: v_mov_b32_e32 v1, s2 544; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc 545; GFX6-NEXT: s_mov_b32 s3, 0xf000 546; GFX6-NEXT: s_mov_b32 s2, -1 547; GFX6-NEXT: s_waitcnt vmcnt(0) 548; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 549; GFX6-NEXT: s_endpgm 550; 551; GFX8-LABEL: struct_add_i32_varying_vdata: 552; GFX8: ; %bb.0: ; %entry 553; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 554; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 555; GFX8-NEXT: v_mov_b32_e32 v1, 0 556; GFX8-NEXT: s_mov_b64 exec, s[4:5] 557; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 558; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 559; GFX8-NEXT: v_mov_b32_e32 v2, v0 560; GFX8-NEXT: s_not_b64 exec, exec 561; GFX8-NEXT: v_mov_b32_e32 v2, 0 562; GFX8-NEXT: s_not_b64 exec, exec 563; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 564; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 565; GFX8-NEXT: s_nop 1 566; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 567; GFX8-NEXT: s_nop 1 568; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, 
v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 569; GFX8-NEXT: s_nop 1 570; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 571; GFX8-NEXT: s_nop 1 572; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 573; GFX8-NEXT: s_nop 1 574; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 575; GFX8-NEXT: v_readlane_b32 s6, v2, 63 576; GFX8-NEXT: s_nop 0 577; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 578; GFX8-NEXT: s_mov_b64 exec, s[4:5] 579; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 580; GFX8-NEXT: ; implicit-def: $vgpr0 581; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 582; GFX8-NEXT: s_cbranch_execz .LBB3_2 583; GFX8-NEXT: ; %bb.1: 584; GFX8-NEXT: s_load_dword s7, s[0:1], 0x44 585; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 586; GFX8-NEXT: v_mov_b32_e32 v0, s6 587; GFX8-NEXT: s_waitcnt lgkmcnt(0) 588; GFX8-NEXT: v_mov_b32_e32 v3, s7 589; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc 590; GFX8-NEXT: .LBB3_2: 591; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 592; GFX8-NEXT: s_waitcnt vmcnt(0) 593; GFX8-NEXT: v_readfirstlane_b32 s0, v0 594; GFX8-NEXT: v_mov_b32_e32 v0, v1 595; GFX8-NEXT: s_waitcnt lgkmcnt(0) 596; GFX8-NEXT: v_mov_b32_e32 v4, s3 597; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 598; GFX8-NEXT: v_mov_b32_e32 v3, s2 599; GFX8-NEXT: flat_store_dword v[3:4], v0 600; GFX8-NEXT: s_endpgm 601; 602; GFX9-LABEL: struct_add_i32_varying_vdata: 603; GFX9: ; %bb.0: ; %entry 604; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 605; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 606; GFX9-NEXT: v_mov_b32_e32 v1, 0 607; GFX9-NEXT: s_mov_b64 exec, s[4:5] 608; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 609; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 610; GFX9-NEXT: v_mov_b32_e32 v2, v0 611; GFX9-NEXT: s_not_b64 exec, exec 612; GFX9-NEXT: v_mov_b32_e32 v2, 0 613; GFX9-NEXT: s_not_b64 exec, exec 614; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 
615; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 616; GFX9-NEXT: s_nop 1 617; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 618; GFX9-NEXT: s_nop 1 619; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 620; GFX9-NEXT: s_nop 1 621; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 622; GFX9-NEXT: s_nop 1 623; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 624; GFX9-NEXT: s_nop 1 625; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 626; GFX9-NEXT: v_readlane_b32 s6, v2, 63 627; GFX9-NEXT: s_nop 0 628; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 629; GFX9-NEXT: s_mov_b64 exec, s[4:5] 630; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 631; GFX9-NEXT: ; implicit-def: $vgpr0 632; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 633; GFX9-NEXT: s_cbranch_execz .LBB3_2 634; GFX9-NEXT: ; %bb.1: 635; GFX9-NEXT: s_load_dword s7, s[0:1], 0x44 636; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 637; GFX9-NEXT: v_mov_b32_e32 v0, s6 638; GFX9-NEXT: s_waitcnt lgkmcnt(0) 639; GFX9-NEXT: v_mov_b32_e32 v3, s7 640; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc 641; GFX9-NEXT: .LBB3_2: 642; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 643; GFX9-NEXT: s_waitcnt vmcnt(0) 644; GFX9-NEXT: v_readfirstlane_b32 s0, v0 645; GFX9-NEXT: v_mov_b32_e32 v0, v1 646; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 647; GFX9-NEXT: v_mov_b32_e32 v3, 0 648; GFX9-NEXT: s_waitcnt lgkmcnt(0) 649; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 650; GFX9-NEXT: s_endpgm 651; 652; GFX10W64-LABEL: struct_add_i32_varying_vdata: 653; GFX10W64: ; %bb.0: ; %entry 654; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 655; GFX10W64-NEXT: s_not_b64 exec, exec 656; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 657; GFX10W64-NEXT: s_not_b64 exec, exec 658; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 659; GFX10W64-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 660; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 661; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 662; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 663; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 664; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 665; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 666; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 667; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 668; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 669; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 670; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 671; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 672; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 673; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 674; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 675; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 676; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 677; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 678; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 679; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 680; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 681; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 682; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 683; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 684; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 685; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 686; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 687; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 688; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 689; GFX10W64-NEXT: ; implicit-def: $vgpr0 690; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 691; GFX10W64-NEXT: s_cbranch_execz .LBB3_2 692; GFX10W64-NEXT: ; %bb.1: 693; GFX10W64-NEXT: s_clause 0x1 694; GFX10W64-NEXT: s_load_dword s7, s[0:1], 0x44 695; GFX10W64-NEXT: 
s_load_dwordx4 s[8:11], s[0:1], 0x34 696; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 697; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 698; GFX10W64-NEXT: v_mov_b32_e32 v4, s7 699; GFX10W64-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc 700; GFX10W64-NEXT: .LBB3_2: 701; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 702; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 703; GFX10W64-NEXT: s_waitcnt vmcnt(0) 704; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 705; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 706; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 707; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 708; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 709; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 710; GFX10W64-NEXT: s_endpgm 711; 712; GFX10W32-LABEL: struct_add_i32_varying_vdata: 713; GFX10W32: ; %bb.0: ; %entry 714; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 715; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 716; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 717; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 718; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 719; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 720; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 721; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 722; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 723; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 724; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 725; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 726; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 727; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 728; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 729; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 730; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 731; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 732; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 733; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 734; GFX10W32-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 735; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 736; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 737; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 738; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 739; GFX10W32-NEXT: ; implicit-def: $vgpr0 740; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 741; GFX10W32-NEXT: s_cbranch_execz .LBB3_2 742; GFX10W32-NEXT: ; %bb.1: 743; GFX10W32-NEXT: s_mov_b32 s5, s6 744; GFX10W32-NEXT: s_clause 0x1 745; GFX10W32-NEXT: s_load_dword s6, s[0:1], 0x44 746; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 747; GFX10W32-NEXT: v_mov_b32_e32 v0, s5 748; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 749; GFX10W32-NEXT: v_mov_b32_e32 v4, s6 750; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc 751; GFX10W32-NEXT: .LBB3_2: 752; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 753; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 754; GFX10W32-NEXT: s_waitcnt vmcnt(0) 755; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 756; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 757; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 758; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 759; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 760; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 761; GFX10W32-NEXT: s_endpgm 762entry: 763 %lane = call i32 @llvm.amdgcn.workitem.id.x() 764 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 %vindex, i32 0, i32 0, i32 0) 765 store i32 %old, i32 addrspace(1)* %out 766 ret void 767} 768
; add_i32_varying_offset -- raw buffer atomic add of the constant 1 where the
; third intrinsic operand is the workitem id (emitted as the `offen` VGPR
; address in the expected output). For every run line the expected code is a
; single plain `buffer_atomic_add ... offen glc` with no mbcnt / saveexec
; sequence around it, i.e. the atomic is left as-is when that operand varies
; per lane. The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing by hand.
769define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 770; GFX6-LABEL: add_i32_varying_offset: 771; GFX6: ; %bb.0: ; %entry 772; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 773; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 774; GFX6-NEXT: v_mov_b32_e32 v1, 1 775; GFX6-NEXT: s_waitcnt lgkmcnt(0) 776; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 777; GFX6-NEXT: s_mov_b32 s3, 0xf000 778; GFX6-NEXT: s_mov_b32 s2, -1 779; GFX6-NEXT: s_waitcnt vmcnt(0) 780; GFX6-NEXT:
buffer_store_dword v1, off, s[0:3], 0 781; GFX6-NEXT: s_endpgm 782; 783; GFX8-LABEL: add_i32_varying_offset: 784; GFX8: ; %bb.0: ; %entry 785; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 786; GFX8-NEXT: v_mov_b32_e32 v2, 1 787; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 788; GFX8-NEXT: s_waitcnt lgkmcnt(0) 789; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc 790; GFX8-NEXT: v_mov_b32_e32 v0, s0 791; GFX8-NEXT: v_mov_b32_e32 v1, s1 792; GFX8-NEXT: s_waitcnt vmcnt(0) 793; GFX8-NEXT: flat_store_dword v[0:1], v2 794; GFX8-NEXT: s_endpgm 795; 796; GFX9-LABEL: add_i32_varying_offset: 797; GFX9: ; %bb.0: ; %entry 798; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 799; GFX9-NEXT: v_mov_b32_e32 v1, 1 800; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 801; GFX9-NEXT: s_waitcnt lgkmcnt(0) 802; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 803; GFX9-NEXT: v_mov_b32_e32 v0, 0 804; GFX9-NEXT: s_waitcnt vmcnt(0) 805; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 806; GFX9-NEXT: s_endpgm 807; 808; GFX10-LABEL: add_i32_varying_offset: 809; GFX10: ; %bb.0: ; %entry 810; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 811; GFX10-NEXT: v_mov_b32_e32 v1, 1 812; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 813; GFX10-NEXT: s_waitcnt lgkmcnt(0) 814; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 815; GFX10-NEXT: v_mov_b32_e32 v0, 0 816; GFX10-NEXT: s_waitcnt vmcnt(0) 817; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 818; GFX10-NEXT: s_endpgm 819entry: 820 %lane = call i32 @llvm.amdgcn.workitem.id.x() 821 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) 822 store i32 %old, i32 addrspace(1)* %out 823 ret void 824} 825
; sub_i32_constant -- raw buffer atomic sub of the constant 5 with all other
; integer operands zero. Expected shape on every target: only the lane whose
; mbcnt result is 0 executes one `buffer_atomic_sub` whose data is
; bcnt(exec) * 5; afterwards each lane rebuilds its own result as
; readfirstlane(returned value) - 5 * mbcnt. The assertions below are
; autogenerated; regenerate them with utils/update_llc_test_checks.py.
826define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { 827; GFX6-LABEL: sub_i32_constant: 828; GFX6: ; %bb.0: ; %entry 829; GFX6-NEXT: s_mov_b64 s[2:3], exec 830; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 831; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64
v0, s2, 0 832; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 833; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 834; GFX6-NEXT: ; implicit-def: $vgpr1 835; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 836; GFX6-NEXT: s_cbranch_execz .LBB5_2 837; GFX6-NEXT: ; %bb.1: 838; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 839; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 840; GFX6-NEXT: s_mul_i32 s0, s0, 5 841; GFX6-NEXT: v_mov_b32_e32 v1, s0 842; GFX6-NEXT: s_waitcnt lgkmcnt(0) 843; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 844; GFX6-NEXT: .LBB5_2: 845; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 846; GFX6-NEXT: s_waitcnt vmcnt(0) 847; GFX6-NEXT: v_readfirstlane_b32 s0, v1 848; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0 849; GFX6-NEXT: s_mov_b32 s7, 0xf000 850; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 851; GFX6-NEXT: s_mov_b32 s6, -1 852; GFX6-NEXT: s_waitcnt lgkmcnt(0) 853; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 854; GFX6-NEXT: s_endpgm 855; 856; GFX8-LABEL: sub_i32_constant: 857; GFX8: ; %bb.0: ; %entry 858; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 859; GFX8-NEXT: s_mov_b64 s[6:7], exec 860; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 861; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 862; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 863; GFX8-NEXT: ; implicit-def: $vgpr1 864; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 865; GFX8-NEXT: s_cbranch_execz .LBB5_2 866; GFX8-NEXT: ; %bb.1: 867; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 868; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 869; GFX8-NEXT: s_mul_i32 s0, s0, 5 870; GFX8-NEXT: v_mov_b32_e32 v1, s0 871; GFX8-NEXT: s_waitcnt lgkmcnt(0) 872; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 873; GFX8-NEXT: .LBB5_2: 874; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 875; GFX8-NEXT: s_waitcnt vmcnt(0) 876; GFX8-NEXT: v_readfirstlane_b32 s0, v1 877; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 878; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 879; GFX8-NEXT: s_waitcnt lgkmcnt(0) 880; GFX8-NEXT: v_mov_b32_e32 v0, s2 881; GFX8-NEXT: v_mov_b32_e32 v1, s3
882; GFX8-NEXT: flat_store_dword v[0:1], v2 883; GFX8-NEXT: s_endpgm 884; 885; GFX9-LABEL: sub_i32_constant: 886; GFX9: ; %bb.0: ; %entry 887; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 888; GFX9-NEXT: s_mov_b64 s[6:7], exec 889; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 890; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 891; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 892; GFX9-NEXT: ; implicit-def: $vgpr1 893; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 894; GFX9-NEXT: s_cbranch_execz .LBB5_2 895; GFX9-NEXT: ; %bb.1: 896; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 897; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 898; GFX9-NEXT: s_mul_i32 s0, s0, 5 899; GFX9-NEXT: v_mov_b32_e32 v1, s0 900; GFX9-NEXT: s_waitcnt lgkmcnt(0) 901; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 902; GFX9-NEXT: .LBB5_2: 903; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 904; GFX9-NEXT: s_waitcnt vmcnt(0) 905; GFX9-NEXT: v_readfirstlane_b32 s0, v1 906; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 907; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 908; GFX9-NEXT: v_mov_b32_e32 v1, 0 909; GFX9-NEXT: s_waitcnt lgkmcnt(0) 910; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 911; GFX9-NEXT: s_endpgm 912; 913; GFX10W64-LABEL: sub_i32_constant: 914; GFX10W64: ; %bb.0: ; %entry 915; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 916; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 917; GFX10W64-NEXT: ; implicit-def: $vgpr1 918; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 919; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 920; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 921; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 922; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 923; GFX10W64-NEXT: ; %bb.1: 924; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 925; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 926; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 927; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 928; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 929; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 930; GFX10W64-NEXT: .LBB5_2: 931; GFX10W64-NEXT:
s_waitcnt_depctr 0xffe3 932; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 933; GFX10W64-NEXT: s_waitcnt vmcnt(0) 934; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 935; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 936; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 937; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 938; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 939; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 940; GFX10W64-NEXT: s_endpgm 941; 942; GFX10W32-LABEL: sub_i32_constant: 943; GFX10W32: ; %bb.0: ; %entry 944; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 945; GFX10W32-NEXT: s_mov_b32 s5, exec_lo 946; GFX10W32-NEXT: ; implicit-def: $vgpr1 947; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 948; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 949; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 950; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 951; GFX10W32-NEXT: ; %bb.1: 952; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 953; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 954; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 955; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 956; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 957; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 958; GFX10W32-NEXT: .LBB5_2: 959; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 960; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 961; GFX10W32-NEXT: s_waitcnt vmcnt(0) 962; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 963; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 964; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 965; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 966; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 967; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 968; GFX10W32-NEXT: s_endpgm 969entry: 970 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) 971 store i32 %old, i32 addrspace(1)* %out 972 ret void 973} 974
; sub_i32_uniform -- raw buffer atomic sub whose data operand is the kernel
; argument %subitive (the same value in every lane). Expected shape on every
; target: the lane whose mbcnt result is 0 issues one `buffer_atomic_sub` of
; %subitive * bcnt(exec) (s_mul_i32); each lane then computes
; readfirstlane(returned value) - %subitive * mbcnt (v_mul_lo_u32 + sub).
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py.
975define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) { 976; GFX6-LABEL: sub_i32_uniform: 977; GFX6: ; %bb.0: ; %entry 978; GFX6-NEXT: s_mov_b64 s[2:3], exec 979;
GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 980; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 981; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 982; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 983; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 984; GFX6-NEXT: ; implicit-def: $vgpr1 985; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 986; GFX6-NEXT: s_cbranch_execz .LBB6_2 987; GFX6-NEXT: ; %bb.1: 988; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd 989; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 990; GFX6-NEXT: s_waitcnt lgkmcnt(0) 991; GFX6-NEXT: s_mul_i32 s0, s8, s0 992; GFX6-NEXT: v_mov_b32_e32 v1, s0 993; GFX6-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 994; GFX6-NEXT: .LBB6_2: 995; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 996; GFX6-NEXT: s_waitcnt vmcnt(0) 997; GFX6-NEXT: v_readfirstlane_b32 s0, v1 998; GFX6-NEXT: s_waitcnt lgkmcnt(0) 999; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 1000; GFX6-NEXT: s_mov_b32 s7, 0xf000 1001; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1002; GFX6-NEXT: s_mov_b32 s6, -1 1003; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 1004; GFX6-NEXT: s_endpgm 1005; 1006; GFX8-LABEL: sub_i32_uniform: 1007; GFX8: ; %bb.0: ; %entry 1008; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1009; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 1010; GFX8-NEXT: s_mov_b64 s[4:5], exec 1011; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1012; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1013; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1014; GFX8-NEXT: ; implicit-def: $vgpr1 1015; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 1016; GFX8-NEXT: s_cbranch_execz .LBB6_2 1017; GFX8-NEXT: ; %bb.1: 1018; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 1019; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 1020; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1021; GFX8-NEXT: s_mul_i32 s0, s8, s0 1022; GFX8-NEXT: v_mov_b32_e32 v1, s0 1023; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1024; GFX8-NEXT: .LBB6_2: 1025; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 1026; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1027; GFX8-NEXT: v_mul_lo_u32 v0, s8,
v0 1028; GFX8-NEXT: s_waitcnt vmcnt(0) 1029; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1030; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 1031; GFX8-NEXT: v_mov_b32_e32 v0, s2 1032; GFX8-NEXT: v_mov_b32_e32 v1, s3 1033; GFX8-NEXT: flat_store_dword v[0:1], v2 1034; GFX8-NEXT: s_endpgm 1035; 1036; GFX9-LABEL: sub_i32_uniform: 1037; GFX9: ; %bb.0: ; %entry 1038; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1039; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 1040; GFX9-NEXT: s_mov_b64 s[4:5], exec 1041; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1042; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1043; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1044; GFX9-NEXT: ; implicit-def: $vgpr1 1045; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 1046; GFX9-NEXT: s_cbranch_execz .LBB6_2 1047; GFX9-NEXT: ; %bb.1: 1048; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 1049; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 1050; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1051; GFX9-NEXT: s_mul_i32 s0, s8, s0 1052; GFX9-NEXT: v_mov_b32_e32 v1, s0 1053; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1054; GFX9-NEXT: .LBB6_2: 1055; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] 1056; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1057; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 1058; GFX9-NEXT: s_waitcnt vmcnt(0) 1059; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1060; GFX9-NEXT: v_mov_b32_e32 v1, 0 1061; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1062; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 1063; GFX9-NEXT: s_endpgm 1064; 1065; GFX10W64-LABEL: sub_i32_uniform: 1066; GFX10W64: ; %bb.0: ; %entry 1067; GFX10W64-NEXT: s_clause 0x1 1068; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1069; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 1070; GFX10W64-NEXT: s_mov_b64 s[4:5], exec 1071; GFX10W64-NEXT: ; implicit-def: $vgpr1 1072; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1073; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1074; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1075; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc 1076; GFX10W64-NEXT: s_cbranch_execz .LBB6_2
1077; GFX10W64-NEXT: ; %bb.1: 1078; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 1079; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 1080; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1081; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 1082; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 1083; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1084; GFX10W64-NEXT: .LBB6_2: 1085; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1086; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] 1087; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1088; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 1089; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1090; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 1091; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1092; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1093; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 1094; GFX10W64-NEXT: s_endpgm 1095; 1096; GFX10W32-LABEL: sub_i32_uniform: 1097; GFX10W32: ; %bb.0: ; %entry 1098; GFX10W32-NEXT: s_clause 0x1 1099; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1100; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 1101; GFX10W32-NEXT: s_mov_b32 s6, exec_lo 1102; GFX10W32-NEXT: ; implicit-def: $vgpr1 1103; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1104; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1105; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo 1106; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 1107; GFX10W32-NEXT: ; %bb.1: 1108; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1109; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 1110; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1111; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 1112; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 1113; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1114; GFX10W32-NEXT: .LBB6_2: 1115; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1116; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 1117; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1118; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 1119; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1120; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 1121; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1122; GFX10W32-NEXT:
v_sub_nc_u32_e32 v0, s0, v0 1123; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 1124; GFX10W32-NEXT: s_endpgm 1125entry: 1126 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0) 1127 store i32 %old, i32 addrspace(1)* %out 1128 ret void 1129} 1130
; sub_i32_varying_vdata -- raw buffer atomic sub whose data operand is the
; workitem id, so it differs per lane. The GFX6 run expects a plain
; `buffer_atomic_sub v0, off, ... glc` with no reduction code. The GFX8, GFX9
; and GFX10 runs expect a DPP add scan over the data, one `buffer_atomic_sub`
; of the wave total (read with v_readlane from lane 63, or lane 31 in the
; wave32 run) issued by the lane whose mbcnt result is 0, and a per-lane
; result of readfirstlane(returned value) minus the shifted scan value.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py.
1131define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { 1132; GFX6-LABEL: sub_i32_varying_vdata: 1133; GFX6: ; %bb.0: ; %entry 1134; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1135; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1136; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1137; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 1138; GFX6-NEXT: s_mov_b32 s3, 0xf000 1139; GFX6-NEXT: s_mov_b32 s2, -1 1140; GFX6-NEXT: s_waitcnt vmcnt(0) 1141; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 1142; GFX6-NEXT: s_endpgm 1143; 1144; GFX8-LABEL: sub_i32_varying_vdata: 1145; GFX8: ; %bb.0: ; %entry 1146; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1147; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1148; GFX8-NEXT: v_mov_b32_e32 v1, 0 1149; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1150; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1151; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1152; GFX8-NEXT: v_mov_b32_e32 v2, v0 1153; GFX8-NEXT: s_not_b64 exec, exec 1154; GFX8-NEXT: v_mov_b32_e32 v2, 0 1155; GFX8-NEXT: s_not_b64 exec, exec 1156; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1157; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1158; GFX8-NEXT: s_nop 1 1159; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1160; GFX8-NEXT: s_nop 1 1161; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1162; GFX8-NEXT: s_nop 1 1163; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1164; GFX8-NEXT: s_nop 1 1165; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1166;
GFX8-NEXT: s_nop 1 1167; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1168; GFX8-NEXT: v_readlane_b32 s6, v2, 63 1169; GFX8-NEXT: s_nop 0 1170; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1171; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1172; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1173; GFX8-NEXT: ; implicit-def: $vgpr0 1174; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1175; GFX8-NEXT: s_cbranch_execz .LBB7_2 1176; GFX8-NEXT: ; %bb.1: 1177; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1178; GFX8-NEXT: v_mov_b32_e32 v0, s6 1179; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1180; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1181; GFX8-NEXT: .LBB7_2: 1182; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1183; GFX8-NEXT: s_waitcnt vmcnt(0) 1184; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1185; GFX8-NEXT: v_mov_b32_e32 v0, v1 1186; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1187; GFX8-NEXT: v_mov_b32_e32 v4, s3 1188; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1189; GFX8-NEXT: v_mov_b32_e32 v3, s2 1190; GFX8-NEXT: flat_store_dword v[3:4], v0 1191; GFX8-NEXT: s_endpgm 1192; 1193; GFX9-LABEL: sub_i32_varying_vdata: 1194; GFX9: ; %bb.0: ; %entry 1195; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1196; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1197; GFX9-NEXT: v_mov_b32_e32 v1, 0 1198; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1199; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1200; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1201; GFX9-NEXT: v_mov_b32_e32 v2, v0 1202; GFX9-NEXT: s_not_b64 exec, exec 1203; GFX9-NEXT: v_mov_b32_e32 v2, 0 1204; GFX9-NEXT: s_not_b64 exec, exec 1205; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1206; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1207; GFX9-NEXT: s_nop 1 1208; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1209; GFX9-NEXT: s_nop 1 1210; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1211; GFX9-NEXT:
s_nop 1 1212; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1213; GFX9-NEXT: s_nop 1 1214; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1215; GFX9-NEXT: s_nop 1 1216; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1217; GFX9-NEXT: v_readlane_b32 s6, v2, 63 1218; GFX9-NEXT: s_nop 0 1219; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1220; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1221; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1222; GFX9-NEXT: ; implicit-def: $vgpr0 1223; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1224; GFX9-NEXT: s_cbranch_execz .LBB7_2 1225; GFX9-NEXT: ; %bb.1: 1226; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1227; GFX9-NEXT: v_mov_b32_e32 v0, s6 1228; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1229; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1230; GFX9-NEXT: .LBB7_2: 1231; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1232; GFX9-NEXT: s_waitcnt vmcnt(0) 1233; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1234; GFX9-NEXT: v_mov_b32_e32 v0, v1 1235; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1236; GFX9-NEXT: v_mov_b32_e32 v3, 0 1237; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1238; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 1239; GFX9-NEXT: s_endpgm 1240; 1241; GFX10W64-LABEL: sub_i32_varying_vdata: 1242; GFX10W64: ; %bb.0: ; %entry 1243; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 1244; GFX10W64-NEXT: s_not_b64 exec, exec 1245; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1246; GFX10W64-NEXT: s_not_b64 exec, exec 1247; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 1248; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1249; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 1250; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1251; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1252; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
bound_ctrl:1 1253; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 1254; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1255; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1256; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 1257; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 1258; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1259; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 1260; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1261; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 1262; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1263; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1264; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 1265; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 1266; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1267; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1268; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1269; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 1270; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 1271; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 1272; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1273; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1274; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1275; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 1276; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1277; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1278; GFX10W64-NEXT: ; implicit-def: $vgpr0 1279; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1280; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 1281; GFX10W64-NEXT: ; %bb.1: 1282; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1283; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 1284; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1285; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1286; GFX10W64-NEXT: .LBB7_2: 1287; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1288; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 1289; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1290; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 1291; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 1292; GFX10W64-NEXT:
v_mov_b32_e32 v4, 0 1293; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1294; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1295; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 1296; GFX10W64-NEXT: s_endpgm 1297; 1298; GFX10W32-LABEL: sub_i32_varying_vdata: 1299; GFX10W32: ; %bb.0: ; %entry 1300; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 1301; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 1302; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1303; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 1304; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 1305; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1306; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1307; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1308; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1309; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 1310; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1311; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 1312; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1313; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 1314; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1315; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 1316; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 1317; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1318; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 1319; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 1320; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1321; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 1322; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 1323; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 1324; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1325; GFX10W32-NEXT: ; implicit-def: $vgpr0 1326; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1327; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 1328; GFX10W32-NEXT: ; %bb.1: 1329; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1330; GFX10W32-NEXT:
v_mov_b32_e32 v0, s6 1331; GFX10W32-NEXT: s_mov_b32 s5, s6 1332; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1333; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1334; GFX10W32-NEXT: .LBB7_2: 1335; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1336; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1337; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1338; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 1339; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 1340; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 1341; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1342; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1343; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 1344; GFX10W32-NEXT: s_endpgm 1345entry: 1346 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1347 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) 1348 store i32 %old, i32 addrspace(1)* %out 1349 ret void 1350} 1351
; sub_i32_varying_offset -- raw buffer atomic sub of the constant 1 where the
; third intrinsic operand is the workitem id (emitted as the `offen` VGPR
; address in the expected output). For every run line the expected code is a
; single plain `buffer_atomic_sub ... offen glc` with no mbcnt / saveexec
; sequence around it, i.e. the atomic is left as-is when that operand varies
; per lane. The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing by hand.
1352define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 1353; GFX6-LABEL: sub_i32_varying_offset: 1354; GFX6: ; %bb.0: ; %entry 1355; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1356; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1357; GFX6-NEXT: v_mov_b32_e32 v1, 1 1358; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1359; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 1360; GFX6-NEXT: s_mov_b32 s3, 0xf000 1361; GFX6-NEXT: s_mov_b32 s2, -1 1362; GFX6-NEXT: s_waitcnt vmcnt(0) 1363; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 1364; GFX6-NEXT: s_endpgm 1365; 1366; GFX8-LABEL: sub_i32_varying_offset: 1367; GFX8: ; %bb.0: ; %entry 1368; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1369; GFX8-NEXT: v_mov_b32_e32 v2, 1 1370; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1371; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1372; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc 1373; GFX8-NEXT: v_mov_b32_e32 v0, s0 1374; GFX8-NEXT: v_mov_b32_e32 v1, s1 1375; GFX8-NEXT: s_waitcnt vmcnt(0) 1376; GFX8-NEXT: flat_store_dword v[0:1], v2 1377; GFX8-NEXT: s_endpgm 1378; 1379;
GFX9-LABEL: sub_i32_varying_offset: 1380; GFX9: ; %bb.0: ; %entry 1381; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1382; GFX9-NEXT: v_mov_b32_e32 v1, 1 1383; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1384; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1385; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 1386; GFX9-NEXT: v_mov_b32_e32 v0, 0 1387; GFX9-NEXT: s_waitcnt vmcnt(0) 1388; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1389; GFX9-NEXT: s_endpgm 1390; 1391; GFX10-LABEL: sub_i32_varying_offset: 1392; GFX10: ; %bb.0: ; %entry 1393; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1394; GFX10-NEXT: v_mov_b32_e32 v1, 1 1395; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1396; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1397; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 1398; GFX10-NEXT: v_mov_b32_e32 v0, 0 1399; GFX10-NEXT: s_waitcnt vmcnt(0) 1400; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1401; GFX10-NEXT: s_endpgm 1402entry: 1403 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1404 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) 1405 store i32 %old, i32 addrspace(1)* %out 1406 ret void 1407} 1408