1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s 3; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg) 10declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32 immarg) 11declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32 immarg) 12 13; Show what the atomic optimization pass will do for raw buffer atomics (plus one struct buffer case, struct_add_i32_varying_vdata).
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { 16; GFX6-LABEL: add_i32_constant: 17; GFX6: ; %bb.0: ; %entry 18; GFX6-NEXT: s_mov_b64 s[2:3], exec 19; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 20; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 21; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 22; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 23; GFX6-NEXT: ; implicit-def: $vgpr1 24; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 25; GFX6-NEXT: s_cbranch_execz .LBB0_2 26; GFX6-NEXT: ; %bb.1: 27; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 28; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 29; GFX6-NEXT: s_mul_i32 s0, s0, 5 30; GFX6-NEXT: v_mov_b32_e32 v1, s0 31; GFX6-NEXT: s_waitcnt lgkmcnt(0) 32; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 33; GFX6-NEXT: .LBB0_2: 34; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 35; GFX6-NEXT: s_waitcnt vmcnt(0) 36; GFX6-NEXT: v_readfirstlane_b32 s0, v1 37; GFX6-NEXT: s_mov_b32 s7, 0xf000 38; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0 39; GFX6-NEXT: s_mov_b32 s6, -1 40; GFX6-NEXT: s_waitcnt lgkmcnt(0) 41; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 42; GFX6-NEXT: s_endpgm 43; 44; GFX8-LABEL: add_i32_constant: 45; GFX8: ; %bb.0: ; %entry 46; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 47; GFX8-NEXT: s_mov_b64 s[6:7], exec 48; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 49; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 50; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 51; GFX8-NEXT: ; implicit-def: $vgpr1 52; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 53; GFX8-NEXT: s_cbranch_execz .LBB0_2 54; GFX8-NEXT: ; %bb.1: 55; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 56; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 57; GFX8-NEXT: s_mul_i32 s0, s0, 5 58; GFX8-NEXT: v_mov_b32_e32 v1, s0 59; GFX8-NEXT: s_waitcnt lgkmcnt(0) 60; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 61; GFX8-NEXT: .LBB0_2: 62; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 63; GFX8-NEXT: s_waitcnt vmcnt(0) 64; GFX8-NEXT: v_readfirstlane_b32 s0, v1 65; 
GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0 66; GFX8-NEXT: s_waitcnt lgkmcnt(0) 67; GFX8-NEXT: v_mov_b32_e32 v0, s2 68; GFX8-NEXT: v_mov_b32_e32 v1, s3 69; GFX8-NEXT: flat_store_dword v[0:1], v2 70; GFX8-NEXT: s_endpgm 71; 72; GFX9-LABEL: add_i32_constant: 73; GFX9: ; %bb.0: ; %entry 74; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 75; GFX9-NEXT: s_mov_b64 s[6:7], exec 76; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 77; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 78; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 79; GFX9-NEXT: ; implicit-def: $vgpr1 80; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 81; GFX9-NEXT: s_cbranch_execz .LBB0_2 82; GFX9-NEXT: ; %bb.1: 83; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 84; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 85; GFX9-NEXT: s_mul_i32 s0, s0, 5 86; GFX9-NEXT: v_mov_b32_e32 v1, s0 87; GFX9-NEXT: s_waitcnt lgkmcnt(0) 88; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 89; GFX9-NEXT: .LBB0_2: 90; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 91; GFX9-NEXT: s_waitcnt vmcnt(0) 92; GFX9-NEXT: v_readfirstlane_b32 s0, v1 93; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 94; GFX9-NEXT: v_mov_b32_e32 v1, 0 95; GFX9-NEXT: s_waitcnt lgkmcnt(0) 96; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 97; GFX9-NEXT: s_endpgm 98; 99; GFX10W64-LABEL: add_i32_constant: 100; GFX10W64: ; %bb.0: ; %entry 101; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 102; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 103; GFX10W64-NEXT: ; implicit-def: $vgpr1 104; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 105; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 106; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 107; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 108; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 109; GFX10W64-NEXT: ; %bb.1: 110; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 111; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 112; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 113; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 114; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 115; GFX10W64-NEXT: buffer_atomic_add v1, off, 
s[8:11], 0 glc 116; GFX10W64-NEXT: .LBB0_2: 117; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 118; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 119; GFX10W64-NEXT: s_waitcnt vmcnt(0) 120; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 121; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 122; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 123; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 124; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 125; GFX10W64-NEXT: s_endpgm 126; 127; GFX10W32-LABEL: add_i32_constant: 128; GFX10W32: ; %bb.0: ; %entry 129; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 130; GFX10W32-NEXT: s_mov_b32 s5, exec_lo 131; GFX10W32-NEXT: ; implicit-def: $vgpr1 132; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 133; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 134; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 135; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 136; GFX10W32-NEXT: ; %bb.1: 137; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 138; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 139; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 140; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 141; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 142; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 143; GFX10W32-NEXT: .LBB0_2: 144; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 145; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 146; GFX10W32-NEXT: s_waitcnt vmcnt(0) 147; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 148; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 149; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 150; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 151; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 152; GFX10W32-NEXT: s_endpgm 153entry: 154 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) 155 store i32 %old, i32 addrspace(1)* %out 156 ret void 157} 158 159define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) { 160; GFX6-LABEL: add_i32_uniform: 161; GFX6: ; %bb.0: ; %entry 162; GFX6-NEXT: s_mov_b64 s[2:3], exec 163; GFX6-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x9 164; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 165; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 166; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 167; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 168; GFX6-NEXT: ; implicit-def: $vgpr1 169; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 170; GFX6-NEXT: s_cbranch_execz .LBB1_2 171; GFX6-NEXT: ; %bb.1: 172; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd 173; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 174; GFX6-NEXT: s_waitcnt lgkmcnt(0) 175; GFX6-NEXT: s_mul_i32 s0, s8, s0 176; GFX6-NEXT: v_mov_b32_e32 v1, s0 177; GFX6-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 178; GFX6-NEXT: .LBB1_2: 179; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 180; GFX6-NEXT: s_waitcnt vmcnt(0) 181; GFX6-NEXT: v_readfirstlane_b32 s0, v1 182; GFX6-NEXT: s_waitcnt lgkmcnt(0) 183; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 184; GFX6-NEXT: s_mov_b32 s7, 0xf000 185; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 186; GFX6-NEXT: s_mov_b32 s6, -1 187; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 188; GFX6-NEXT: s_endpgm 189; 190; GFX8-LABEL: add_i32_uniform: 191; GFX8: ; %bb.0: ; %entry 192; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 193; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 194; GFX8-NEXT: s_mov_b64 s[4:5], exec 195; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 196; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 197; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 198; GFX8-NEXT: ; implicit-def: $vgpr1 199; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 200; GFX8-NEXT: s_cbranch_execz .LBB1_2 201; GFX8-NEXT: ; %bb.1: 202; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 203; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 204; GFX8-NEXT: s_waitcnt lgkmcnt(0) 205; GFX8-NEXT: s_mul_i32 s0, s8, s0 206; GFX8-NEXT: v_mov_b32_e32 v1, s0 207; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 208; GFX8-NEXT: .LBB1_2: 209; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 210; GFX8-NEXT: s_waitcnt lgkmcnt(0) 211; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 212; GFX8-NEXT: s_waitcnt vmcnt(0) 213; GFX8-NEXT: 
v_readfirstlane_b32 s0, v1 214; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 215; GFX8-NEXT: v_mov_b32_e32 v0, s2 216; GFX8-NEXT: v_mov_b32_e32 v1, s3 217; GFX8-NEXT: flat_store_dword v[0:1], v2 218; GFX8-NEXT: s_endpgm 219; 220; GFX9-LABEL: add_i32_uniform: 221; GFX9: ; %bb.0: ; %entry 222; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 223; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 224; GFX9-NEXT: s_mov_b64 s[4:5], exec 225; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 226; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 227; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 228; GFX9-NEXT: ; implicit-def: $vgpr1 229; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 230; GFX9-NEXT: s_cbranch_execz .LBB1_2 231; GFX9-NEXT: ; %bb.1: 232; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 233; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 234; GFX9-NEXT: s_waitcnt lgkmcnt(0) 235; GFX9-NEXT: s_mul_i32 s0, s8, s0 236; GFX9-NEXT: v_mov_b32_e32 v1, s0 237; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 238; GFX9-NEXT: .LBB1_2: 239; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] 240; GFX9-NEXT: s_waitcnt lgkmcnt(0) 241; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 242; GFX9-NEXT: s_waitcnt vmcnt(0) 243; GFX9-NEXT: v_readfirstlane_b32 s0, v1 244; GFX9-NEXT: v_mov_b32_e32 v1, 0 245; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 246; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 247; GFX9-NEXT: s_endpgm 248; 249; GFX10W64-LABEL: add_i32_uniform: 250; GFX10W64: ; %bb.0: ; %entry 251; GFX10W64-NEXT: s_clause 0x1 252; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 253; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 254; GFX10W64-NEXT: s_mov_b64 s[4:5], exec 255; GFX10W64-NEXT: ; implicit-def: $vgpr1 256; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 257; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 258; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 259; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc 260; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 261; GFX10W64-NEXT: ; %bb.1: 262; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 263; GFX10W64-NEXT: 
s_bcnt1_i32_b64 s0, s[4:5] 264; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 265; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 266; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 267; GFX10W64-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 268; GFX10W64-NEXT: .LBB1_2: 269; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 270; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] 271; GFX10W64-NEXT: s_waitcnt vmcnt(0) 272; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 273; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 274; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], null, s8, v0, s[0:1] 275; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 276; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 277; GFX10W64-NEXT: s_endpgm 278; 279; GFX10W32-LABEL: add_i32_uniform: 280; GFX10W32: ; %bb.0: ; %entry 281; GFX10W32-NEXT: s_clause 0x1 282; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 283; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 284; GFX10W32-NEXT: s_mov_b32 s6, exec_lo 285; GFX10W32-NEXT: ; implicit-def: $vgpr1 286; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 287; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 288; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo 289; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 290; GFX10W32-NEXT: ; %bb.1: 291; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 292; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 293; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 294; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 295; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 296; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 297; GFX10W32-NEXT: .LBB1_2: 298; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 299; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 300; GFX10W32-NEXT: s_waitcnt vmcnt(0) 301; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 302; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 303; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], null, s4, v0, s[0:1] 304; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 305; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 306; GFX10W32-NEXT: s_endpgm 307entry: 308 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 
0, i32 0) 309 store i32 %old, i32 addrspace(1)* %out 310 ret void 311} 312 313define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { 314; GFX6-LABEL: add_i32_varying_vdata: 315; GFX6: ; %bb.0: ; %entry 316; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 317; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 318; GFX6-NEXT: s_waitcnt lgkmcnt(0) 319; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 320; GFX6-NEXT: s_mov_b32 s3, 0xf000 321; GFX6-NEXT: s_mov_b32 s2, -1 322; GFX6-NEXT: s_waitcnt vmcnt(0) 323; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 324; GFX6-NEXT: s_endpgm 325; 326; GFX8-LABEL: add_i32_varying_vdata: 327; GFX8: ; %bb.0: ; %entry 328; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 329; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 330; GFX8-NEXT: v_mov_b32_e32 v1, 0 331; GFX8-NEXT: s_mov_b64 exec, s[4:5] 332; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 333; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 334; GFX8-NEXT: v_mov_b32_e32 v2, v0 335; GFX8-NEXT: s_not_b64 exec, exec 336; GFX8-NEXT: v_mov_b32_e32 v2, 0 337; GFX8-NEXT: s_not_b64 exec, exec 338; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 339; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 340; GFX8-NEXT: s_nop 1 341; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 342; GFX8-NEXT: s_nop 1 343; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 344; GFX8-NEXT: s_nop 1 345; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 346; GFX8-NEXT: s_nop 1 347; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 348; GFX8-NEXT: s_nop 1 349; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 350; GFX8-NEXT: v_readlane_b32 s6, v2, 63 351; GFX8-NEXT: s_nop 0 352; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 353; GFX8-NEXT: 
s_mov_b64 exec, s[4:5] 354; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 355; GFX8-NEXT: ; implicit-def: $vgpr0 356; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 357; GFX8-NEXT: s_cbranch_execz .LBB2_2 358; GFX8-NEXT: ; %bb.1: 359; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 360; GFX8-NEXT: v_mov_b32_e32 v0, s6 361; GFX8-NEXT: s_waitcnt lgkmcnt(0) 362; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 363; GFX8-NEXT: .LBB2_2: 364; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 365; GFX8-NEXT: s_waitcnt vmcnt(0) 366; GFX8-NEXT: v_readfirstlane_b32 s0, v0 367; GFX8-NEXT: v_mov_b32_e32 v0, v1 368; GFX8-NEXT: s_waitcnt lgkmcnt(0) 369; GFX8-NEXT: v_mov_b32_e32 v4, s3 370; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 371; GFX8-NEXT: v_mov_b32_e32 v3, s2 372; GFX8-NEXT: flat_store_dword v[3:4], v0 373; GFX8-NEXT: s_endpgm 374; 375; GFX9-LABEL: add_i32_varying_vdata: 376; GFX9: ; %bb.0: ; %entry 377; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 378; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 379; GFX9-NEXT: v_mov_b32_e32 v1, 0 380; GFX9-NEXT: s_mov_b64 exec, s[4:5] 381; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 382; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 383; GFX9-NEXT: v_mov_b32_e32 v2, v0 384; GFX9-NEXT: s_not_b64 exec, exec 385; GFX9-NEXT: v_mov_b32_e32 v2, 0 386; GFX9-NEXT: s_not_b64 exec, exec 387; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 388; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 389; GFX9-NEXT: s_nop 1 390; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 391; GFX9-NEXT: s_nop 1 392; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 393; GFX9-NEXT: s_nop 1 394; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 395; GFX9-NEXT: s_nop 1 396; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 397; GFX9-NEXT: s_nop 1 398; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 
399; GFX9-NEXT: v_readlane_b32 s6, v2, 63 400; GFX9-NEXT: s_nop 0 401; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 402; GFX9-NEXT: s_mov_b64 exec, s[4:5] 403; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 404; GFX9-NEXT: ; implicit-def: $vgpr0 405; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 406; GFX9-NEXT: s_cbranch_execz .LBB2_2 407; GFX9-NEXT: ; %bb.1: 408; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 409; GFX9-NEXT: v_mov_b32_e32 v0, s6 410; GFX9-NEXT: s_waitcnt lgkmcnt(0) 411; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 412; GFX9-NEXT: .LBB2_2: 413; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 414; GFX9-NEXT: s_waitcnt vmcnt(0) 415; GFX9-NEXT: v_readfirstlane_b32 s0, v0 416; GFX9-NEXT: v_mov_b32_e32 v0, v1 417; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 418; GFX9-NEXT: v_mov_b32_e32 v3, 0 419; GFX9-NEXT: s_waitcnt lgkmcnt(0) 420; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 421; GFX9-NEXT: s_endpgm 422; 423; GFX10W64-LABEL: add_i32_varying_vdata: 424; GFX10W64: ; %bb.0: ; %entry 425; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 426; GFX10W64-NEXT: s_not_b64 exec, exec 427; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 428; GFX10W64-NEXT: s_not_b64 exec, exec 429; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 430; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 431; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 432; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 433; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 434; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 435; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 436; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 437; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 438; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 439; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 440; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 441; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 442; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 443; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 444; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 445; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 446; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 447; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 448; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 449; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 450; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 451; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 452; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 453; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 454; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 455; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 456; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 457; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 458; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 459; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 460; GFX10W64-NEXT: ; implicit-def: $vgpr0 461; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 462; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 463; GFX10W64-NEXT: ; %bb.1: 464; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 465; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 466; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 467; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 468; GFX10W64-NEXT: .LBB2_2: 469; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 470; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 471; GFX10W64-NEXT: s_waitcnt vmcnt(0) 472; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 473; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 474; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 475; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 476; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 477; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 478; GFX10W64-NEXT: s_endpgm 479; 480; GFX10W32-LABEL: add_i32_varying_vdata: 481; GFX10W32: ; %bb.0: ; %entry 482; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 483; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 484; GFX10W32-NEXT: 
v_mov_b32_e32 v1, 0 485; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 486; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 487; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 488; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 489; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 490; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 491; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 492; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 493; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 494; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 495; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 496; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 497; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 498; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 499; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 500; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 501; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 502; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 503; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 504; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 505; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 506; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 507; GFX10W32-NEXT: ; implicit-def: $vgpr0 508; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 509; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 510; GFX10W32-NEXT: ; %bb.1: 511; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 512; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 513; GFX10W32-NEXT: s_mov_b32 s5, s6 514; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 515; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 516; GFX10W32-NEXT: .LBB2_2: 517; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 518; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 519; GFX10W32-NEXT: s_waitcnt vmcnt(0) 520; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 521; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 522; 
GFX10W32-NEXT: v_mov_b32_e32 v4, 0 523; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 524; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 525; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 526; GFX10W32-NEXT: s_endpgm 527entry: 528 %lane = call i32 @llvm.amdgcn.workitem.id.x() 529 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) 530 store i32 %old, i32 addrspace(1)* %out 531 ret void 532} 533 534define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %vindex) { 535; GFX6-LABEL: struct_add_i32_varying_vdata: 536; GFX6: ; %bb.0: ; %entry 537; GFX6-NEXT: s_load_dword s2, s[0:1], 0x11 538; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 539; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 540; GFX6-NEXT: s_waitcnt lgkmcnt(0) 541; GFX6-NEXT: v_mov_b32_e32 v1, s2 542; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc 543; GFX6-NEXT: s_mov_b32 s3, 0xf000 544; GFX6-NEXT: s_mov_b32 s2, -1 545; GFX6-NEXT: s_waitcnt vmcnt(0) 546; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 547; GFX6-NEXT: s_endpgm 548; 549; GFX8-LABEL: struct_add_i32_varying_vdata: 550; GFX8: ; %bb.0: ; %entry 551; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 552; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 553; GFX8-NEXT: v_mov_b32_e32 v1, 0 554; GFX8-NEXT: s_mov_b64 exec, s[4:5] 555; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 556; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 557; GFX8-NEXT: v_mov_b32_e32 v2, v0 558; GFX8-NEXT: s_not_b64 exec, exec 559; GFX8-NEXT: v_mov_b32_e32 v2, 0 560; GFX8-NEXT: s_not_b64 exec, exec 561; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 562; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 563; GFX8-NEXT: s_nop 1 564; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 565; GFX8-NEXT: s_nop 1 566; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 567; GFX8-NEXT: 
s_nop 1 568; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 569; GFX8-NEXT: s_nop 1 570; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 571; GFX8-NEXT: s_nop 1 572; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 573; GFX8-NEXT: v_readlane_b32 s6, v2, 63 574; GFX8-NEXT: s_nop 0 575; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 576; GFX8-NEXT: s_mov_b64 exec, s[4:5] 577; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 578; GFX8-NEXT: ; implicit-def: $vgpr0 579; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 580; GFX8-NEXT: s_cbranch_execz .LBB3_2 581; GFX8-NEXT: ; %bb.1: 582; GFX8-NEXT: s_load_dword s7, s[0:1], 0x44 583; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 584; GFX8-NEXT: v_mov_b32_e32 v0, s6 585; GFX8-NEXT: s_waitcnt lgkmcnt(0) 586; GFX8-NEXT: v_mov_b32_e32 v3, s7 587; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc 588; GFX8-NEXT: .LBB3_2: 589; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 590; GFX8-NEXT: s_waitcnt vmcnt(0) 591; GFX8-NEXT: v_readfirstlane_b32 s0, v0 592; GFX8-NEXT: v_mov_b32_e32 v0, v1 593; GFX8-NEXT: s_waitcnt lgkmcnt(0) 594; GFX8-NEXT: v_mov_b32_e32 v4, s3 595; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 596; GFX8-NEXT: v_mov_b32_e32 v3, s2 597; GFX8-NEXT: flat_store_dword v[3:4], v0 598; GFX8-NEXT: s_endpgm 599; 600; GFX9-LABEL: struct_add_i32_varying_vdata: 601; GFX9: ; %bb.0: ; %entry 602; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 603; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 604; GFX9-NEXT: v_mov_b32_e32 v1, 0 605; GFX9-NEXT: s_mov_b64 exec, s[4:5] 606; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 607; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 608; GFX9-NEXT: v_mov_b32_e32 v2, v0 609; GFX9-NEXT: s_not_b64 exec, exec 610; GFX9-NEXT: v_mov_b32_e32 v2, 0 611; GFX9-NEXT: s_not_b64 exec, exec 612; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 613; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 614; GFX9-NEXT: s_nop 1 615; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 616; GFX9-NEXT: s_nop 1 617; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 618; GFX9-NEXT: s_nop 1 619; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 620; GFX9-NEXT: s_nop 1 621; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 622; GFX9-NEXT: s_nop 1 623; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 624; GFX9-NEXT: v_readlane_b32 s6, v2, 63 625; GFX9-NEXT: s_nop 0 626; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 627; GFX9-NEXT: s_mov_b64 exec, s[4:5] 628; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 629; GFX9-NEXT: ; implicit-def: $vgpr0 630; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 631; GFX9-NEXT: s_cbranch_execz .LBB3_2 632; GFX9-NEXT: ; %bb.1: 633; GFX9-NEXT: s_load_dword s7, s[0:1], 0x44 634; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 635; GFX9-NEXT: v_mov_b32_e32 v0, s6 636; GFX9-NEXT: s_waitcnt lgkmcnt(0) 637; GFX9-NEXT: v_mov_b32_e32 v3, s7 638; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc 639; GFX9-NEXT: .LBB3_2: 640; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 641; GFX9-NEXT: s_waitcnt vmcnt(0) 642; GFX9-NEXT: v_readfirstlane_b32 s0, v0 643; GFX9-NEXT: v_mov_b32_e32 v0, v1 644; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 645; GFX9-NEXT: v_mov_b32_e32 v3, 0 646; GFX9-NEXT: s_waitcnt lgkmcnt(0) 647; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 648; GFX9-NEXT: s_endpgm 649; 650; GFX10W64-LABEL: struct_add_i32_varying_vdata: 651; GFX10W64: ; %bb.0: ; %entry 652; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 653; GFX10W64-NEXT: s_not_b64 exec, exec 654; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 655; GFX10W64-NEXT: s_not_b64 exec, exec 656; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 657; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 658; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 659; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 660; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 661; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 662; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 663; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 664; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 665; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 666; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 667; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 668; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 669; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 670; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 671; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 672; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 673; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 674; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 675; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 676; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 677; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 678; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 679; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 680; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 681; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 682; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 683; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 684; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 685; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 686; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 687; GFX10W64-NEXT: ; implicit-def: $vgpr0 688; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 689; GFX10W64-NEXT: s_cbranch_execz .LBB3_2 690; GFX10W64-NEXT: ; %bb.1: 691; GFX10W64-NEXT: s_clause 0x1 692; GFX10W64-NEXT: s_load_dword s7, s[0:1], 0x44 693; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 694; GFX10W64-NEXT: v_mov_b32_e32 v0, 
s6 695; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 696; GFX10W64-NEXT: v_mov_b32_e32 v4, s7 697; GFX10W64-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc 698; GFX10W64-NEXT: .LBB3_2: 699; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 700; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 701; GFX10W64-NEXT: s_waitcnt vmcnt(0) 702; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 703; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 704; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 705; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 706; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 707; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 708; GFX10W64-NEXT: s_endpgm 709; 710; GFX10W32-LABEL: struct_add_i32_varying_vdata: 711; GFX10W32: ; %bb.0: ; %entry 712; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 713; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 714; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 715; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 716; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 717; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 718; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 719; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 720; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 721; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 722; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 723; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 724; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 725; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 726; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 727; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 728; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 729; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 730; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 731; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 732; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 733; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 
734; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 735; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 736; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 737; GFX10W32-NEXT: ; implicit-def: $vgpr0 738; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 739; GFX10W32-NEXT: s_cbranch_execz .LBB3_2 740; GFX10W32-NEXT: ; %bb.1: 741; GFX10W32-NEXT: s_mov_b32 s5, s6 742; GFX10W32-NEXT: s_clause 0x1 743; GFX10W32-NEXT: s_load_dword s6, s[0:1], 0x44 744; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 745; GFX10W32-NEXT: v_mov_b32_e32 v0, s5 746; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 747; GFX10W32-NEXT: v_mov_b32_e32 v4, s6 748; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc 749; GFX10W32-NEXT: .LBB3_2: 750; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 751; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 752; GFX10W32-NEXT: s_waitcnt vmcnt(0) 753; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 754; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 755; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 756; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 757; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 758; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 759; GFX10W32-NEXT: s_endpgm 760entry: 761 %lane = call i32 @llvm.amdgcn.workitem.id.x() 762 %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 %vindex, i32 0, i32 0, i32 0) 763 store i32 %old, i32 addrspace(1)* %out 764 ret void 765} 766; add_i32_varying_offset: the added value is the constant 1 but the voffset operand is the per-lane workitem id. Every prefix below expects a single buffer_atomic_add with offen and no mbcnt/readfirstlane sequence, i.e. the atomic is emitted unchanged. 767define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 768; GFX6-LABEL: add_i32_varying_offset: 769; GFX6: ; %bb.0: ; %entry 770; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 771; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 772; GFX6-NEXT: v_mov_b32_e32 v1, 1 773; GFX6-NEXT: s_waitcnt lgkmcnt(0) 774; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 775; GFX6-NEXT: s_mov_b32 s3, 0xf000 776; GFX6-NEXT: s_mov_b32 s2, -1 777; GFX6-NEXT: s_waitcnt vmcnt(0) 778; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 779; GFX6-NEXT: s_endpgm 780; 781; GFX8-LABEL: 
add_i32_varying_offset: 782; GFX8: ; %bb.0: ; %entry 783; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 784; GFX8-NEXT: v_mov_b32_e32 v2, 1 785; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 786; GFX8-NEXT: s_waitcnt lgkmcnt(0) 787; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc 788; GFX8-NEXT: v_mov_b32_e32 v0, s0 789; GFX8-NEXT: v_mov_b32_e32 v1, s1 790; GFX8-NEXT: s_waitcnt vmcnt(0) 791; GFX8-NEXT: flat_store_dword v[0:1], v2 792; GFX8-NEXT: s_endpgm 793; 794; GFX9-LABEL: add_i32_varying_offset: 795; GFX9: ; %bb.0: ; %entry 796; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 797; GFX9-NEXT: v_mov_b32_e32 v1, 1 798; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 799; GFX9-NEXT: s_waitcnt lgkmcnt(0) 800; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 801; GFX9-NEXT: v_mov_b32_e32 v0, 0 802; GFX9-NEXT: s_waitcnt vmcnt(0) 803; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 804; GFX9-NEXT: s_endpgm 805; 806; GFX10-LABEL: add_i32_varying_offset: 807; GFX10: ; %bb.0: ; %entry 808; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 809; GFX10-NEXT: v_mov_b32_e32 v1, 1 810; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 811; GFX10-NEXT: s_waitcnt lgkmcnt(0) 812; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 813; GFX10-NEXT: v_mov_b32_e32 v0, 0 814; GFX10-NEXT: s_waitcnt vmcnt(0) 815; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 816; GFX10-NEXT: s_endpgm 817entry: 818 %lane = call i32 @llvm.amdgcn.workitem.id.x() 819 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) 820 store i32 %old, i32 addrspace(1)* %out 821 ret void 822} 823; sub_i32_constant: subtracts the constant 5 with constant offsets. The checks expect one buffer_atomic_sub of s_bcnt1(exec) * 5 issued only by the lane whose mbcnt is 0, after which each lane computes readfirstlane(old) - 5 * mbcnt. 824define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { 825; GFX6-LABEL: sub_i32_constant: 826; GFX6: ; %bb.0: ; %entry 827; GFX6-NEXT: s_mov_b64 s[2:3], exec 828; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 829; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 830; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 831; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 832; GFX6-NEXT: ; implicit-def: $vgpr1 833; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 834; GFX6-NEXT: s_cbranch_execz .LBB5_2 835; GFX6-NEXT: ; %bb.1: 836; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 837; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 838; GFX6-NEXT: s_mul_i32 s0, s0, 5 839; GFX6-NEXT: v_mov_b32_e32 v1, s0 840; GFX6-NEXT: s_waitcnt lgkmcnt(0) 841; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 842; GFX6-NEXT: .LBB5_2: 843; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 844; GFX6-NEXT: s_waitcnt vmcnt(0) 845; GFX6-NEXT: v_readfirstlane_b32 s0, v1 846; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0 847; GFX6-NEXT: s_mov_b32 s7, 0xf000 848; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 849; GFX6-NEXT: s_mov_b32 s6, -1 850; GFX6-NEXT: s_waitcnt lgkmcnt(0) 851; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 852; GFX6-NEXT: s_endpgm 853; 854; GFX8-LABEL: sub_i32_constant: 855; GFX8: ; %bb.0: ; %entry 856; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 857; GFX8-NEXT: s_mov_b64 s[6:7], exec 858; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 859; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 860; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 861; GFX8-NEXT: ; implicit-def: $vgpr1 862; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 863; GFX8-NEXT: s_cbranch_execz .LBB5_2 864; GFX8-NEXT: ; %bb.1: 865; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 866; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 867; GFX8-NEXT: s_mul_i32 s0, s0, 5 868; GFX8-NEXT: v_mov_b32_e32 v1, s0 869; GFX8-NEXT: s_waitcnt lgkmcnt(0) 870; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 871; GFX8-NEXT: .LBB5_2: 872; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 873; GFX8-NEXT: s_waitcnt vmcnt(0) 874; GFX8-NEXT: v_readfirstlane_b32 s0, v1 875; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 876; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 877; GFX8-NEXT: s_waitcnt lgkmcnt(0) 878; GFX8-NEXT: v_mov_b32_e32 v0, s2 879; GFX8-NEXT: v_mov_b32_e32 v1, s3 880; GFX8-NEXT: flat_store_dword v[0:1], v2 881; GFX8-NEXT: s_endpgm 882; 
883; GFX9-LABEL: sub_i32_constant: 884; GFX9: ; %bb.0: ; %entry 885; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 886; GFX9-NEXT: s_mov_b64 s[6:7], exec 887; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 888; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 889; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 890; GFX9-NEXT: ; implicit-def: $vgpr1 891; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 892; GFX9-NEXT: s_cbranch_execz .LBB5_2 893; GFX9-NEXT: ; %bb.1: 894; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 895; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 896; GFX9-NEXT: s_mul_i32 s0, s0, 5 897; GFX9-NEXT: v_mov_b32_e32 v1, s0 898; GFX9-NEXT: s_waitcnt lgkmcnt(0) 899; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 900; GFX9-NEXT: .LBB5_2: 901; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 902; GFX9-NEXT: s_waitcnt vmcnt(0) 903; GFX9-NEXT: v_readfirstlane_b32 s0, v1 904; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 905; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 906; GFX9-NEXT: v_mov_b32_e32 v1, 0 907; GFX9-NEXT: s_waitcnt lgkmcnt(0) 908; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 909; GFX9-NEXT: s_endpgm 910; 911; GFX10W64-LABEL: sub_i32_constant: 912; GFX10W64: ; %bb.0: ; %entry 913; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 914; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 915; GFX10W64-NEXT: ; implicit-def: $vgpr1 916; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 917; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 918; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 919; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 920; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 921; GFX10W64-NEXT: ; %bb.1: 922; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 923; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 924; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 925; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 926; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 927; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 928; GFX10W64-NEXT: .LBB5_2: 929; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 930; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 931; 
GFX10W64-NEXT: s_waitcnt vmcnt(0) 932; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 933; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 934; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 935; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 936; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 937; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 938; GFX10W64-NEXT: s_endpgm 939; 940; GFX10W32-LABEL: sub_i32_constant: 941; GFX10W32: ; %bb.0: ; %entry 942; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 943; GFX10W32-NEXT: s_mov_b32 s5, exec_lo 944; GFX10W32-NEXT: ; implicit-def: $vgpr1 945; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 946; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 947; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 948; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 949; GFX10W32-NEXT: ; %bb.1: 950; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 951; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 952; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 953; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 954; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 955; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 956; GFX10W32-NEXT: .LBB5_2: 957; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 958; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 959; GFX10W32-NEXT: s_waitcnt vmcnt(0) 960; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 961; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 962; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 963; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 964; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 965; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 966; GFX10W32-NEXT: s_endpgm 967entry: 968 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) 969 store i32 %old, i32 addrspace(1)* %out 970 ret void 971} 972; sub_i32_uniform: the subtracted value is the kernel argument %subitive, identical in every lane. The checks expect one buffer_atomic_sub of the loaded SGPR multiplied by s_bcnt1(exec), issued only by the lane whose mbcnt is 0, after which each lane computes readfirstlane(old) - value * mbcnt using v_mul_lo_u32. 973define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) { 974; GFX6-LABEL: sub_i32_uniform: 975; GFX6: ; %bb.0: ; %entry 976; GFX6-NEXT: s_mov_b64 s[2:3], exec 977; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 978; GFX6-NEXT: s_load_dword 
s8, s[0:1], 0x11 979; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 980; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 981; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 982; GFX6-NEXT: ; implicit-def: $vgpr1 983; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 984; GFX6-NEXT: s_cbranch_execz .LBB6_2 985; GFX6-NEXT: ; %bb.1: 986; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd 987; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 988; GFX6-NEXT: s_waitcnt lgkmcnt(0) 989; GFX6-NEXT: s_mul_i32 s0, s8, s0 990; GFX6-NEXT: v_mov_b32_e32 v1, s0 991; GFX6-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 992; GFX6-NEXT: .LBB6_2: 993; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 994; GFX6-NEXT: s_waitcnt vmcnt(0) 995; GFX6-NEXT: v_readfirstlane_b32 s0, v1 996; GFX6-NEXT: s_waitcnt lgkmcnt(0) 997; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 998; GFX6-NEXT: s_mov_b32 s7, 0xf000 999; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1000; GFX6-NEXT: s_mov_b32 s6, -1 1001; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 1002; GFX6-NEXT: s_endpgm 1003; 1004; GFX8-LABEL: sub_i32_uniform: 1005; GFX8: ; %bb.0: ; %entry 1006; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1007; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 1008; GFX8-NEXT: s_mov_b64 s[4:5], exec 1009; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1010; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1011; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1012; GFX8-NEXT: ; implicit-def: $vgpr1 1013; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 1014; GFX8-NEXT: s_cbranch_execz .LBB6_2 1015; GFX8-NEXT: ; %bb.1: 1016; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 1017; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 1018; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1019; GFX8-NEXT: s_mul_i32 s0, s8, s0 1020; GFX8-NEXT: v_mov_b32_e32 v1, s0 1021; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1022; GFX8-NEXT: .LBB6_2: 1023; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 1024; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1025; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 1026; GFX8-NEXT: s_waitcnt vmcnt(0) 1027; GFX8-NEXT: v_readfirstlane_b32 
s0, v1 1028; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 1029; GFX8-NEXT: v_mov_b32_e32 v0, s2 1030; GFX8-NEXT: v_mov_b32_e32 v1, s3 1031; GFX8-NEXT: flat_store_dword v[0:1], v2 1032; GFX8-NEXT: s_endpgm 1033; 1034; GFX9-LABEL: sub_i32_uniform: 1035; GFX9: ; %bb.0: ; %entry 1036; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1037; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 1038; GFX9-NEXT: s_mov_b64 s[4:5], exec 1039; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1040; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1041; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1042; GFX9-NEXT: ; implicit-def: $vgpr1 1043; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 1044; GFX9-NEXT: s_cbranch_execz .LBB6_2 1045; GFX9-NEXT: ; %bb.1: 1046; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 1047; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 1048; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1049; GFX9-NEXT: s_mul_i32 s0, s8, s0 1050; GFX9-NEXT: v_mov_b32_e32 v1, s0 1051; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1052; GFX9-NEXT: .LBB6_2: 1053; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] 1054; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1055; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 1056; GFX9-NEXT: s_waitcnt vmcnt(0) 1057; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1058; GFX9-NEXT: v_mov_b32_e32 v1, 0 1059; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1060; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 1061; GFX9-NEXT: s_endpgm 1062; 1063; GFX10W64-LABEL: sub_i32_uniform: 1064; GFX10W64: ; %bb.0: ; %entry 1065; GFX10W64-NEXT: s_clause 0x1 1066; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1067; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 1068; GFX10W64-NEXT: s_mov_b64 s[4:5], exec 1069; GFX10W64-NEXT: ; implicit-def: $vgpr1 1070; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1071; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1072; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1073; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc 1074; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 1075; GFX10W64-NEXT: ; %bb.1: 1076; GFX10W64-NEXT: s_load_dwordx4 s[12:15], 
s[0:1], 0x34 1077; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 1078; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1079; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 1080; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 1081; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1082; GFX10W64-NEXT: .LBB6_2: 1083; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1084; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] 1085; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1086; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 1087; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1088; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 1089; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1090; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1091; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 1092; GFX10W64-NEXT: s_endpgm 1093; 1094; GFX10W32-LABEL: sub_i32_uniform: 1095; GFX10W32: ; %bb.0: ; %entry 1096; GFX10W32-NEXT: s_clause 0x1 1097; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1098; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 1099; GFX10W32-NEXT: s_mov_b32 s6, exec_lo 1100; GFX10W32-NEXT: ; implicit-def: $vgpr1 1101; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1102; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1103; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo 1104; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 1105; GFX10W32-NEXT: ; %bb.1: 1106; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1107; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 1108; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1109; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 1110; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 1111; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1112; GFX10W32-NEXT: .LBB6_2: 1113; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1114; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 1115; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1116; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 1117; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1118; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 1119; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1120; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1121; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 
1122; GFX10W32-NEXT: s_endpgm 1123entry: 1124 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0) 1125 store i32 %old, i32 addrspace(1)* %out 1126 ret void 1127} 1128; sub_i32_varying_vdata: the subtracted value is the per-lane workitem id. The GFX6 checks expect the atomic emitted unchanged; the GFX8, GFX9 and GFX10 checks expect a DPP add scan across the wave, one buffer_atomic_sub of the wave total issued only by the lane whose mbcnt is 0, and then readfirstlane(old) minus the per-lane scan value. 1129define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { 1130; GFX6-LABEL: sub_i32_varying_vdata: 1131; GFX6: ; %bb.0: ; %entry 1132; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1133; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1134; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1135; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 1136; GFX6-NEXT: s_mov_b32 s3, 0xf000 1137; GFX6-NEXT: s_mov_b32 s2, -1 1138; GFX6-NEXT: s_waitcnt vmcnt(0) 1139; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 1140; GFX6-NEXT: s_endpgm 1141; 1142; GFX8-LABEL: sub_i32_varying_vdata: 1143; GFX8: ; %bb.0: ; %entry 1144; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1145; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1146; GFX8-NEXT: v_mov_b32_e32 v1, 0 1147; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1148; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1149; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1150; GFX8-NEXT: v_mov_b32_e32 v2, v0 1151; GFX8-NEXT: s_not_b64 exec, exec 1152; GFX8-NEXT: v_mov_b32_e32 v2, 0 1153; GFX8-NEXT: s_not_b64 exec, exec 1154; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1155; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1156; GFX8-NEXT: s_nop 1 1157; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1158; GFX8-NEXT: s_nop 1 1159; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1160; GFX8-NEXT: s_nop 1 1161; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1162; GFX8-NEXT: s_nop 1 1163; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1164; GFX8-NEXT: s_nop 1 1165; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 
row_mask:0xc bank_mask:0xf 1166; GFX8-NEXT: v_readlane_b32 s6, v2, 63 1167; GFX8-NEXT: s_nop 0 1168; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1169; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1170; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1171; GFX8-NEXT: ; implicit-def: $vgpr0 1172; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1173; GFX8-NEXT: s_cbranch_execz .LBB7_2 1174; GFX8-NEXT: ; %bb.1: 1175; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1176; GFX8-NEXT: v_mov_b32_e32 v0, s6 1177; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1178; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1179; GFX8-NEXT: .LBB7_2: 1180; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1181; GFX8-NEXT: s_waitcnt vmcnt(0) 1182; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1183; GFX8-NEXT: v_mov_b32_e32 v0, v1 1184; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1185; GFX8-NEXT: v_mov_b32_e32 v4, s3 1186; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1187; GFX8-NEXT: v_mov_b32_e32 v3, s2 1188; GFX8-NEXT: flat_store_dword v[3:4], v0 1189; GFX8-NEXT: s_endpgm 1190; 1191; GFX9-LABEL: sub_i32_varying_vdata: 1192; GFX9: ; %bb.0: ; %entry 1193; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1194; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1195; GFX9-NEXT: v_mov_b32_e32 v1, 0 1196; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1197; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1198; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1199; GFX9-NEXT: v_mov_b32_e32 v2, v0 1200; GFX9-NEXT: s_not_b64 exec, exec 1201; GFX9-NEXT: v_mov_b32_e32 v2, 0 1202; GFX9-NEXT: s_not_b64 exec, exec 1203; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1204; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1205; GFX9-NEXT: s_nop 1 1206; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1207; GFX9-NEXT: s_nop 1 1208; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1209; GFX9-NEXT: s_nop 1 1210; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 1211; GFX9-NEXT: s_nop 1 1212; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1213; GFX9-NEXT: s_nop 1 1214; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1215; GFX9-NEXT: v_readlane_b32 s6, v2, 63 1216; GFX9-NEXT: s_nop 0 1217; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1218; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1219; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1220; GFX9-NEXT: ; implicit-def: $vgpr0 1221; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1222; GFX9-NEXT: s_cbranch_execz .LBB7_2 1223; GFX9-NEXT: ; %bb.1: 1224; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1225; GFX9-NEXT: v_mov_b32_e32 v0, s6 1226; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1227; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1228; GFX9-NEXT: .LBB7_2: 1229; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1230; GFX9-NEXT: s_waitcnt vmcnt(0) 1231; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1232; GFX9-NEXT: v_mov_b32_e32 v0, v1 1233; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1234; GFX9-NEXT: v_mov_b32_e32 v3, 0 1235; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1236; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 1237; GFX9-NEXT: s_endpgm 1238; 1239; GFX10W64-LABEL: sub_i32_varying_vdata: 1240; GFX10W64: ; %bb.0: ; %entry 1241; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 1242; GFX10W64-NEXT: s_not_b64 exec, exec 1243; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1244; GFX10W64-NEXT: s_not_b64 exec, exec 1245; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 1246; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1247; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 1248; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1249; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1250; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1251; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 1252; GFX10W64-NEXT: 
v_permlanex16_b32 v2, v2, -1, -1 1253; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1254; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 1255; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 1256; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1257; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 1258; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1259; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 1260; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1261; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1262; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 1263; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 1264; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1265; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1266; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1267; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 1268; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 1269; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 1270; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1271; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1272; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1273; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 1274; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1275; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1276; GFX10W64-NEXT: ; implicit-def: $vgpr0 1277; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1278; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 1279; GFX10W64-NEXT: ; %bb.1: 1280; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1281; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 1282; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1283; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1284; GFX10W64-NEXT: .LBB7_2: 1285; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1286; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 1287; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1288; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 1289; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 1290; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 1291; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1292; 
GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1293; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 1294; GFX10W64-NEXT: s_endpgm 1295; 1296; GFX10W32-LABEL: sub_i32_varying_vdata: 1297; GFX10W32: ; %bb.0: ; %entry 1298; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 1299; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 1300; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1301; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 1302; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 1303; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1304; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1305; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1306; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1307; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 1308; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1309; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 1310; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1311; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 1312; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1313; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 1314; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 1315; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1316; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 1317; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 1318; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1319; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 1320; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 1321; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 1322; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1323; GFX10W32-NEXT: ; implicit-def: $vgpr0 1324; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1325; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 1326; GFX10W32-NEXT: ; %bb.1: 1327; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1328; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 1329; GFX10W32-NEXT: s_mov_b32 s5, s6 1330; GFX10W32-NEXT: 
s_waitcnt lgkmcnt(0) 1331; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1332; GFX10W32-NEXT: .LBB7_2: 1333; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1334; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1335; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1336; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 1337; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 1338; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 1339; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1340; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1341; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 1342; GFX10W32-NEXT: s_endpgm 1343entry: 1344 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1345 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) 1346 store i32 %old, i32 addrspace(1)* %out 1347 ret void 1348} 1349; sub_i32_varying_offset: the subtracted value is the constant 1 but the voffset operand is the per-lane workitem id. Every prefix below expects a single buffer_atomic_sub with offen and no mbcnt/readfirstlane sequence, i.e. the atomic is emitted unchanged. 1350define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 1351; GFX6-LABEL: sub_i32_varying_offset: 1352; GFX6: ; %bb.0: ; %entry 1353; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1354; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1355; GFX6-NEXT: v_mov_b32_e32 v1, 1 1356; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1357; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 1358; GFX6-NEXT: s_mov_b32 s3, 0xf000 1359; GFX6-NEXT: s_mov_b32 s2, -1 1360; GFX6-NEXT: s_waitcnt vmcnt(0) 1361; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 1362; GFX6-NEXT: s_endpgm 1363; 1364; GFX8-LABEL: sub_i32_varying_offset: 1365; GFX8: ; %bb.0: ; %entry 1366; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1367; GFX8-NEXT: v_mov_b32_e32 v2, 1 1368; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1369; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1370; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc 1371; GFX8-NEXT: v_mov_b32_e32 v0, s0 1372; GFX8-NEXT: v_mov_b32_e32 v1, s1 1373; GFX8-NEXT: s_waitcnt vmcnt(0) 1374; GFX8-NEXT: flat_store_dword v[0:1], v2 1375; GFX8-NEXT: s_endpgm 1376; 1377; GFX9-LABEL: sub_i32_varying_offset: 1378; GFX9: ; %bb.0: ; %entry 1379; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x34 1380; GFX9-NEXT: v_mov_b32_e32 v1, 1 1381; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1382; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1383; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 1384; GFX9-NEXT: v_mov_b32_e32 v0, 0 1385; GFX9-NEXT: s_waitcnt vmcnt(0) 1386; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1387; GFX9-NEXT: s_endpgm 1388; 1389; GFX10-LABEL: sub_i32_varying_offset: 1390; GFX10: ; %bb.0: ; %entry 1391; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1392; GFX10-NEXT: v_mov_b32_e32 v1, 1 1393; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1394; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1395; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 1396; GFX10-NEXT: v_mov_b32_e32 v0, 0 1397; GFX10-NEXT: s_waitcnt vmcnt(0) 1398; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1399; GFX10-NEXT: s_endpgm 1400entry: 1401 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1402 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) 1403 store i32 %old, i32 addrspace(1)* %out 1404 ret void 1405} 1406