; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s

; Run matrix: every invocation above enables -amdgpu-atomic-optimizations and
; is checked under its own prefix. The first invocation passes no -mcpu and is
; checked under the GFX6 prefix. The two gfx1010 invocations differ only in
; the wavefront-size attributes (-mattr) and additionally share the common
; GFX10 prefix, which is used where both wave sizes produce the same output.
; The assertions are autogenerated (see the first line of this file);
; regenerate them with the named script instead of editing them by hand.

; Intrinsics called by the kernels in this file. Operand roles, as far as the
; call sites in this file show them:
;   operand 0 - the i32 value to add/subtract (the *_varying_vdata kernel
;               passes the workitem id here),
;   operand 1 - the <4 x i32> kernel argument %inout, passed through unchanged
;               (presumably the buffer resource descriptor - confirm against
;               the intrinsic definition),
;   operand 2 - an i32 offset (the *_varying_offset kernel passes the workitem
;               id here),
;   operands 3 and 4 - always the constant 0 in the calls visible here; their
;               meaning is not shown by this file. TODO confirm against the
;               AMDGPU intrinsic documentation.
; The i32 result is stored to %out by each kernel.
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32)

; Show what the atomic optimization pass will do for raw buffers.
13 14define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { 15; GFX6-LABEL: add_i32_constant: 16; GFX6: ; %bb.0: ; %entry 17; GFX6-NEXT: s_mov_b64 s[2:3], exec 18; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 19; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 20; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 21; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 22; GFX6-NEXT: ; implicit-def: $vgpr1 23; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 24; GFX6-NEXT: s_cbranch_execz .LBB0_2 25; GFX6-NEXT: ; %bb.1: 26; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 27; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 28; GFX6-NEXT: s_mul_i32 s0, s0, 5 29; GFX6-NEXT: v_mov_b32_e32 v1, s0 30; GFX6-NEXT: s_waitcnt lgkmcnt(0) 31; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 32; GFX6-NEXT: .LBB0_2: 33; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 34; GFX6-NEXT: s_waitcnt vmcnt(0) 35; GFX6-NEXT: v_readfirstlane_b32 s0, v1 36; GFX6-NEXT: s_mov_b32 s7, 0xf000 37; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0 38; GFX6-NEXT: s_mov_b32 s6, -1 39; GFX6-NEXT: s_waitcnt lgkmcnt(0) 40; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 41; GFX6-NEXT: s_endpgm 42; 43; GFX8-LABEL: add_i32_constant: 44; GFX8: ; %bb.0: ; %entry 45; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 46; GFX8-NEXT: s_mov_b64 s[6:7], exec 47; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 48; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 49; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 50; GFX8-NEXT: ; implicit-def: $vgpr1 51; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 52; GFX8-NEXT: s_cbranch_execz .LBB0_2 53; GFX8-NEXT: ; %bb.1: 54; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 55; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 56; GFX8-NEXT: s_mul_i32 s0, s0, 5 57; GFX8-NEXT: v_mov_b32_e32 v1, s0 58; GFX8-NEXT: s_waitcnt lgkmcnt(0) 59; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 60; GFX8-NEXT: .LBB0_2: 61; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 62; GFX8-NEXT: s_waitcnt vmcnt(0) 63; GFX8-NEXT: v_readfirstlane_b32 s0, v1 64; 
GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0 65; GFX8-NEXT: s_waitcnt lgkmcnt(0) 66; GFX8-NEXT: v_mov_b32_e32 v0, s2 67; GFX8-NEXT: v_mov_b32_e32 v1, s3 68; GFX8-NEXT: flat_store_dword v[0:1], v2 69; GFX8-NEXT: s_endpgm 70; 71; GFX9-LABEL: add_i32_constant: 72; GFX9: ; %bb.0: ; %entry 73; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 74; GFX9-NEXT: s_mov_b64 s[6:7], exec 75; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 76; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 77; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 78; GFX9-NEXT: ; implicit-def: $vgpr1 79; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 80; GFX9-NEXT: s_cbranch_execz .LBB0_2 81; GFX9-NEXT: ; %bb.1: 82; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 83; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 84; GFX9-NEXT: s_mul_i32 s0, s0, 5 85; GFX9-NEXT: v_mov_b32_e32 v1, s0 86; GFX9-NEXT: s_waitcnt lgkmcnt(0) 87; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 88; GFX9-NEXT: .LBB0_2: 89; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 90; GFX9-NEXT: s_waitcnt vmcnt(0) 91; GFX9-NEXT: v_readfirstlane_b32 s0, v1 92; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 93; GFX9-NEXT: v_mov_b32_e32 v1, 0 94; GFX9-NEXT: s_waitcnt lgkmcnt(0) 95; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 96; GFX9-NEXT: s_endpgm 97; 98; GFX10W64-LABEL: add_i32_constant: 99; GFX10W64: ; %bb.0: ; %entry 100; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 101; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 102; GFX10W64-NEXT: ; implicit-def: $vgpr1 103; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 104; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 105; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 106; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 107; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 108; GFX10W64-NEXT: ; %bb.1: 109; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 110; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 111; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 112; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 113; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 114; GFX10W64-NEXT: buffer_atomic_add v1, off, 
s[8:11], 0 glc 115; GFX10W64-NEXT: .LBB0_2: 116; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 117; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 118; GFX10W64-NEXT: s_waitcnt vmcnt(0) 119; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 120; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 121; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 122; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 123; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 124; GFX10W64-NEXT: s_endpgm 125; 126; GFX10W32-LABEL: add_i32_constant: 127; GFX10W32: ; %bb.0: ; %entry 128; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 129; GFX10W32-NEXT: s_mov_b32 s5, exec_lo 130; GFX10W32-NEXT: ; implicit-def: $vgpr1 131; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 132; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 133; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 134; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 135; GFX10W32-NEXT: ; %bb.1: 136; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 137; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 138; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 139; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 140; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 141; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 142; GFX10W32-NEXT: .LBB0_2: 143; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 144; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 145; GFX10W32-NEXT: s_waitcnt vmcnt(0) 146; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 147; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 148; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 149; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 150; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 151; GFX10W32-NEXT: s_endpgm 152entry: 153 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) 154 store i32 %old, i32 addrspace(1)* %out 155 ret void 156} 157 158define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) { 159; GFX6-LABEL: add_i32_uniform: 160; GFX6: ; %bb.0: ; %entry 161; GFX6-NEXT: s_mov_b64 s[2:3], exec 162; GFX6-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x9 163; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 164; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 165; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 166; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 167; GFX6-NEXT: ; implicit-def: $vgpr1 168; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 169; GFX6-NEXT: s_cbranch_execz .LBB1_2 170; GFX6-NEXT: ; %bb.1: 171; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd 172; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 173; GFX6-NEXT: s_waitcnt lgkmcnt(0) 174; GFX6-NEXT: s_mul_i32 s0, s8, s0 175; GFX6-NEXT: v_mov_b32_e32 v1, s0 176; GFX6-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 177; GFX6-NEXT: .LBB1_2: 178; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 179; GFX6-NEXT: s_waitcnt vmcnt(0) 180; GFX6-NEXT: v_readfirstlane_b32 s0, v1 181; GFX6-NEXT: s_waitcnt lgkmcnt(0) 182; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 183; GFX6-NEXT: s_mov_b32 s7, 0xf000 184; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 185; GFX6-NEXT: s_mov_b32 s6, -1 186; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 187; GFX6-NEXT: s_endpgm 188; 189; GFX8-LABEL: add_i32_uniform: 190; GFX8: ; %bb.0: ; %entry 191; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 192; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 193; GFX8-NEXT: s_mov_b64 s[4:5], exec 194; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 195; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 196; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 197; GFX8-NEXT: ; implicit-def: $vgpr1 198; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 199; GFX8-NEXT: s_cbranch_execz .LBB1_2 200; GFX8-NEXT: ; %bb.1: 201; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 202; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 203; GFX8-NEXT: s_waitcnt lgkmcnt(0) 204; GFX8-NEXT: s_mul_i32 s0, s8, s0 205; GFX8-NEXT: v_mov_b32_e32 v1, s0 206; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 207; GFX8-NEXT: .LBB1_2: 208; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 209; GFX8-NEXT: s_waitcnt lgkmcnt(0) 210; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 211; GFX8-NEXT: s_waitcnt vmcnt(0) 212; GFX8-NEXT: 
v_readfirstlane_b32 s0, v1 213; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 214; GFX8-NEXT: v_mov_b32_e32 v0, s2 215; GFX8-NEXT: v_mov_b32_e32 v1, s3 216; GFX8-NEXT: flat_store_dword v[0:1], v2 217; GFX8-NEXT: s_endpgm 218; 219; GFX9-LABEL: add_i32_uniform: 220; GFX9: ; %bb.0: ; %entry 221; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 222; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 223; GFX9-NEXT: s_mov_b64 s[4:5], exec 224; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 225; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 226; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 227; GFX9-NEXT: ; implicit-def: $vgpr1 228; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 229; GFX9-NEXT: s_cbranch_execz .LBB1_2 230; GFX9-NEXT: ; %bb.1: 231; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 232; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 233; GFX9-NEXT: s_waitcnt lgkmcnt(0) 234; GFX9-NEXT: s_mul_i32 s0, s8, s0 235; GFX9-NEXT: v_mov_b32_e32 v1, s0 236; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 237; GFX9-NEXT: .LBB1_2: 238; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] 239; GFX9-NEXT: s_waitcnt lgkmcnt(0) 240; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 241; GFX9-NEXT: s_waitcnt vmcnt(0) 242; GFX9-NEXT: v_readfirstlane_b32 s0, v1 243; GFX9-NEXT: v_mov_b32_e32 v1, 0 244; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 245; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 246; GFX9-NEXT: s_endpgm 247; 248; GFX10W64-LABEL: add_i32_uniform: 249; GFX10W64: ; %bb.0: ; %entry 250; GFX10W64-NEXT: s_clause 0x1 251; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 252; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 253; GFX10W64-NEXT: s_mov_b64 s[4:5], exec 254; GFX10W64-NEXT: ; implicit-def: $vgpr1 255; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 256; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 257; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 258; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc 259; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 260; GFX10W64-NEXT: ; %bb.1: 261; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 262; GFX10W64-NEXT: 
s_bcnt1_i32_b64 s0, s[4:5] 263; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 264; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 265; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 266; GFX10W64-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 267; GFX10W64-NEXT: .LBB1_2: 268; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 269; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] 270; GFX10W64-NEXT: s_waitcnt vmcnt(0) 271; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 272; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 273; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], null, s8, v0, s[0:1] 274; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 275; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 276; GFX10W64-NEXT: s_endpgm 277; 278; GFX10W32-LABEL: add_i32_uniform: 279; GFX10W32: ; %bb.0: ; %entry 280; GFX10W32-NEXT: s_clause 0x1 281; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 282; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 283; GFX10W32-NEXT: s_mov_b32 s6, exec_lo 284; GFX10W32-NEXT: ; implicit-def: $vgpr1 285; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 286; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 287; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo 288; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 289; GFX10W32-NEXT: ; %bb.1: 290; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 291; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 292; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 293; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 294; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 295; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 296; GFX10W32-NEXT: .LBB1_2: 297; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 298; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 299; GFX10W32-NEXT: s_waitcnt vmcnt(0) 300; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 301; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 302; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], null, s4, v0, s[0:1] 303; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 304; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 305; GFX10W32-NEXT: s_endpgm 306entry: 307 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 
0, i32 0) 308 store i32 %old, i32 addrspace(1)* %out 309 ret void 310} 311 312define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { 313; GFX6-LABEL: add_i32_varying_vdata: 314; GFX6: ; %bb.0: ; %entry 315; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 316; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 317; GFX6-NEXT: s_waitcnt lgkmcnt(0) 318; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 319; GFX6-NEXT: s_mov_b32 s3, 0xf000 320; GFX6-NEXT: s_mov_b32 s2, -1 321; GFX6-NEXT: s_waitcnt vmcnt(0) 322; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 323; GFX6-NEXT: s_endpgm 324; 325; GFX8-LABEL: add_i32_varying_vdata: 326; GFX8: ; %bb.0: ; %entry 327; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 328; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 329; GFX8-NEXT: v_mov_b32_e32 v1, 0 330; GFX8-NEXT: s_mov_b64 exec, s[4:5] 331; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 332; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 333; GFX8-NEXT: v_mov_b32_e32 v2, v0 334; GFX8-NEXT: s_not_b64 exec, exec 335; GFX8-NEXT: v_mov_b32_e32 v2, 0 336; GFX8-NEXT: s_not_b64 exec, exec 337; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 338; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 339; GFX8-NEXT: s_nop 1 340; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 341; GFX8-NEXT: s_nop 1 342; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 343; GFX8-NEXT: s_nop 1 344; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 345; GFX8-NEXT: s_nop 1 346; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 347; GFX8-NEXT: s_nop 1 348; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 349; GFX8-NEXT: v_readlane_b32 s6, v2, 63 350; GFX8-NEXT: s_nop 0 351; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 352; GFX8-NEXT: 
s_mov_b64 exec, s[4:5] 353; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 354; GFX8-NEXT: ; implicit-def: $vgpr0 355; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 356; GFX8-NEXT: s_cbranch_execz .LBB2_2 357; GFX8-NEXT: ; %bb.1: 358; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 359; GFX8-NEXT: v_mov_b32_e32 v0, s6 360; GFX8-NEXT: s_waitcnt lgkmcnt(0) 361; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 362; GFX8-NEXT: .LBB2_2: 363; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 364; GFX8-NEXT: s_waitcnt vmcnt(0) 365; GFX8-NEXT: v_readfirstlane_b32 s0, v0 366; GFX8-NEXT: v_mov_b32_e32 v0, v1 367; GFX8-NEXT: s_waitcnt lgkmcnt(0) 368; GFX8-NEXT: v_mov_b32_e32 v4, s3 369; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 370; GFX8-NEXT: v_mov_b32_e32 v3, s2 371; GFX8-NEXT: flat_store_dword v[3:4], v0 372; GFX8-NEXT: s_endpgm 373; 374; GFX9-LABEL: add_i32_varying_vdata: 375; GFX9: ; %bb.0: ; %entry 376; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 377; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 378; GFX9-NEXT: v_mov_b32_e32 v1, 0 379; GFX9-NEXT: s_mov_b64 exec, s[4:5] 380; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 381; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 382; GFX9-NEXT: v_mov_b32_e32 v2, v0 383; GFX9-NEXT: s_not_b64 exec, exec 384; GFX9-NEXT: v_mov_b32_e32 v2, 0 385; GFX9-NEXT: s_not_b64 exec, exec 386; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 387; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 388; GFX9-NEXT: s_nop 1 389; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 390; GFX9-NEXT: s_nop 1 391; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 392; GFX9-NEXT: s_nop 1 393; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 394; GFX9-NEXT: s_nop 1 395; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 396; GFX9-NEXT: s_nop 1 397; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 
398; GFX9-NEXT: v_readlane_b32 s6, v2, 63 399; GFX9-NEXT: s_nop 0 400; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 401; GFX9-NEXT: s_mov_b64 exec, s[4:5] 402; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 403; GFX9-NEXT: ; implicit-def: $vgpr0 404; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 405; GFX9-NEXT: s_cbranch_execz .LBB2_2 406; GFX9-NEXT: ; %bb.1: 407; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 408; GFX9-NEXT: v_mov_b32_e32 v0, s6 409; GFX9-NEXT: s_waitcnt lgkmcnt(0) 410; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 411; GFX9-NEXT: .LBB2_2: 412; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 413; GFX9-NEXT: s_waitcnt vmcnt(0) 414; GFX9-NEXT: v_readfirstlane_b32 s0, v0 415; GFX9-NEXT: v_mov_b32_e32 v0, v1 416; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 417; GFX9-NEXT: v_mov_b32_e32 v3, 0 418; GFX9-NEXT: s_waitcnt lgkmcnt(0) 419; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 420; GFX9-NEXT: s_endpgm 421; 422; GFX10W64-LABEL: add_i32_varying_vdata: 423; GFX10W64: ; %bb.0: ; %entry 424; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 425; GFX10W64-NEXT: s_not_b64 exec, exec 426; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 427; GFX10W64-NEXT: s_not_b64 exec, exec 428; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 429; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 430; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 431; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 432; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 433; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 434; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 435; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 436; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 437; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 438; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 439; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 440; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 441; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 442; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 443; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 444; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 445; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 446; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 447; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 448; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 449; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 450; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 451; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 452; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 453; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 454; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 455; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 456; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 457; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 458; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 459; GFX10W64-NEXT: ; implicit-def: $vgpr0 460; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 461; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 462; GFX10W64-NEXT: ; %bb.1: 463; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 464; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 465; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 466; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 467; GFX10W64-NEXT: .LBB2_2: 468; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 469; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 470; GFX10W64-NEXT: s_waitcnt vmcnt(0) 471; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 472; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 473; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 474; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 475; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 476; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 477; GFX10W64-NEXT: s_endpgm 478; 479; GFX10W32-LABEL: add_i32_varying_vdata: 480; GFX10W32: ; %bb.0: ; %entry 481; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 482; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 483; GFX10W32-NEXT: 
v_mov_b32_e32 v1, 0 484; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 485; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 486; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 487; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 488; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 489; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 490; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 491; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 492; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 493; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 494; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 495; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 496; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 497; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 498; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 499; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 500; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 501; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 502; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 503; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 504; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 505; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 506; GFX10W32-NEXT: ; implicit-def: $vgpr0 507; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 508; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 509; GFX10W32-NEXT: ; %bb.1: 510; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 511; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 512; GFX10W32-NEXT: s_mov_b32 s5, s6 513; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 514; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 515; GFX10W32-NEXT: .LBB2_2: 516; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 517; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 518; GFX10W32-NEXT: s_waitcnt vmcnt(0) 519; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 520; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 521; 
GFX10W32-NEXT: v_mov_b32_e32 v4, 0 522; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 523; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 524; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 525; GFX10W32-NEXT: s_endpgm 526entry: 527 %lane = call i32 @llvm.amdgcn.workitem.id.x() 528 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) 529 store i32 %old, i32 addrspace(1)* %out 530 ret void 531} 532 533define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 534; GFX6-LABEL: add_i32_varying_offset: 535; GFX6: ; %bb.0: ; %entry 536; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 537; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 538; GFX6-NEXT: v_mov_b32_e32 v1, 1 539; GFX6-NEXT: s_waitcnt lgkmcnt(0) 540; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 541; GFX6-NEXT: s_mov_b32 s3, 0xf000 542; GFX6-NEXT: s_mov_b32 s2, -1 543; GFX6-NEXT: s_waitcnt vmcnt(0) 544; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 545; GFX6-NEXT: s_endpgm 546; 547; GFX8-LABEL: add_i32_varying_offset: 548; GFX8: ; %bb.0: ; %entry 549; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 550; GFX8-NEXT: v_mov_b32_e32 v2, 1 551; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 552; GFX8-NEXT: s_waitcnt lgkmcnt(0) 553; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc 554; GFX8-NEXT: v_mov_b32_e32 v0, s0 555; GFX8-NEXT: v_mov_b32_e32 v1, s1 556; GFX8-NEXT: s_waitcnt vmcnt(0) 557; GFX8-NEXT: flat_store_dword v[0:1], v2 558; GFX8-NEXT: s_endpgm 559; 560; GFX9-LABEL: add_i32_varying_offset: 561; GFX9: ; %bb.0: ; %entry 562; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 563; GFX9-NEXT: v_mov_b32_e32 v1, 1 564; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 565; GFX9-NEXT: s_waitcnt lgkmcnt(0) 566; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 567; GFX9-NEXT: v_mov_b32_e32 v0, 0 568; GFX9-NEXT: s_waitcnt vmcnt(0) 569; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 570; GFX9-NEXT: s_endpgm 571; 572; GFX10-LABEL: 
add_i32_varying_offset: 573; GFX10: ; %bb.0: ; %entry 574; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 575; GFX10-NEXT: v_mov_b32_e32 v1, 1 576; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 577; GFX10-NEXT: s_waitcnt lgkmcnt(0) 578; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 579; GFX10-NEXT: v_mov_b32_e32 v0, 0 580; GFX10-NEXT: s_waitcnt vmcnt(0) 581; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 582; GFX10-NEXT: s_endpgm 583entry: 584 %lane = call i32 @llvm.amdgcn.workitem.id.x() 585 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) 586 store i32 %old, i32 addrspace(1)* %out 587 ret void 588} 589 590define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { 591; GFX6-LABEL: sub_i32_constant: 592; GFX6: ; %bb.0: ; %entry 593; GFX6-NEXT: s_mov_b64 s[2:3], exec 594; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 595; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 596; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 597; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 598; GFX6-NEXT: ; implicit-def: $vgpr1 599; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 600; GFX6-NEXT: s_cbranch_execz .LBB4_2 601; GFX6-NEXT: ; %bb.1: 602; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 603; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 604; GFX6-NEXT: s_mul_i32 s0, s0, 5 605; GFX6-NEXT: v_mov_b32_e32 v1, s0 606; GFX6-NEXT: s_waitcnt lgkmcnt(0) 607; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 608; GFX6-NEXT: .LBB4_2: 609; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 610; GFX6-NEXT: s_waitcnt vmcnt(0) 611; GFX6-NEXT: v_readfirstlane_b32 s0, v1 612; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0 613; GFX6-NEXT: s_mov_b32 s7, 0xf000 614; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 615; GFX6-NEXT: s_mov_b32 s6, -1 616; GFX6-NEXT: s_waitcnt lgkmcnt(0) 617; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 618; GFX6-NEXT: s_endpgm 619; 620; GFX8-LABEL: sub_i32_constant: 621; GFX8: ; %bb.0: ; %entry 622; GFX8-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 623; GFX8-NEXT: s_mov_b64 s[6:7], exec 624; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 625; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 626; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 627; GFX8-NEXT: ; implicit-def: $vgpr1 628; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 629; GFX8-NEXT: s_cbranch_execz .LBB4_2 630; GFX8-NEXT: ; %bb.1: 631; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 632; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 633; GFX8-NEXT: s_mul_i32 s0, s0, 5 634; GFX8-NEXT: v_mov_b32_e32 v1, s0 635; GFX8-NEXT: s_waitcnt lgkmcnt(0) 636; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 637; GFX8-NEXT: .LBB4_2: 638; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 639; GFX8-NEXT: s_waitcnt vmcnt(0) 640; GFX8-NEXT: v_readfirstlane_b32 s0, v1 641; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 642; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 643; GFX8-NEXT: s_waitcnt lgkmcnt(0) 644; GFX8-NEXT: v_mov_b32_e32 v0, s2 645; GFX8-NEXT: v_mov_b32_e32 v1, s3 646; GFX8-NEXT: flat_store_dword v[0:1], v2 647; GFX8-NEXT: s_endpgm 648; 649; GFX9-LABEL: sub_i32_constant: 650; GFX9: ; %bb.0: ; %entry 651; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 652; GFX9-NEXT: s_mov_b64 s[6:7], exec 653; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 654; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 655; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 656; GFX9-NEXT: ; implicit-def: $vgpr1 657; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 658; GFX9-NEXT: s_cbranch_execz .LBB4_2 659; GFX9-NEXT: ; %bb.1: 660; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 661; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 662; GFX9-NEXT: s_mul_i32 s0, s0, 5 663; GFX9-NEXT: v_mov_b32_e32 v1, s0 664; GFX9-NEXT: s_waitcnt lgkmcnt(0) 665; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 666; GFX9-NEXT: .LBB4_2: 667; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 668; GFX9-NEXT: s_waitcnt vmcnt(0) 669; GFX9-NEXT: v_readfirstlane_b32 s0, v1 670; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 671; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 672; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 673; GFX9-NEXT: s_waitcnt lgkmcnt(0) 674; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 675; GFX9-NEXT: s_endpgm 676; 677; GFX10W64-LABEL: sub_i32_constant: 678; GFX10W64: ; %bb.0: ; %entry 679; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 680; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 681; GFX10W64-NEXT: ; implicit-def: $vgpr1 682; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 683; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 684; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 685; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 686; GFX10W64-NEXT: s_cbranch_execz .LBB4_2 687; GFX10W64-NEXT: ; %bb.1: 688; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 689; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 690; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 691; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 692; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 693; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 694; GFX10W64-NEXT: .LBB4_2: 695; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 696; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 697; GFX10W64-NEXT: s_waitcnt vmcnt(0) 698; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 699; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 700; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 701; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 702; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 703; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 704; GFX10W64-NEXT: s_endpgm 705; 706; GFX10W32-LABEL: sub_i32_constant: 707; GFX10W32: ; %bb.0: ; %entry 708; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 709; GFX10W32-NEXT: s_mov_b32 s5, exec_lo 710; GFX10W32-NEXT: ; implicit-def: $vgpr1 711; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 712; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 713; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 714; GFX10W32-NEXT: s_cbranch_execz .LBB4_2 715; GFX10W32-NEXT: ; %bb.1: 716; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 717; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 718; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 719; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX10W32-NEXT:  .LBB4_2:
; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT:    s_endpgm
entry:
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Raw-buffer atomic sub whose data operand is the kernel argument %subitive.
; On every checked target the expected code issues one buffer_atomic_sub from
; the lane whose v_mbcnt result is 0, with the operand multiplied by the
; s_bcnt1 population count of the saved exec mask. After the branch rejoins,
; each lane computes readfirstlane(returned value) - %subitive * mbcnt and
; stores that to %out.
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
; GFX6-LABEL: sub_i32_uniform:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b64 s[2:3], exec
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT:    ; implicit-def: $vgpr1
; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX6-NEXT:    s_cbranch_execz .LBB5_2
; GFX6-NEXT:  ; %bb.1:
; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mul_i32 s0, s8, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
; GFX6-NEXT:  .LBB5_2:
; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i32_uniform:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
; GFX8-NEXT:    s_mov_b64 s[4:5], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB5_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mul_i32 s0, s8, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s0
; GFX8-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
; GFX8-NEXT:  .LBB5_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i32_uniform:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
; GFX9-NEXT:    s_mov_b64 s[4:5], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB5_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s0, s8, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
; GFX9-NEXT:  .LBB5_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
; GFX10W64:       ; %bb.0: ; %entry
; GFX10W64-NEXT:    s_clause 0x1
; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
; GFX10W64-NEXT:    s_mov_b64 s[4:5], exec
; GFX10W64-NEXT:    ; implicit-def: $vgpr1
; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
; GFX10W64-NEXT:  ; %bb.1:
; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
; GFX10W64-NEXT:  .LBB5_2:
; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT:    s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
; GFX10W32:       ; %bb.0: ; %entry
; GFX10W32-NEXT:    s_clause 0x1
; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
; GFX10W32-NEXT:    ; implicit-def: $vgpr1
; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
; GFX10W32-NEXT:  ; %bb.1:
; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX10W32-NEXT:  .LBB5_2:
; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT:    s_endpgm
entry:
  ; The value being subtracted comes straight from the kernel argument.
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Raw-buffer atomic sub whose data operand is the workitem id, so it differs
; from lane to lane. The GFX6 checks expect the atomic to be emitted as-is
; (buffer_atomic_sub directly on v0). The GFX8, GFX9 and both GFX10 checks
; expect the inactive lanes to be zeroed, a chain of v_add*_dpp row-shift adds
; executed with exec forced to -1, a v_readlane of the accumulated register
; into the SGPR that feeds a single buffer_atomic_sub from the lane whose
; v_mbcnt result is 0, and finally a per-lane subtract from the
; readfirstlane'd return value.
define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: sub_i32_varying_vdata:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i32_varying_vdata:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB6_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX8-NEXT:  .LBB6_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v4, s3
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v3, s2
; GFX8-NEXT:    flat_store_dword v[3:4], v0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i32_varying_vdata:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB6_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX9-NEXT:  .LBB6_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10W64-LABEL: sub_i32_varying_vdata:
; GFX10W64:       ; %bb.0: ; %entry
; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
; GFX10W64-NEXT:    s_not_b64 exec, exec
; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W64-NEXT:    s_not_b64 exec, exec
; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT:    ; implicit-def: $vgpr0
; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX10W64-NEXT:    s_cbranch_execz .LBB6_2
; GFX10W64-NEXT:  ; %bb.1:
; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX10W64-NEXT:  .LBB6_2:
; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
; GFX10W64-NEXT:    s_endpgm
;
; GFX10W32-LABEL: sub_i32_varying_vdata:
; GFX10W32:       ; %bb.0: ; %entry
; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT:    ; implicit-def: $vgpr0
; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX10W32-NEXT:    s_cbranch_execz .LBB6_2
; GFX10W32-NEXT:  ; %bb.1:
; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
; GFX10W32-NEXT:    s_mov_b32 s5, s6
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX10W32-NEXT:  .LBB6_2:
; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
; GFX10W32-NEXT:    s_endpgm
entry:
  ; The workitem id is passed as the first (data) operand of the atomic.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Raw-buffer atomic sub with a constant data operand (1) and the workitem id
; as the third intrinsic operand. On every checked target the expected code is
; a plain "buffer_atomic_sub ... offen glc" that uses v0 as the offset
; register; none of the mbcnt / saveexec / readfirstlane sequence seen in the
; other sub tests appears here.
define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: sub_i32_varying_offset:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    v_mov_b32_e32 v1, 1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i32_varying_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX8-NEXT:    v_mov_b32_e32 v2, 1
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_atomic_sub v2, v0, s[4:7], 0 offen glc
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i32_varying_offset:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: sub_i32_varying_offset:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
entry:
  ; The workitem id is passed as the third operand; the data operand is 1.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}