; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s

declare i32 @llvm.amdgcn.workitem.id.x()

; Show what the atomic optimization pass will do for global pointers.
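;
; For a uniform operand the pass is expected to replace the per-lane atomics
; with a single atomic issued by the first active lane: it counts the active
; lanes (s_bcnt1 of the exec mask), scales the operand by that count, and
; rebuilds each lane's old value from the returned result and the lane's rank
; among the active lanes (v_mbcnt). A rough pseudo-IR sketch of the rewrite
; (names are illustrative, not what the pass actually emits):
;
;   %rank  = mbcnt(exec)                ; this lane's index among active lanes
;   %first = icmp eq i32 %rank, 0
;   ; only the lane with %first set performs the atomic:
;   %sum   = mul i32 %value, popcount(exec)
;   %old   = atomicrmw add i32 addrspace(1)* %ptr, i32 %sum acq_rel
;   ; every lane then recovers the value it would have seen:
;   %mine  = add i32 readfirstlane(%old), mul(i32 %value, %rank)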

; GCN-LABEL: add_i32_constant:
; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i32_uniform:
; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) {
entry:
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
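
; For a divergent operand, GFX7 and below leave the atomic untouched, while
; GFX8+ is expected to reduce the lane values with a DPP-based wavefront scan
; and read the wave total out of the last lane (lane 31 in wave32, lane 63 in
; wave64) before issuing the single atomic.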
; GCN-LABEL: add_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
; DPPCOMB: v_add_u32_dpp
; DPPCOMB: v_add_u32_dpp
; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
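
; The same rewrite applies to 64-bit atomics. For a constant operand the
; 64-bit addend popcount*5 is assembled from a v_mul_u32_u24/v_mul_hi_u32_u24
; pair feeding the _x2 form of the atomic.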
; GCN-LABEL: add_i64_constant:
; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_uniform:
; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN32: s_bcnt1_i32_b32 s{{[0-9]+}}, s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {
entry:
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_constant:
; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_uniform:
; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) {
entry:
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
; DPPCOMB: v_add_u32_dpp
; DPPCOMB: v_add_u32_dpp
; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_constant:
; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_uniform:
; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN32: s_bcnt1_i32_b32 s{{[0-9]+}}, s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}