; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s

declare i32 @llvm.amdgcn.workitem.id.x()

@local_var32 = addrspace(3) global i32 undef, align 4
@local_var64 = addrspace(3) global i64 undef, align 8

; Show what the atomic optimization pass does for local pointers.
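;
; For a constant or uniform operand, the checks below show the pass issuing a
; single DS atomic on behalf of the whole wavefront: v_mbcnt_lo_u32_b32 and
; v_mbcnt_hi_u32_b32 compute each lane's rank among the active lanes,
; v_cmp_eq_u32 elects the rank-0 lane to perform the atomic, and
; s_bcnt1_i32_b64 counts the active lanes so the operand can be scaled by
; that count (e.g. popcount * 5 for the constant-5 tests).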

; GCN-LABEL: add_i32_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
entry:
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i32_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
entry:
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
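
; For a divergent operand, per-lane scaling is not enough: on GFX8 and newer
; the values are reduced across the wavefront and the running total is read
; out of lane 63 with v_readlane_b32, while GFX7 and earlier do not perform
; the rewrite at all (hence the GFX7LESS-NOT lines).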
; GCN-LABEL: add_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
entry:
  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
entry:
  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
entry:
  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
entry:
  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
entry:
  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
entry:
  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}