; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s

declare i32 @llvm.amdgcn.workitem.id.x()

@local_var32 = addrspace(3) global i32 undef, align 4
@local_var64 = addrspace(3) global i64 undef, align 8

; Show what the atomic optimization pass will do for local pointers: for a
; wave-uniform operand it ranks the active lanes with v_mbcnt, counts them with
; s_bcnt1 on the exec mask, and issues a single DS atomic from the first active
; lane on behalf of the whole wavefront.

; GCN-LABEL: add_i32_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
entry:
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i32_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
entry:
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
entry:
  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
entry:
  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
entry:
  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
entry:
  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

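; For a lane-divergent operand the combined value cannot be formed by a simple
; multiply. GFX7 and older have no DPP, so the atomic is left per-lane (note the
; absence of mbcnt/bcnt in the GFX7LESS checks); on GFX8+ the pass instead builds
; a wave-wide scan with DPP (v_mov_b32_dpp wave_shr:1) and broadcasts the wave
; total from lane 63 via v_readlane_b32.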
; GCN-LABEL: sub_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
entry:
  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
entry:
  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}