; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s

declare i32 @llvm.amdgcn.workitem.id.x()

; Show what the atomic optimization pass will do for global pointers.
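;
; The pass rewrites a wavefront-wide atomic so that only the first active lane
; issues the memory operation, and every lane then reconstructs its own
; result. A rough sketch of the rewritten form for "atomicrmw add %ptr, 5"
; (the names here are illustrative, not the pass's actual value names):
;
;   %active = ballot(true)                 ; mask of the active lanes
;   %rank = mbcnt(%active)                 ; this lane's index among them
;   if (%rank == 0)                        ; first active lane only...
;     %old = atomicrmw add %ptr, 5 * popcount(%active)
;   %result = readfirstlane(%old) + 5 * %rank
;
; The CHECK lines below match the corresponding machine code: the always-true
; v_cmp_ne_u32_e64 materializes the ballot, v_mbcnt_lo/hi compute the rank,
; s_bcnt1_i32_b64 the popcount, and v_mul_u32_u24 forms popcount * 5.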

; GCN-LABEL: add_i32_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i32_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) {
entry:
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
; GFX8MORE: v_add_u32_dpp
; GFX8MORE: v_add_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {
entry:
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}
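
; The sub_* tests below mirror the add_* tests: the combined operand is formed
; the same way (popcount * 5 for a constant, s_mul_i32 for a uniform value, a
; DPP scan for divergent values on GFX8+), and only the atomic opcode changes
; to a sub variant.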

; GCN-LABEL: sub_i32_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) {
entry:
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}
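
; Note that in add_i64_varying and sub_i64_varying the GCN-NOT lines verify
; that the pass leaves atomics with divergent 64-bit operands alone: no
; lane-counting sequence is emitted and the plain _x2 atomic remains.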