; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s

declare i32 @llvm.amdgcn.workitem.id.x()

; Show what the atomic optimization pass will do for global pointers.
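; In outline, the checks below look for the expected wave-level reduction:
; the exec mask is copied into an SGPR pair, v_mbcnt_lo/v_mbcnt_hi give each
; lane its rank among the active lanes, s_bcnt1_i32_b64 counts the active
; lanes, that count is scaled by the (uniform) operand, and only the first
; active lane (rank 0) performs the memory atomic. As checked here, divergent
; 32-bit operands are only combined on GFX8 and newer (the scan result is
; read back from lane 63), and divergent 64-bit operands are not combined.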

; GCN-LABEL: add_i32_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i32_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) {
entry:
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {
entry:
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) {
entry:
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}