1; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s 2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s 3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s 4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s 5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s 6 7declare i32 @llvm.amdgcn.workitem.id.x() 8 9; Show what the atomic optimization pass will do for global pointers. 10 11; GCN-LABEL: add_i32_constant: 12; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo 13; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec 14; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 15; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] 16; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] 17; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc 18; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] 19; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} 20; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 21; GCN: {{flat|buffer|global}}_atomic_add v[[value]] 22define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 23entry: 24 %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel 25 store i32 %old, i32 addrspace(1)* %out 26 ret void 27} 28 29; GCN-LABEL: add_i32_uniform: 30; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo 31; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec 32; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 33; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] 34; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] 35; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc 36; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] 37; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} 38; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] 39; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 40; GCN: {{flat|buffer|global}}_atomic_add v[[value]] 41define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) { 42entry: 43 %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel 44 store i32 %old, i32 addrspace(1)* %out 45 ret void 46} 47 48; GCN-LABEL: add_i32_varying: 49; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 50; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 51; GFX7LESS-NOT: s_bcnt1_i32_b64 52; GFX7LESS: buffer_atomic_add v{{[0-9]+}} 53; DPPCOMB: v_add_u32_dpp 54; DPPCOMB: v_add_u32_dpp 55; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 56; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 57; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 58; GFX8MORE: buffer_atomic_add v[[value]] 59define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 60entry: 61 %lane = call i32 @llvm.amdgcn.workitem.id.x() 62 %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel 63 store i32 %old, i32 addrspace(1)* %out 64 ret void 65} 66 67; GCN-LABEL: add_i64_constant: 68; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo 69; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec 70; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 71; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] 72; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] 73; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc 74; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] 75; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} 76; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5 77; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5 78; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} 79define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 80entry: 81 %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel 82 store i64 %old, i64 addrspace(1)* %out 83 ret void 84} 85 86; GCN-LABEL: add_i64_uniform: 87; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo 88; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec 89; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 90; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] 91; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] 92; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc 93; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] 94; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} 95; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} 96define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) { 97entry: 98 %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel 99 store i64 %old, i64 addrspace(1)* %out 100 ret void 101} 102 103; GCN-LABEL: add_i64_varying: 104; GCN-NOT: v_mbcnt_lo_u32_b32 105; GCN-NOT: v_mbcnt_hi_u32_b32 106; GCN-NOT: s_bcnt1_i32_b64 107; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} 108define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 109entry: 110 %lane = call i32 @llvm.amdgcn.workitem.id.x() 111 %zext = zext i32 %lane to i64 112 %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel 113 store i64 %old, i64 addrspace(1)* %out 114 ret void 115} 116 117; GCN-LABEL: sub_i32_constant: 118; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo 119; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec 120; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 121; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] 122; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] 123; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc 124; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] 125; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} 126; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 127; GCN: {{flat|buffer|global}}_atomic_sub v[[value]] 128define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 129entry: 130 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel 131 store i32 %old, i32 addrspace(1)* %out 132 ret void 133} 134 135; GCN-LABEL: sub_i32_uniform: 136; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo 137; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec 138; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 139; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] 140; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] 141; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc 142; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] 143; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} 144; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] 145; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 146; GCN: {{flat|buffer|global}}_atomic_sub v[[value]] 147define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) { 148entry: 149 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel 150 store i32 %old, i32 addrspace(1)* %out 151 ret void 152} 153 154; GCN-LABEL: sub_i32_varying: 155; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 156; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 157; GFX7LESS-NOT: s_bcnt1_i32_b64 158; GFX7LESS: buffer_atomic_sub v{{[0-9]+}} 159; DPPCOMB: v_add_u32_dpp 160; DPPCOMB: v_add_u32_dpp 161; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 162; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 163; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 164; GFX8MORE: buffer_atomic_sub v[[value]] 165define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 166entry: 167 %lane = call i32 @llvm.amdgcn.workitem.id.x() 168 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel 169 store i32 %old, i32 addrspace(1)* %out 170 ret void 171} 172 173; GCN-LABEL: sub_i64_constant: 174; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo 175; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec 176; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 177; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] 178; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] 179; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc 180; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] 181; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} 182; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5 183; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5 184; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} 185define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 186entry: 187 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel 188 store i64 %old, i64 addrspace(1)* %out 189 ret void 190} 191 192; GCN-LABEL: sub_i64_uniform: 193; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo 194; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec 195; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 196; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] 197; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] 198; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc 199; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] 200; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} 201; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} 202define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) { 203entry: 204 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel 205 store i64 %old, i64 addrspace(1)* %out 206 ret void 207} 208 209; GCN-LABEL: sub_i64_varying: 210; GCN-NOT: v_mbcnt_lo_u32_b32 211; GCN-NOT: v_mbcnt_hi_u32_b32 212; GCN-NOT: s_bcnt1_i32_b64 213; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} 214define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 215entry: 216 %lane = call i32 @llvm.amdgcn.workitem.id.x() 217 %zext = zext i32 %lane to i64 218 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel 219 store i64 %old, i64 addrspace(1)* %out 220 ret void 221} 222