; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s

declare i32 @llvm.amdgcn.workitem.id.x()

; Show what the atomic optimization pass does for global pointers.

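; The pattern the checks expect for a constant or uniform operand: materialize
; the exec mask into a pair of SGPRs with v_cmp_ne, number the active lanes
; with v_mbcnt_lo/v_mbcnt_hi, restrict the atomic to the first active lane
; (mbcnt == 0), and scale the operand by the active-lane count from
; s_bcnt1_i32_b64 so a single atomic covers the whole wavefront.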
; GCN-LABEL: add_i32_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

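; With a uniform but non-constant operand, the scaling happens on the scalar
; unit: s_mul_i32 multiplies the kernel argument by the popcount before the
; result is moved into a VGPR for the atomic.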
; GCN-LABEL: add_i32_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) {
entry:
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

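; A divergent operand needs a wavefront reduction. On GFX7 and earlier the
; atomic is left alone (no mbcnt/bcnt prologue). On GFX8 and later a DPP-based
; scan (v_add_u32_dpp) combines the per-lane values, and v_readlane_b32 pulls
; the total out of lane 63 to feed a single atomic.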
; GCN-LABEL: add_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
; GFX8MORE: v_add_u32_dpp
; GFX8MORE: v_add_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

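; The i64 constant case builds the 64-bit product of the popcount and the
; constant from a v_mul_u32_u24 / v_mul_hi_u32_u24 pair and issues a single
; 64-bit (_x2) atomic.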
; GCN-LABEL: add_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

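; For a uniform 64-bit operand only the lane-numbering prologue and the single
; 64-bit atomic are pinned down; the exact 64-bit scaling sequence is left
; unchecked.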
; GCN-LABEL: add_i64_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {
entry:
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

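; A divergent 64-bit operand is not optimized: the NOT checks assert that the
; lane-numbering/popcount prologue never appears, and the plain 64-bit atomic
; remains.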
; GCN-LABEL: add_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

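; The sub tests below mirror the add tests above: the same prologue and
; scaling are expected, with the atomic switched to _atomic_sub (and _x2 for
; i64) and the GFX8+ scan using the v_sub/v_subrev DPP variants.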
; GCN-LABEL: sub_i32_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) {
entry:
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}