; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s

declare i32 @llvm.amdgcn.workitem.id.x()

; Show what the atomic optimization pass does for global pointers.
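;
; In outline (a sketch of the transformation, not the exact pass output): the
; pass counts the active lanes from the saved exec mask (s_bcnt1_i32_b64),
; lets the first active lane (mbcnt == 0) issue a single combined atomic
; (e.g. popcount * 5 in the constant case), and derives each lane's old value
; from the returned value plus its prefix contribution, as the CHECK lines
; below verify.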

; GCN-LABEL: add_i32_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i32_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) {
entry:
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

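; For a divergent operand the combined value cannot be formed with a simple
; multiply. GFX7 and earlier lack DPP, so the pass leaves the atomic alone
; there (hence the -NOT lines). On GFX8 and later the operand is scanned
; across the wavefront, the total is read back from the last lane (lane 63),
; and a single atomic is issued with it.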
; GCN-LABEL: add_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {
entry:
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

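; Divergent 64-bit operands are not optimized on any target: the GCN-NOT
; lines below check that no lane-counting code is emitted and that the plain
; 64-bit atomic is used instead. @sub_i64_varying further down is checked the
; same way.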
; GCN-LABEL: add_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) {
entry:
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_uniform:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}