; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s is replaced below with the wave64 run line kept intact.
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s

declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32)
declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32)

; Show what the atomic optimization pass will do for struct buffers.

; Uniform constant vdata: the optimizer has one lane add
; popcount(active lanes) * 5 in a single buffer atomic.
; GCN-LABEL: add_i32_constant:
; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Uniform (but non-constant) vdata: the scalar multiply computes
; %additive * popcount(active lanes) for the single atomic.
; GCN-LABEL: add_i32_uniform:
; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
entry:
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Divergent vdata: gfx8+ reduces across the wave (DPP adds where
; available, then a readlane of the final lane); gfx7 and older leave
; the per-lane atomic untouched.
; GCN-LABEL: add_i32_varying_vdata:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
; DPPCOMB: v_add_u32_dpp
; DPPCOMB: v_add_u32_dpp
; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Divergent vindex: lanes target different buffer elements, so the
; optimization must not fire — no exec-mask counting is emitted.
; GCN-LABEL: add_i32_varying_vindex:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: buffer_atomic_add v{{[0-9]+}}
define amdgpu_kernel void @add_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Divergent voffset: same as divergent vindex — different addresses per
; lane, so the atomic is left as-is.
; GCN-LABEL: add_i32_varying_offset:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: buffer_atomic_add v{{[0-9]+}}
define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Subtraction mirror of add_i32_constant: one lane subtracts
; popcount(active lanes) * 5 in a single buffer atomic.
; GCN-LABEL: sub_i32_constant:
; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Subtraction mirror of add_i32_uniform: scalar multiply of
; %subitive by popcount(active lanes), then one atomic.
; GCN-LABEL: sub_i32_uniform:
; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
entry:
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Divergent vdata for sub: the wave-wide reduction is still an add
; (summing per-lane values) before the single buffer_atomic_sub on
; gfx8+; gfx7 and older are unchanged.
; GCN-LABEL: sub_i32_varying_vdata:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
; DPPCOMB: v_add_u32_dpp
; DPPCOMB: v_add_u32_dpp
; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Divergent vindex: the optimization must not fire — no exec-mask
; counting is emitted.
; GCN-LABEL: sub_i32_varying_vindex:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: buffer_atomic_sub v{{[0-9]+}}
define amdgpu_kernel void @sub_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Divergent voffset: the optimization must not fire — the atomic is
; left as-is.
; GCN-LABEL: sub_i32_varying_offset:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: buffer_atomic_sub v{{[0-9]+}}
define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
