; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s
6
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32)
declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32)
10
; Show what the atomic optimization pass will do for struct buffers.
12
; A uniform constant operand: the pass scalarizes the atomic to a single lane,
; multiplying the constant by the popcount of the active lanes.
; GCN-LABEL: add_i32_constant:
; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo
; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
30
; A uniform (SGPR) operand: the scalarized add multiplies the uniform value by
; the active-lane popcount with s_mul_i32 before the single buffer atomic.
; GCN-LABEL: add_i32_uniform:
; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo
; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
entry:
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
49
; Divergent vdata: GFX8+ reduces across the wave with DPP adds and a final
; readlane (lane 31 for wave32, 63 for wave64); GFX7 and older leave the
; atomic untouched (no mbcnt/bcnt reduction sequence).
; GCN-LABEL: add_i32_varying_vdata:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
; DPPCOMB: v_add_u32_dpp
; DPPCOMB: v_add_u32_dpp
; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
68
; Divergent vindex (not vdata): lanes target different buffer elements, so the
; optimization must not fire — no mbcnt/bcnt reduction sequence is emitted.
; GCN-LABEL: add_i32_varying_vindex:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: buffer_atomic_add v{{[0-9]+}}
define amdgpu_kernel void @add_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
81
; Divergent voffset: lanes target different addresses, so the optimization
; must not fire — no mbcnt/bcnt reduction sequence is emitted.
; GCN-LABEL: add_i32_varying_offset:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: buffer_atomic_add v{{[0-9]+}}
define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
94
; sub counterpart of add_i32_constant: constant operand scaled by the
; active-lane popcount, single buffer_atomic_sub.
; GCN-LABEL: sub_i32_constant:
; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo
; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
112
; sub counterpart of add_i32_uniform: uniform operand scaled by the
; active-lane popcount with s_mul_i32, single buffer_atomic_sub.
; GCN-LABEL: sub_i32_uniform:
; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo
; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
entry:
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
131
; sub counterpart of add_i32_varying_vdata: divergent vdata is reduced with
; DPP adds (the partial sums are still additions) and a final readlane on
; GFX8+; GFX7 and older leave the atomic untouched.
; GCN-LABEL: sub_i32_varying_vdata:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
; DPPCOMB: v_add_u32_dpp
; DPPCOMB: v_add_u32_dpp
; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
150
; Divergent vindex: the optimization must not fire — no mbcnt/bcnt reduction
; sequence is emitted.
; GCN-LABEL: sub_i32_varying_vindex:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: buffer_atomic_sub v{{[0-9]+}}
define amdgpu_kernel void @sub_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
163
; Divergent voffset: the optimization must not fire — no mbcnt/bcnt reduction
; sequence is emitted.
; GCN-LABEL: sub_i32_varying_offset:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: buffer_atomic_sub v{{[0-9]+}}
define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
176