1;RUN: llc < %s -march=amdgcn -mcpu=verde -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
2;RUN: llc < %s -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
3
; test1 - 32-bit buffer atomic swap, one call per addressing variant.
;
; The swaps are chained through their results (%data -> %o1 -> ... -> %o6), and
; the expectations below require an s_waitcnt vmcnt(0) between consecutive
; atomics. Expected operand forms, in call order:
;   1. index 0, offset 0          -> "off" addressing
;   2. %vindex only               -> idxen
;   3. %voffset only              -> offen
;   4. %vindex and %voffset       -> idxen offen, address in v[1:2]
;   5. %voffset + 42              -> offen with the constant folded to offset:42
;   6. constant offset 8192       -> differs per run: the SICI run expects
;                                    0x2000 moved into a VGPR and offen, the VI
;                                    run expects 0x1ffc in an SGPR plus offset:4
;                                    (0x1ffc + 4 = 0x2000 = 8192)
;   7. result never used          -> the same instruction with no trailing glc
4;CHECK-LABEL: {{^}}test1:
5;CHECK-NOT: s_waitcnt
6;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
7;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
8;CHECK: s_waitcnt vmcnt(0)
9;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc
10;CHECK: s_waitcnt vmcnt(0)
11;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen glc
12;CHECK: s_waitcnt vmcnt(0)
13;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen glc
14;SICI: v_mov_b32_e32 v1, 0x2000
15;CHECK: s_waitcnt vmcnt(0)
16;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc
17;CHECK-DAG: s_waitcnt vmcnt(0)
18;SICI: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc
19;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:4 glc
20;CHECK: s_waitcnt vmcnt(0)
21;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}}
22define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
23main_body:
; Variants 1-4: no index/offset, index only, offset only, index and offset.
24  %o1 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
25  %o2 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
26  %o3 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
27  %o4 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
; Variant 5: register offset plus a small constant, expected as offset:42.
28  %ofs.5 = add i32 %voffset, 42
29  %o5 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
; Variant 6: constant offset 8192, which the two runs are expected to
; materialize differently (see the list at the top of this test).
30  %o6 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
; Variant 7: %unused has no users; the last expectation above anchors the
; instruction at end of line so that a trailing glc would not match.
31  %unused = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
; The function returns the result of the sixth swap, reinterpreted as float.
32  %out = bitcast i32 %o6 to float
33  ret float %out
34}
35
; test11 - 64-bit buffer atomic swap, mirroring the addressing variants of
; test1 with the i64 intrinsic.
;
; %data is sign-extended to i64 before the first swap, and every expectation
; below is for buffer_atomic_swap_x2 with the data in the register pair v[3:4].
; The call sequence is the same as in test1: no index/offset, idxen, offen,
; idxen offen, offen offset:42, constant offset 8192 (per-run expectations),
; and a final call whose result is unused and is expected without glc.
36;CHECK-LABEL: {{^}}test11:
37;CHECK-NOT: s_waitcnt
38;CHECK: buffer_atomic_swap_x2 v[3:4], off, s[0:3], 0 glc
39;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
40;CHECK: s_waitcnt vmcnt(0)
41;CHECK: buffer_atomic_swap_x2 v[3:4], v1, s[0:3], 0 idxen glc
42;CHECK: s_waitcnt vmcnt(0)
43;CHECK: buffer_atomic_swap_x2 v[3:4], v2, s[0:3], 0 offen glc
44;CHECK: s_waitcnt vmcnt(0)
45;CHECK: buffer_atomic_swap_x2 v[3:4], v[1:2], s[0:3], 0 idxen offen glc
46;CHECK: s_waitcnt vmcnt(0)
47;CHECK: buffer_atomic_swap_x2 v[3:4], v2, s[0:3], 0 offen offset:42 glc
48;CHECK-DAG: s_waitcnt vmcnt(0)
49;SICI: buffer_atomic_swap_x2 v[3:4], v0, s[0:3], 0 offen glc
50;VI: buffer_atomic_swap_x2 v[3:4], off, s[0:3], [[SOFS]] offset:4 glc
51;CHECK: s_waitcnt vmcnt(0)
52;CHECK: buffer_atomic_swap_x2 v[3:4], off, s[0:3], 0{{$}}
53define amdgpu_ps float @test11(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
54main_body:
; Widen the 32-bit argument so it can feed the i64 intrinsic.
55  %o0 = sext i32 %data to i64
; Each swap consumes the previous swap's result as its data operand.
56  %o1 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o0, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
57  %o2 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
58  %o3 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
59  %o4 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
60  %ofs.5 = add i32 %voffset, 42
61  %o5 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
; 8192 = 0x2000; the VI run expects it split into an SGPR 0x1ffc plus offset:4.
62  %o6 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
; Result intentionally unused; expected to be emitted without glc.
63  %unused = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
; Return the low 32 bits of the sixth swap's result, reinterpreted as float.
64  %o7 = trunc i64 %o6 to i32
65  %out = bitcast i32 %o7 to float
66  ret float %out
67}
68
; test2 - one call to each 32-bit buffer atomic arithmetic/logic intrinsic:
; add, sub, smin, umin, smax, umax, and, or, xor.
;
; All nine calls pass %vindex with a zero offset and are chained through their
; results, so every expected instruction uses idxen glc with the data in v0 and
; the index in v1, separated from the next one by s_waitcnt vmcnt(0).
69;CHECK-LABEL: {{^}}test2:
70;CHECK-NOT: s_waitcnt
71;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc
72;CHECK: s_waitcnt vmcnt(0)
73;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc
74;CHECK: s_waitcnt vmcnt(0)
75;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 idxen glc
76;CHECK: s_waitcnt vmcnt(0)
77;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 idxen glc
78;CHECK: s_waitcnt vmcnt(0)
79;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 idxen glc
80;CHECK: s_waitcnt vmcnt(0)
81;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 idxen glc
82;CHECK: s_waitcnt vmcnt(0)
83;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 idxen glc
84;CHECK: s_waitcnt vmcnt(0)
85;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc
86;CHECK: s_waitcnt vmcnt(0)
87;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc
88define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
89main_body:
; The call order here must stay in step with the expectation order above.
90  %t1 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
91  %t2 = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
92  %t3 = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
93  %t4 = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
94  %t5 = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
95  %t6 = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
96  %t7 = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
97  %t8 = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
98  %t9 = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
; Returning the last result keeps the whole chain live.
99  %out = bitcast i32 %t9 to float
100  ret float %out
101}
102
; test3 - 64-bit counterpart of test2: one call to each i64 buffer atomic
; intrinsic (add, sub, smin, umin, smax, umax, and, or, xor).
;
; %data is sign-extended to i64 first. Every expected instruction is the _x2
; form with idxen glc, the data in v[0:1] and the index in v2, separated from
; the next one by s_waitcnt vmcnt(0).
103;CHECK-LABEL: {{^}}test3:
104;CHECK-NOT: s_waitcnt
105;CHECK: buffer_atomic_add_x2 v[0:1], v2, s[0:3], 0 idxen glc
106;CHECK: s_waitcnt vmcnt(0)
107;CHECK: buffer_atomic_sub_x2 v[0:1], v2, s[0:3], 0 idxen glc
108;CHECK: s_waitcnt vmcnt(0)
109;CHECK: buffer_atomic_smin_x2 v[0:1], v2, s[0:3], 0 idxen glc
110;CHECK: s_waitcnt vmcnt(0)
111;CHECK: buffer_atomic_umin_x2 v[0:1], v2, s[0:3], 0 idxen glc
112;CHECK: s_waitcnt vmcnt(0)
113;CHECK: buffer_atomic_smax_x2 v[0:1], v2, s[0:3], 0 idxen glc
114;CHECK: s_waitcnt vmcnt(0)
115;CHECK: buffer_atomic_umax_x2 v[0:1], v2, s[0:3], 0 idxen glc
116;CHECK: s_waitcnt vmcnt(0)
117;CHECK: buffer_atomic_and_x2 v[0:1], v2, s[0:3], 0 idxen glc
118;CHECK: s_waitcnt vmcnt(0)
119;CHECK: buffer_atomic_or_x2 v[0:1], v2, s[0:3], 0 idxen glc
120;CHECK: s_waitcnt vmcnt(0)
121;CHECK: buffer_atomic_xor_x2 v[0:1], v2, s[0:3], 0 idxen glc
122define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
123main_body:
; Widen the 32-bit argument so it can feed the i64 intrinsics.
124  %t0 = sext i32 %data to i64
; The call order here must stay in step with the expectation order above.
125  %t1 = call i64 @llvm.amdgcn.buffer.atomic.add.i64(i64 %t0, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
126  %t2 = call i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
127  %t3 = call i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
128  %t4 = call i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
129  %t5 = call i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
130  %t6 = call i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
131  %t7 = call i64 @llvm.amdgcn.buffer.atomic.and.i64(i64 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
132  %t8 = call i64 @llvm.amdgcn.buffer.atomic.or.i64(i64 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
133  %t9 = call i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
; Return the low 32 bits of the last result, reinterpreted as float.
134  %t10 = trunc i64 %t9 to i32
135  %out = bitcast i32 %t10 to float
136  ret float %out
137}
138
; test4 - buffer atomic compare-and-swap across the same addressing variants
; as test1 (no index/offset, idxen, offen, idxen offen, offen plus a constant,
; and constant offset 8192 with per-run expectations).
;
; Differences from test1 that are visible here: the data operand is matched
; with a register-pair regex instead of fixed register numbers, the folded
; constant is 44 rather than 42, and there is no expectation for a no-return
; form because that call is commented out in the body.
;
139; Ideally, we would teach tablegen & friends that cmpswap only modifies the
140; first vgpr. Since we don't do that yet, the register allocator will have to
141; create copies which we don't bother to track here.
142;
143;CHECK-LABEL: {{^}}test4:
144;CHECK-NOT: s_waitcnt
145;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc
146;CHECK: s_waitcnt vmcnt(0)
147;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
148;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
149;CHECK: s_waitcnt vmcnt(0)
150;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc
151;CHECK: s_waitcnt vmcnt(0)
152;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v[2:3], s[0:3], 0 idxen offen glc
153;CHECK: s_waitcnt vmcnt(0)
154;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:44 glc
155;CHECK-DAG: s_waitcnt vmcnt(0)
156;SICI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen glc
157;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:4 glc
158define amdgpu_ps float @test4(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
159main_body:
; Each cmpswap takes the previous result as its first operand; %cmp is passed
; unchanged as the second operand of every call.
160  %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
161  %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
162  %o3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
163  %o4 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
; Register offset plus a small constant, expected as offset:44.
164  %ofs.5 = add i32 %voffset, 44
165  %o5 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
; 8192 = 0x2000; the VI run expects it split into an SGPR 0x1ffc plus offset:4.
166  %o6 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
167
168; Detecting the no-return variant doesn't work right now because of how the
169; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG.
170; Since there probably isn't a reasonable use-case of cmpswap that discards
171; the return value, that seems okay.
172;
173;  %unused = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
; The function returns the result of the sixth cmpswap, reinterpreted as float.
174  %out = bitcast i32 %o6 to float
175  ret float %out
176}
177
; test7 - a single 32-bit buffer atomic add whose operands are all constants:
; data 1, an undef resource descriptor, index 0 and offset 4.
;
; The only expectation is that a buffer_atomic_add with v0 as its data register
; is emitted; the address operands are deliberately left unmatched.
; NOTE(review): presumably a regression test for selecting the intrinsic with
; an undef descriptor - the motivating bug is not recorded here, confirm.
178;CHECK-LABEL: {{^}}test7:
179;CHECK: buffer_atomic_add v0,
180define amdgpu_ps float @test7() {
181main_body:
182  %v = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 1, <4 x i32> undef, i32 0, i32 4, i1 false)
; Return the atomic's result, reinterpreted as float, so the call has a user.
183  %v.float = bitcast i32 %v to float
184  ret float %v.float
185}
186
; Declarations of the buffer atomic intrinsics exercised by the tests above.
; Except for cmpswap, every intrinsic takes (data, <4 x i32> descriptor, i32,
; i32, i1) and returns a value of the data type; the two i32 operands are where
; the tests pass %vindex and %voffset respectively. cmpswap takes one extra
; leading i32 (the tests pass %cmp as the second operand) and is declared for
; i32 only, without a type suffix.

; 32-bit variants.
187declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i1) #0
188declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #0
189declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i1) #0
190declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i1) #0
191declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i1) #0
192declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i1) #0
193declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i1) #0
194declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i1) #0
195declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i1) #0
196declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i1) #0
197declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0
; 64-bit variants (used by test11 and test3).
198declare i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64, <4 x i32>, i32, i32, i1) #0
199declare i64 @llvm.amdgcn.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i1) #0
200declare i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64, <4 x i32>, i32, i32, i1) #0
201declare i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64, <4 x i32>, i32, i32, i1) #0
202declare i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64, <4 x i32>, i32, i32, i1) #0
203declare i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64, <4 x i32>, i32, i32, i1) #0
204declare i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64, <4 x i32>, i32, i32, i1) #0
205declare i64 @llvm.amdgcn.buffer.atomic.and.i64(i64, <4 x i32>, i32, i32, i1) #0
206declare i64 @llvm.amdgcn.buffer.atomic.or.i64(i64, <4 x i32>, i32, i32, i1) #0
207declare i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64, <4 x i32>, i32, i32, i1) #0
208
; Attribute group shared by every declaration above.
209attributes #0 = { nounwind }
210