1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
6; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX90A-NOTTGSPLIT %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX90A-TGSPLIT %s
8; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX940-NOTTGSPLIT %s
9; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX940-TGSPLIT %s
10; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
11; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
12
; Checks code generation for an atomic i32 load from %in with
; syncscope("singlethread") and unordered ordering; the loaded value is then
; written to %out with an ordinary store.
; In every checked configuration the expected code is one scalar load of both
; kernel arguments, a flat load, an s_waitcnt, and a flat store. The two
; tgsplit runs expect "s_waitcnt vmcnt(0)"; all other runs expect
; "s_waitcnt vmcnt(0) lgkmcnt(0)". The gfx1100 runs also expect an
; s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) before s_endpgm.
; NOTE(review): the check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
13define amdgpu_kernel void @flat_singlethread_unordered_load(
14; GFX7-LABEL: flat_singlethread_unordered_load:
15; GFX7:       ; %bb.0: ; %entry
16; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
17; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18; GFX7-NEXT:    v_mov_b32_e32 v0, s0
19; GFX7-NEXT:    v_mov_b32_e32 v1, s1
20; GFX7-NEXT:    flat_load_dword v2, v[0:1]
21; GFX7-NEXT:    v_mov_b32_e32 v0, s2
22; GFX7-NEXT:    v_mov_b32_e32 v1, s3
23; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
24; GFX7-NEXT:    flat_store_dword v[0:1], v2
25; GFX7-NEXT:    s_endpgm
26;
27; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
28; GFX10-WGP:       ; %bb.0: ; %entry
29; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
30; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
31; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
32; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
33; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
34; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
35; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
36; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
37; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
38; GFX10-WGP-NEXT:    s_endpgm
39;
40; GFX10-CU-LABEL: flat_singlethread_unordered_load:
41; GFX10-CU:       ; %bb.0: ; %entry
42; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
43; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
44; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
45; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
46; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
47; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
48; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
49; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
50; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
51; GFX10-CU-NEXT:    s_endpgm
52;
53; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_load:
54; SKIP-CACHE-INV:       ; %bb.0: ; %entry
55; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
56; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
57; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
58; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
59; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
60; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
61; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
62; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
63; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
64; SKIP-CACHE-INV-NEXT:    s_endpgm
65;
66; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
67; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
68; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
69; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
70; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
71; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
72; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
73; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
74; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
75; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
76; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
77; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
78;
79; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
80; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
81; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
82; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
84; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
85; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
86; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
87; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
88; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
89; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
90; GFX90A-TGSPLIT-NEXT:    s_endpgm
91;
92; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
93; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
94; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
95; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
97; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
98; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
99; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
100; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
101; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
102; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
103; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
104;
105; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_load:
106; GFX940-TGSPLIT:       ; %bb.0: ; %entry
107; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
108; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
109; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
110; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
111; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
112; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
113; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
114; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
115; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
116; GFX940-TGSPLIT-NEXT:    s_endpgm
117;
118; GFX11-WGP-LABEL: flat_singlethread_unordered_load:
119; GFX11-WGP:       ; %bb.0: ; %entry
120; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
121; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
122; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
123; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
124; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
125; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
126; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
127; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
128; GFX11-WGP-NEXT:    s_endpgm
129;
130; GFX11-CU-LABEL: flat_singlethread_unordered_load:
131; GFX11-CU:       ; %bb.0: ; %entry
132; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
133; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
134; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
135; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
136; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
137; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
138; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
139; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
140; GFX11-CU-NEXT:    s_endpgm
141    i32* %in, i32* %out) {
142entry:
  ; The operation under test: atomic load, single-thread scope, unordered.
143  %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4
  ; Plain (non-atomic) store that keeps the loaded value live.
144  store i32 %val, i32* %out
145  ret void
146}
147
; Checks code generation for an atomic i32 load from %in with
; syncscope("singlethread") and monotonic ordering; the loaded value is then
; written to %out with an ordinary store.
; In every checked configuration the expected code is one scalar load of both
; kernel arguments, a flat load, an s_waitcnt, and a flat store. The two
; tgsplit runs expect "s_waitcnt vmcnt(0)"; all other runs expect
; "s_waitcnt vmcnt(0) lgkmcnt(0)". The gfx1100 runs also expect an
; s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) before s_endpgm.
; NOTE(review): the check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
148define amdgpu_kernel void @flat_singlethread_monotonic_load(
149; GFX7-LABEL: flat_singlethread_monotonic_load:
150; GFX7:       ; %bb.0: ; %entry
151; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
152; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
153; GFX7-NEXT:    v_mov_b32_e32 v0, s0
154; GFX7-NEXT:    v_mov_b32_e32 v1, s1
155; GFX7-NEXT:    flat_load_dword v2, v[0:1]
156; GFX7-NEXT:    v_mov_b32_e32 v0, s2
157; GFX7-NEXT:    v_mov_b32_e32 v1, s3
158; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
159; GFX7-NEXT:    flat_store_dword v[0:1], v2
160; GFX7-NEXT:    s_endpgm
161;
162; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
163; GFX10-WGP:       ; %bb.0: ; %entry
164; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
165; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
166; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
167; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
168; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
169; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
170; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
171; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
172; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
173; GFX10-WGP-NEXT:    s_endpgm
174;
175; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
176; GFX10-CU:       ; %bb.0: ; %entry
177; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
178; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
179; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
180; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
181; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
182; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
183; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
184; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
185; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
186; GFX10-CU-NEXT:    s_endpgm
187;
188; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_load:
189; SKIP-CACHE-INV:       ; %bb.0: ; %entry
190; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
191; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
192; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
193; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
194; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
195; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
196; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
197; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
198; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
199; SKIP-CACHE-INV-NEXT:    s_endpgm
200;
201; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
202; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
203; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
204; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
206; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
207; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
208; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
209; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
210; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
211; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
212; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
213;
214; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
215; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
216; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
217; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
218; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
219; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
220; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
221; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
222; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
223; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
224; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
225; GFX90A-TGSPLIT-NEXT:    s_endpgm
226;
227; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
228; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
229; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
230; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
231; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
232; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
233; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
234; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
235; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
236; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
237; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
238; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
239;
240; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
241; GFX940-TGSPLIT:       ; %bb.0: ; %entry
242; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
243; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
244; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
245; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
246; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
247; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
248; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
249; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
250; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
251; GFX940-TGSPLIT-NEXT:    s_endpgm
252;
253; GFX11-WGP-LABEL: flat_singlethread_monotonic_load:
254; GFX11-WGP:       ; %bb.0: ; %entry
255; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
256; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
257; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
258; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
259; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
260; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
261; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
262; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
263; GFX11-WGP-NEXT:    s_endpgm
264;
265; GFX11-CU-LABEL: flat_singlethread_monotonic_load:
266; GFX11-CU:       ; %bb.0: ; %entry
267; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
268; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
269; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
270; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
271; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
272; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
273; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
274; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
275; GFX11-CU-NEXT:    s_endpgm
276    i32* %in, i32* %out) {
277entry:
  ; The operation under test: atomic load, single-thread scope, monotonic.
278  %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4
  ; Plain (non-atomic) store that keeps the loaded value live.
279  store i32 %val, i32* %out
280  ret void
281}
282
; Checks code generation for an atomic i32 load from %in with
; syncscope("singlethread") and acquire ordering; the loaded value is then
; written to %out with an ordinary store.
; In every checked configuration the expected code is one scalar load of both
; kernel arguments, a flat load, an s_waitcnt, and a flat store; no other
; instruction is expected between the load and the store besides the address
; moves. The two tgsplit runs expect "s_waitcnt vmcnt(0)"; all other runs
; expect "s_waitcnt vmcnt(0) lgkmcnt(0)". The gfx1100 runs also expect an
; s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) before s_endpgm.
; NOTE(review): presumably acquire needs no extra synchronization code here
; because the scope is a single thread - confirm against the AMDGPU memory
; model documentation.
; NOTE(review): the check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
283define amdgpu_kernel void @flat_singlethread_acquire_load(
284; GFX7-LABEL: flat_singlethread_acquire_load:
285; GFX7:       ; %bb.0: ; %entry
286; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
287; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
288; GFX7-NEXT:    v_mov_b32_e32 v0, s0
289; GFX7-NEXT:    v_mov_b32_e32 v1, s1
290; GFX7-NEXT:    flat_load_dword v2, v[0:1]
291; GFX7-NEXT:    v_mov_b32_e32 v0, s2
292; GFX7-NEXT:    v_mov_b32_e32 v1, s3
293; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
294; GFX7-NEXT:    flat_store_dword v[0:1], v2
295; GFX7-NEXT:    s_endpgm
296;
297; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
298; GFX10-WGP:       ; %bb.0: ; %entry
299; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
300; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
301; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
302; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
303; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
304; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
305; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
306; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
307; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
308; GFX10-WGP-NEXT:    s_endpgm
309;
310; GFX10-CU-LABEL: flat_singlethread_acquire_load:
311; GFX10-CU:       ; %bb.0: ; %entry
312; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
313; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
315; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
316; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
317; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
318; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
319; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
320; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
321; GFX10-CU-NEXT:    s_endpgm
322;
323; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_load:
324; SKIP-CACHE-INV:       ; %bb.0: ; %entry
325; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
326; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
327; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
328; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
329; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
330; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
331; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
332; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
333; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
334; SKIP-CACHE-INV-NEXT:    s_endpgm
335;
336; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
337; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
338; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
339; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
340; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
341; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
342; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
343; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
344; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
345; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
346; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
347; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
348;
349; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
350; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
351; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
352; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
354; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
355; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
356; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
357; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
358; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
359; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
360; GFX90A-TGSPLIT-NEXT:    s_endpgm
361;
362; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
363; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
364; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
365; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
366; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
367; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
368; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
369; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
370; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
371; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
372; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
373; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
374;
375; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_load:
376; GFX940-TGSPLIT:       ; %bb.0: ; %entry
377; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
378; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
379; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
380; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
381; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
382; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
383; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
384; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
385; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
386; GFX940-TGSPLIT-NEXT:    s_endpgm
387;
388; GFX11-WGP-LABEL: flat_singlethread_acquire_load:
389; GFX11-WGP:       ; %bb.0: ; %entry
390; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
391; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
393; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
394; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
395; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
396; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
397; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
398; GFX11-WGP-NEXT:    s_endpgm
399;
400; GFX11-CU-LABEL: flat_singlethread_acquire_load:
401; GFX11-CU:       ; %bb.0: ; %entry
402; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
403; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
404; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
405; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
406; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
407; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
408; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
409; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
410; GFX11-CU-NEXT:    s_endpgm
411    i32* %in, i32* %out) {
412entry:
  ; The operation under test: atomic load, single-thread scope, acquire.
413  %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4
  ; Plain (non-atomic) store that keeps the loaded value live.
414  store i32 %val, i32* %out
415  ret void
416}
417
; Checks code generation for an atomic i32 load from %in with
; syncscope("singlethread") and seq_cst ordering; the loaded value is then
; written to %out with an ordinary store.
; In every checked configuration the expected code is one scalar load of both
; kernel arguments, a flat load, an s_waitcnt, and a flat store; no other
; instruction is expected around the load besides the address moves. The two
; tgsplit runs expect "s_waitcnt vmcnt(0)"; all other runs expect
; "s_waitcnt vmcnt(0) lgkmcnt(0)". The gfx1100 runs also expect an
; s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) before s_endpgm.
; NOTE(review): presumably seq_cst needs no extra synchronization code here
; because the scope is a single thread - confirm against the AMDGPU memory
; model documentation.
; NOTE(review): the check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
418define amdgpu_kernel void @flat_singlethread_seq_cst_load(
419; GFX7-LABEL: flat_singlethread_seq_cst_load:
420; GFX7:       ; %bb.0: ; %entry
421; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
422; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
423; GFX7-NEXT:    v_mov_b32_e32 v0, s0
424; GFX7-NEXT:    v_mov_b32_e32 v1, s1
425; GFX7-NEXT:    flat_load_dword v2, v[0:1]
426; GFX7-NEXT:    v_mov_b32_e32 v0, s2
427; GFX7-NEXT:    v_mov_b32_e32 v1, s3
428; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
429; GFX7-NEXT:    flat_store_dword v[0:1], v2
430; GFX7-NEXT:    s_endpgm
431;
432; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
433; GFX10-WGP:       ; %bb.0: ; %entry
434; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
435; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
436; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
437; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
438; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
439; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
440; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
441; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
442; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
443; GFX10-WGP-NEXT:    s_endpgm
444;
445; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
446; GFX10-CU:       ; %bb.0: ; %entry
447; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
448; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
450; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
451; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
452; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
453; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
454; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
455; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
456; GFX10-CU-NEXT:    s_endpgm
457;
458; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_load:
459; SKIP-CACHE-INV:       ; %bb.0: ; %entry
460; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
461; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
462; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
463; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
464; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
465; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
466; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
467; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
468; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
469; SKIP-CACHE-INV-NEXT:    s_endpgm
470;
471; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
472; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
473; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
474; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
475; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
476; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
477; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
478; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
479; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
480; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
481; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
482; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
483;
484; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
485; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
486; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
487; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
488; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
489; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
490; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
491; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
492; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
493; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
494; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
495; GFX90A-TGSPLIT-NEXT:    s_endpgm
496;
497; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
498; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
499; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
500; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
501; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
502; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
503; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
504; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
505; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
506; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
507; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
508; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
509;
510; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
511; GFX940-TGSPLIT:       ; %bb.0: ; %entry
512; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
513; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
514; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
515; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
516; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
517; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
518; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
519; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
520; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
521; GFX940-TGSPLIT-NEXT:    s_endpgm
522;
523; GFX11-WGP-LABEL: flat_singlethread_seq_cst_load:
524; GFX11-WGP:       ; %bb.0: ; %entry
525; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
526; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
527; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
528; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
529; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
530; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
531; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
532; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
533; GFX11-WGP-NEXT:    s_endpgm
534;
535; GFX11-CU-LABEL: flat_singlethread_seq_cst_load:
536; GFX11-CU:       ; %bb.0: ; %entry
537; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
538; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
539; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
540; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
541; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
542; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
543; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
544; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
545; GFX11-CU-NEXT:    s_endpgm
546    i32* %in, i32* %out) {
547entry:
  ; The operation under test: atomic load, single-thread scope, seq_cst.
548  %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4
  ; Plain (non-atomic) store that keeps the loaded value live.
549  store i32 %val, i32* %out
550  ret void
551}
552
; Checks code generation for an atomic i32 store of the kernel argument %in
; to the pointer %out with syncscope("singlethread") and unordered ordering.
; In every checked configuration the expected code is two scalar loads (the
; pointer and the value), an s_waitcnt lgkmcnt(0), moves into vector
; registers, and a single flat store. The gfx1010 and gfx1100 runs expect the
; two scalar loads to be preceded by "s_clause 0x1". The gfx90a runs expect
; the pointer move as v_pk_mov_b32 and the gfx940 runs as v_mov_b64_e32; the
; remaining runs expect 32-bit moves. The gfx1100 runs also expect an
; s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) before s_endpgm.
; NOTE(review): the gfx700 runs read the pointer at offset 0x2 while the
; others use 0x8; presumably the older encoding counts dwords rather than
; bytes - confirm against the ISA documentation.
; NOTE(review): the check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
553define amdgpu_kernel void @flat_singlethread_unordered_store(
554; GFX7-LABEL: flat_singlethread_unordered_store:
555; GFX7:       ; %bb.0: ; %entry
556; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
557; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
558; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
559; GFX7-NEXT:    v_mov_b32_e32 v0, s0
560; GFX7-NEXT:    v_mov_b32_e32 v1, s1
561; GFX7-NEXT:    v_mov_b32_e32 v2, s2
562; GFX7-NEXT:    flat_store_dword v[0:1], v2
563; GFX7-NEXT:    s_endpgm
564;
565; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
566; GFX10-WGP:       ; %bb.0: ; %entry
567; GFX10-WGP-NEXT:    s_clause 0x1
568; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
569; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
570; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
571; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
572; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
573; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
574; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
575; GFX10-WGP-NEXT:    s_endpgm
576;
577; GFX10-CU-LABEL: flat_singlethread_unordered_store:
578; GFX10-CU:       ; %bb.0: ; %entry
579; GFX10-CU-NEXT:    s_clause 0x1
580; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
581; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
582; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
583; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
584; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
585; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
586; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
587; GFX10-CU-NEXT:    s_endpgm
588;
589; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_store:
590; SKIP-CACHE-INV:       ; %bb.0: ; %entry
591; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
592; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
593; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
594; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
595; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
596; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
597; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
598; SKIP-CACHE-INV-NEXT:    s_endpgm
599;
600; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
601; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
602; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
603; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
604; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
605; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
606; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
607; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
608; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
609;
610; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
611; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
612; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
613; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
614; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
615; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
616; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
617; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
618; GFX90A-TGSPLIT-NEXT:    s_endpgm
619;
620; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
621; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
622; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
623; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
624; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
625; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
626; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
627; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
628; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
629;
630; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_store:
631; GFX940-TGSPLIT:       ; %bb.0: ; %entry
632; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
633; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
634; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
635; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
636; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
637; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
638; GFX940-TGSPLIT-NEXT:    s_endpgm
639;
640; GFX11-WGP-LABEL: flat_singlethread_unordered_store:
641; GFX11-WGP:       ; %bb.0: ; %entry
642; GFX11-WGP-NEXT:    s_clause 0x1
643; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
644; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
645; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
646; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
647; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
648; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
649; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
650; GFX11-WGP-NEXT:    s_endpgm
651;
652; GFX11-CU-LABEL: flat_singlethread_unordered_store:
653; GFX11-CU:       ; %bb.0: ; %entry
654; GFX11-CU-NEXT:    s_clause 0x1
655; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
656; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
657; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
658; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
659; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
660; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
661; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
662; GFX11-CU-NEXT:    s_endpgm
663    i32 %in, i32* %out) {
664entry:
  ; The operation under test: atomic store, single-thread scope, unordered.
665  store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4
666  ret void
667}
668
; Kernel under test: an atomic i32 store of %in to %out (a pointer with no
; address-space qualifier) using syncscope("singlethread") and monotonic
; ordering.
;
; For each llc configuration listed at the top of the file, the assertions
; below expect the kernel arguments to be loaded, a single s_waitcnt for those
; loads, register moves, and then one flat store. No s_waitcnt other than the
; kernel-argument one is expected. The gfx1100 configurations additionally
; expect s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) before s_endpgm.
;
; The assertion lines are autogenerated (see the note on the first line of the
; file); regenerate them with the script instead of editing them by hand.
669define amdgpu_kernel void @flat_singlethread_monotonic_store(
670; GFX7-LABEL: flat_singlethread_monotonic_store:
671; GFX7:       ; %bb.0: ; %entry
672; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
673; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
674; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
675; GFX7-NEXT:    v_mov_b32_e32 v0, s0
676; GFX7-NEXT:    v_mov_b32_e32 v1, s1
677; GFX7-NEXT:    v_mov_b32_e32 v2, s2
678; GFX7-NEXT:    flat_store_dword v[0:1], v2
679; GFX7-NEXT:    s_endpgm
680;
681; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
682; GFX10-WGP:       ; %bb.0: ; %entry
683; GFX10-WGP-NEXT:    s_clause 0x1
684; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
685; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
686; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
687; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
688; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
689; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
690; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
691; GFX10-WGP-NEXT:    s_endpgm
692;
693; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
694; GFX10-CU:       ; %bb.0: ; %entry
695; GFX10-CU-NEXT:    s_clause 0x1
696; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
697; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
698; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
699; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
700; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
701; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
702; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
703; GFX10-CU-NEXT:    s_endpgm
704;
705; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_store:
706; SKIP-CACHE-INV:       ; %bb.0: ; %entry
707; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
708; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
709; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
710; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
711; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
712; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
713; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
714; SKIP-CACHE-INV-NEXT:    s_endpgm
715;
716; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
717; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
718; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
719; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
720; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
721; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
722; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
723; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
724; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
725;
726; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
727; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
728; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
729; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
730; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
731; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
732; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
733; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
734; GFX90A-TGSPLIT-NEXT:    s_endpgm
735;
736; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
737; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
738; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
739; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
740; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
741; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
742; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
743; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
744; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
745;
746; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
747; GFX940-TGSPLIT:       ; %bb.0: ; %entry
748; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
749; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
750; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
751; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
752; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
753; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
754; GFX940-TGSPLIT-NEXT:    s_endpgm
755;
756; GFX11-WGP-LABEL: flat_singlethread_monotonic_store:
757; GFX11-WGP:       ; %bb.0: ; %entry
758; GFX11-WGP-NEXT:    s_clause 0x1
759; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
760; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
761; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
762; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
763; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
764; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
765; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
766; GFX11-WGP-NEXT:    s_endpgm
767;
768; GFX11-CU-LABEL: flat_singlethread_monotonic_store:
769; GFX11-CU:       ; %bb.0: ; %entry
770; GFX11-CU-NEXT:    s_clause 0x1
771; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
772; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
773; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
774; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
775; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
776; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
777; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
778; GFX11-CU-NEXT:    s_endpgm
779    i32 %in, i32* %out) {
780entry:
781  store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4
782  ret void
783}
784
; Kernel under test: an atomic i32 store of %in to %out (a pointer with no
; address-space qualifier) using syncscope("singlethread") and release
; ordering.
;
; For each llc configuration listed at the top of the file, the assertions
; below expect the kernel arguments to be loaded, a single s_waitcnt for those
; loads, register moves, and then one flat store. No s_waitcnt other than the
; kernel-argument one is expected before the store. The gfx1100
; configurations additionally expect s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; before s_endpgm.
;
; The assertion lines are autogenerated (see the note on the first line of the
; file); regenerate them with the script instead of editing them by hand.
785define amdgpu_kernel void @flat_singlethread_release_store(
786; GFX7-LABEL: flat_singlethread_release_store:
787; GFX7:       ; %bb.0: ; %entry
788; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
789; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
790; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
791; GFX7-NEXT:    v_mov_b32_e32 v0, s0
792; GFX7-NEXT:    v_mov_b32_e32 v1, s1
793; GFX7-NEXT:    v_mov_b32_e32 v2, s2
794; GFX7-NEXT:    flat_store_dword v[0:1], v2
795; GFX7-NEXT:    s_endpgm
796;
797; GFX10-WGP-LABEL: flat_singlethread_release_store:
798; GFX10-WGP:       ; %bb.0: ; %entry
799; GFX10-WGP-NEXT:    s_clause 0x1
800; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
801; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
802; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
803; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
804; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
805; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
806; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
807; GFX10-WGP-NEXT:    s_endpgm
808;
809; GFX10-CU-LABEL: flat_singlethread_release_store:
810; GFX10-CU:       ; %bb.0: ; %entry
811; GFX10-CU-NEXT:    s_clause 0x1
812; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
813; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
814; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
815; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
816; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
817; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
818; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
819; GFX10-CU-NEXT:    s_endpgm
820;
821; SKIP-CACHE-INV-LABEL: flat_singlethread_release_store:
822; SKIP-CACHE-INV:       ; %bb.0: ; %entry
823; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
824; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
825; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
826; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
827; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
828; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
829; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
830; SKIP-CACHE-INV-NEXT:    s_endpgm
831;
832; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
833; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
834; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
835; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
836; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
837; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
838; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
839; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
840; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
841;
842; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
843; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
844; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
845; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
846; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
847; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
848; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
849; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
850; GFX90A-TGSPLIT-NEXT:    s_endpgm
851;
852; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
853; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
854; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
855; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
856; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
857; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
858; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
859; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
860; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
861;
862; GFX940-TGSPLIT-LABEL: flat_singlethread_release_store:
863; GFX940-TGSPLIT:       ; %bb.0: ; %entry
864; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
865; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
866; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
867; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
868; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
869; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
870; GFX940-TGSPLIT-NEXT:    s_endpgm
871;
872; GFX11-WGP-LABEL: flat_singlethread_release_store:
873; GFX11-WGP:       ; %bb.0: ; %entry
874; GFX11-WGP-NEXT:    s_clause 0x1
875; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
876; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
877; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
878; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
879; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
880; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
881; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
882; GFX11-WGP-NEXT:    s_endpgm
883;
884; GFX11-CU-LABEL: flat_singlethread_release_store:
885; GFX11-CU:       ; %bb.0: ; %entry
886; GFX11-CU-NEXT:    s_clause 0x1
887; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
888; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
889; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
890; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
891; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
892; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
893; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
894; GFX11-CU-NEXT:    s_endpgm
895    i32 %in, i32* %out) {
896entry:
897  store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4
898  ret void
899}
900
; Kernel under test: an atomic i32 store of %in to %out (a pointer with no
; address-space qualifier) using syncscope("singlethread") and seq_cst
; ordering.
;
; For each llc configuration listed at the top of the file, the assertions
; below expect the kernel arguments to be loaded, a single s_waitcnt for those
; loads, register moves, and then one flat store. No s_waitcnt other than the
; kernel-argument one is expected before the store. The gfx1100
; configurations additionally expect s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; before s_endpgm.
;
; The assertion lines are autogenerated (see the note on the first line of the
; file); regenerate them with the script instead of editing them by hand.
901define amdgpu_kernel void @flat_singlethread_seq_cst_store(
902; GFX7-LABEL: flat_singlethread_seq_cst_store:
903; GFX7:       ; %bb.0: ; %entry
904; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
905; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
906; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
907; GFX7-NEXT:    v_mov_b32_e32 v0, s0
908; GFX7-NEXT:    v_mov_b32_e32 v1, s1
909; GFX7-NEXT:    v_mov_b32_e32 v2, s2
910; GFX7-NEXT:    flat_store_dword v[0:1], v2
911; GFX7-NEXT:    s_endpgm
912;
913; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
914; GFX10-WGP:       ; %bb.0: ; %entry
915; GFX10-WGP-NEXT:    s_clause 0x1
916; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
917; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
918; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
919; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
920; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
921; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
922; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
923; GFX10-WGP-NEXT:    s_endpgm
924;
925; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
926; GFX10-CU:       ; %bb.0: ; %entry
927; GFX10-CU-NEXT:    s_clause 0x1
928; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
929; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
930; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
931; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
932; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
933; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
934; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
935; GFX10-CU-NEXT:    s_endpgm
936;
937; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_store:
938; SKIP-CACHE-INV:       ; %bb.0: ; %entry
939; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
940; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
941; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
942; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
943; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
944; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
945; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
946; SKIP-CACHE-INV-NEXT:    s_endpgm
947;
948; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
949; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
950; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
951; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
952; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
953; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
954; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
955; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
956; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
957;
958; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
959; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
960; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
961; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
962; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
963; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
964; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
965; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
966; GFX90A-TGSPLIT-NEXT:    s_endpgm
967;
968; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
969; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
970; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
971; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
972; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
973; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
974; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
975; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
976; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
977;
978; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
979; GFX940-TGSPLIT:       ; %bb.0: ; %entry
980; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
981; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
982; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
983; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
984; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
985; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
986; GFX940-TGSPLIT-NEXT:    s_endpgm
987;
988; GFX11-WGP-LABEL: flat_singlethread_seq_cst_store:
989; GFX11-WGP:       ; %bb.0: ; %entry
990; GFX11-WGP-NEXT:    s_clause 0x1
991; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
992; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
993; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
994; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
995; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
996; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
997; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
998; GFX11-WGP-NEXT:    s_endpgm
999;
1000; GFX11-CU-LABEL: flat_singlethread_seq_cst_store:
1001; GFX11-CU:       ; %bb.0: ; %entry
1002; GFX11-CU-NEXT:    s_clause 0x1
1003; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
1004; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
1005; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1006; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1007; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1008; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
1009; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1010; GFX11-CU-NEXT:    s_endpgm
1011    i32 %in, i32* %out) {
1012entry:
1013  store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4
1014  ret void
1015}
1016
; Kernel under test: a volatile atomicrmw xchg of %in into %out (a pointer
; with no address-space qualifier) using syncscope("singlethread") and
; monotonic ordering. The result %val is not used by the kernel.
;
; For each llc configuration listed at the top of the file, the assertions
; below expect the kernel arguments to be loaded, a single s_waitcnt for those
; loads, register moves, and then one flat atomic swap written with only an
; address and a data operand. No s_waitcnt other than the kernel-argument one
; is expected. The gfx1100 configurations additionally expect
; s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) before s_endpgm.
;
; The assertion lines are autogenerated (see the note on the first line of the
; file); regenerate them with the script instead of editing them by hand.
1017define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
1018; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
1019; GFX7:       ; %bb.0: ; %entry
1020; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1021; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1022; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1023; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1024; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1025; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1026; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1027; GFX7-NEXT:    s_endpgm
1028;
1029; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
1030; GFX10-WGP:       ; %bb.0: ; %entry
1031; GFX10-WGP-NEXT:    s_clause 0x1
1032; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1033; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1034; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1035; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1036; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1037; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1038; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1039; GFX10-WGP-NEXT:    s_endpgm
1040;
1041; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
1042; GFX10-CU:       ; %bb.0: ; %entry
1043; GFX10-CU-NEXT:    s_clause 0x1
1044; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1045; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1046; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1047; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1048; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1049; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1050; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1051; GFX10-CU-NEXT:    s_endpgm
1052;
1053; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_atomicrmw:
1054; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1055; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1056; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
1057; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1058; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1059; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1060; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1061; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1062; SKIP-CACHE-INV-NEXT:    s_endpgm
1063;
1064; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
1065; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1066; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1067; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1068; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1069; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1070; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1071; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1072; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1073;
1074; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
1075; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1076; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1077; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1078; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1079; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1080; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1081; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1082; GFX90A-TGSPLIT-NEXT:    s_endpgm
1083;
1084; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
1085; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1086; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1087; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1088; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1089; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1090; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1091; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1092; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1093;
1094; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
1095; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1096; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1097; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1098; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1099; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1100; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1101; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1102; GFX940-TGSPLIT-NEXT:    s_endpgm
1103;
1104; GFX11-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
1105; GFX11-WGP:       ; %bb.0: ; %entry
1106; GFX11-WGP-NEXT:    s_clause 0x1
1107; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1108; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
1109; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1110; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1111; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1112; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1113; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1114; GFX11-WGP-NEXT:    s_endpgm
1115;
1116; GFX11-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
1117; GFX11-CU:       ; %bb.0: ; %entry
1118; GFX11-CU-NEXT:    s_clause 0x1
1119; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1120; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
1121; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1122; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1123; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1124; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1125; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1126; GFX11-CU-NEXT:    s_endpgm
1127    i32* %out, i32 %in) {
1128entry:
1129  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") monotonic
1130  ret void
1131}
1132
; Kernel under test: a volatile atomicrmw xchg of %in into %out (a pointer
; with no address-space qualifier) using syncscope("singlethread") and
; acquire ordering. The result %val is not used by the kernel.
;
; For each llc configuration listed at the top of the file, the assertions
; below expect the kernel arguments to be loaded, a single s_waitcnt for those
; loads, register moves, and then one flat atomic swap written with only an
; address and a data operand. No s_waitcnt other than the kernel-argument one
; is expected after the swap. The gfx1100 configurations additionally expect
; s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) before s_endpgm.
;
; The assertion lines are autogenerated (see the note on the first line of the
; file); regenerate them with the script instead of editing them by hand.
1133define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
1134; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
1135; GFX7:       ; %bb.0: ; %entry
1136; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1137; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1138; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1139; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1140; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1141; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1142; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1143; GFX7-NEXT:    s_endpgm
1144;
1145; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
1146; GFX10-WGP:       ; %bb.0: ; %entry
1147; GFX10-WGP-NEXT:    s_clause 0x1
1148; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1149; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1150; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1151; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1152; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1153; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1154; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1155; GFX10-WGP-NEXT:    s_endpgm
1156;
1157; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
1158; GFX10-CU:       ; %bb.0: ; %entry
1159; GFX10-CU-NEXT:    s_clause 0x1
1160; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1161; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1162; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1163; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1164; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1165; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1166; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1167; GFX10-CU-NEXT:    s_endpgm
1168;
1169; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_atomicrmw:
1170; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1171; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1172; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
1173; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1174; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1175; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1176; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1177; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1178; SKIP-CACHE-INV-NEXT:    s_endpgm
1179;
1180; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
1181; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1182; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1183; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1184; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1185; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1186; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1187; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1188; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1189;
1190; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
1191; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1192; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1193; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1194; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1195; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1196; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1197; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1198; GFX90A-TGSPLIT-NEXT:    s_endpgm
1199;
1200; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
1201; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1202; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1203; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1204; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1205; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1206; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1207; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1208; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1209;
1210; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
1211; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1212; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1213; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1214; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1215; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1216; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1217; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1218; GFX940-TGSPLIT-NEXT:    s_endpgm
1219;
1220; GFX11-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
1221; GFX11-WGP:       ; %bb.0: ; %entry
1222; GFX11-WGP-NEXT:    s_clause 0x1
1223; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1224; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
1225; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1226; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1227; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1228; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1229; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1230; GFX11-WGP-NEXT:    s_endpgm
1231;
1232; GFX11-CU-LABEL: flat_singlethread_acquire_atomicrmw:
1233; GFX11-CU:       ; %bb.0: ; %entry
1234; GFX11-CU-NEXT:    s_clause 0x1
1235; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1236; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
1237; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1238; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1239; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1240; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1241; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1242; GFX11-CU-NEXT:    s_endpgm
1243    i32* %out, i32 %in) {
1244entry:
1245  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire
1246  ret void
1247}
1248
; Kernel under test: a volatile atomicrmw xchg of %in into %out (a pointer
; with no address-space qualifier) using syncscope("singlethread") and
; release ordering. The result %val is not used by the kernel.
;
; For each llc configuration listed at the top of the file, the assertions
; below expect the kernel arguments to be loaded, a single s_waitcnt for those
; loads, register moves, and then one flat atomic swap written with only an
; address and a data operand. No s_waitcnt other than the kernel-argument one
; is expected before the swap. The gfx1100 configurations additionally expect
; s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) before s_endpgm.
;
; The assertion lines are autogenerated (see the note on the first line of the
; file); regenerate them with the script instead of editing them by hand.
1249define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
1250; GFX7-LABEL: flat_singlethread_release_atomicrmw:
1251; GFX7:       ; %bb.0: ; %entry
1252; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1253; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1254; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1255; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1256; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1257; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1258; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1259; GFX7-NEXT:    s_endpgm
1260;
1261; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
1262; GFX10-WGP:       ; %bb.0: ; %entry
1263; GFX10-WGP-NEXT:    s_clause 0x1
1264; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1265; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1266; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1267; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1268; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1269; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1270; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1271; GFX10-WGP-NEXT:    s_endpgm
1272;
1273; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
1274; GFX10-CU:       ; %bb.0: ; %entry
1275; GFX10-CU-NEXT:    s_clause 0x1
1276; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1277; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1278; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1279; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1280; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1281; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1282; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1283; GFX10-CU-NEXT:    s_endpgm
1284;
1285; SKIP-CACHE-INV-LABEL: flat_singlethread_release_atomicrmw:
1286; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1287; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1288; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
1289; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1290; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1291; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1292; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1293; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1294; SKIP-CACHE-INV-NEXT:    s_endpgm
1295;
1296; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
1297; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1298; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1299; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1300; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1301; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1302; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1303; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1304; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1305;
1306; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
1307; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1308; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1309; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1310; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1311; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1312; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1313; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1314; GFX90A-TGSPLIT-NEXT:    s_endpgm
1315;
1316; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
1317; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1318; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1319; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1320; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1321; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1322; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1323; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1324; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1325;
1326; GFX940-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
1327; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1328; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1329; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1330; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1331; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1332; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1333; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1334; GFX940-TGSPLIT-NEXT:    s_endpgm
1335;
1336; GFX11-WGP-LABEL: flat_singlethread_release_atomicrmw:
1337; GFX11-WGP:       ; %bb.0: ; %entry
1338; GFX11-WGP-NEXT:    s_clause 0x1
1339; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1340; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
1341; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1342; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1343; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1344; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1345; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1346; GFX11-WGP-NEXT:    s_endpgm
1347;
1348; GFX11-CU-LABEL: flat_singlethread_release_atomicrmw:
1349; GFX11-CU:       ; %bb.0: ; %entry
1350; GFX11-CU-NEXT:    s_clause 0x1
1351; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1352; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
1353; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1354; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1355; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1356; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1357; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1358; GFX11-CU-NEXT:    s_endpgm
1359    i32* %out, i32 %in) {
1360entry:
1361  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") release
1362  ret void
1363}
1364
; Non-returning atomic exchange on %out with syncscope("singlethread") and
; acq_rel ordering. Every run configuration checked below expects a bare
; flat_atomic_swap (flat_atomic_swap_b32 on the gfx1100 runs) with no s_waitcnt
; or cache-maintenance instruction after the swap.
1365define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
1366; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
1367; GFX7:       ; %bb.0: ; %entry
1368; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1369; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1370; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1371; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1372; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1373; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1374; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1375; GFX7-NEXT:    s_endpgm
1376;
1377; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
1378; GFX10-WGP:       ; %bb.0: ; %entry
1379; GFX10-WGP-NEXT:    s_clause 0x1
1380; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1381; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1382; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1383; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1384; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1385; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1386; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1387; GFX10-WGP-NEXT:    s_endpgm
1388;
1389; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
1390; GFX10-CU:       ; %bb.0: ; %entry
1391; GFX10-CU-NEXT:    s_clause 0x1
1392; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1393; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1394; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1395; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1396; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1397; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1398; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1399; GFX10-CU-NEXT:    s_endpgm
1400;
1401; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_atomicrmw:
1402; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1403; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1404; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
1405; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1406; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1407; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1408; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1409; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1410; SKIP-CACHE-INV-NEXT:    s_endpgm
1411;
1412; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
1413; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1414; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1415; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1416; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1417; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1418; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1419; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1420; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1421;
1422; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
1423; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1424; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1425; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1426; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1427; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1428; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1429; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1430; GFX90A-TGSPLIT-NEXT:    s_endpgm
1431;
1432; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
1433; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1434; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1435; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1436; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1437; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1438; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1439; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1440; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1441;
1442; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
1443; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1444; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1445; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1446; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1447; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1448; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1449; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1450; GFX940-TGSPLIT-NEXT:    s_endpgm
1451;
1452; GFX11-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
1453; GFX11-WGP:       ; %bb.0: ; %entry
1454; GFX11-WGP-NEXT:    s_clause 0x1
1455; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1456; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
1457; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1458; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1459; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1460; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1461; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1462; GFX11-WGP-NEXT:    s_endpgm
1463;
1464; GFX11-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
1465; GFX11-CU:       ; %bb.0: ; %entry
1466; GFX11-CU-NEXT:    s_clause 0x1
1467; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1468; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
1469; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1470; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1471; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1472; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1473; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1474; GFX11-CU-NEXT:    s_endpgm
1475    i32* %out, i32 %in) {
1476entry:
  ; %val is never used, so the checks expect the swap form with no returned value.
1477  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel
1478  ret void
1479}
1480
; Non-returning atomic exchange on %out with syncscope("singlethread") and
; seq_cst ordering. Every run configuration checked below expects a bare
; flat_atomic_swap (flat_atomic_swap_b32 on the gfx1100 runs) with no s_waitcnt
; or cache-maintenance instruction after the swap.
1481define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
1482; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw:
1483; GFX7:       ; %bb.0: ; %entry
1484; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1485; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1486; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1487; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1488; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1489; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1490; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1491; GFX7-NEXT:    s_endpgm
1492;
1493; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
1494; GFX10-WGP:       ; %bb.0: ; %entry
1495; GFX10-WGP-NEXT:    s_clause 0x1
1496; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1497; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1498; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1499; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1500; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1501; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1502; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1503; GFX10-WGP-NEXT:    s_endpgm
1504;
1505; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
1506; GFX10-CU:       ; %bb.0: ; %entry
1507; GFX10-CU-NEXT:    s_clause 0x1
1508; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1509; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1510; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1511; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1512; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1513; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1514; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1515; GFX10-CU-NEXT:    s_endpgm
1516;
1517; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_atomicrmw:
1518; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1519; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1520; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
1521; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1522; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1523; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1524; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1525; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1526; SKIP-CACHE-INV-NEXT:    s_endpgm
1527;
1528; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
1529; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1530; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1531; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1532; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1533; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1534; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1535; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1536; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1537;
1538; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
1539; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1540; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1541; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1542; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1543; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1544; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1545; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1546; GFX90A-TGSPLIT-NEXT:    s_endpgm
1547;
1548; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
1549; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1550; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1551; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1552; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1553; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1554; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1555; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1556; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1557;
1558; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
1559; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1560; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1561; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1562; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1563; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1564; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1565; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1566; GFX940-TGSPLIT-NEXT:    s_endpgm
1567;
1568; GFX11-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
1569; GFX11-WGP:       ; %bb.0: ; %entry
1570; GFX11-WGP-NEXT:    s_clause 0x1
1571; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1572; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
1573; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1574; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1575; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1576; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1577; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1578; GFX11-WGP-NEXT:    s_endpgm
1579;
1580; GFX11-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
1581; GFX11-CU:       ; %bb.0: ; %entry
1582; GFX11-CU-NEXT:    s_clause 0x1
1583; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1584; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
1585; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1586; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1587; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1588; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1589; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1590; GFX11-CU-NEXT:    s_endpgm
1591    i32* %out, i32 %in) {
1592entry:
  ; %val is never used, so the checks expect the swap form with no returned value.
1593  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst
1594  ret void
1595}
1596
; Returning atomic exchange on %out with syncscope("singlethread") and acquire
; ordering; the previous value is then stored back through %out. The checks
; expect the returning swap form (glc, or sc0 on the gfx940 runs), an s_waitcnt
; between the swap and the dependent flat store, and no cache-maintenance
; instruction. The tgsplit runs wait on vmcnt(0) only; the other runs wait on
; both vmcnt(0) and lgkmcnt(0).
1597define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
1598; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1599; GFX7:       ; %bb.0: ; %entry
1600; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1601; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1602; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1603; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1604; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1605; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1606; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1607; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1608; GFX7-NEXT:    flat_store_dword v[0:1], v2
1609; GFX7-NEXT:    s_endpgm
1610;
1611; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1612; GFX10-WGP:       ; %bb.0: ; %entry
1613; GFX10-WGP-NEXT:    s_clause 0x1
1614; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1615; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1616; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1617; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1618; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1619; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1620; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1621; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1622; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1623; GFX10-WGP-NEXT:    s_endpgm
1624;
1625; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1626; GFX10-CU:       ; %bb.0: ; %entry
1627; GFX10-CU-NEXT:    s_clause 0x1
1628; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1629; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1630; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1631; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1632; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1633; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1634; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1635; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1636; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1637; GFX10-CU-NEXT:    s_endpgm
1638;
1639; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1640; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1641; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1642; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
1643; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1644; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1645; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1646; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1647; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1648; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1649; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1650; SKIP-CACHE-INV-NEXT:    s_endpgm
1651;
1652; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1653; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1654; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1655; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1656; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1657; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1658; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1659; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1660; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1661; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1662; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1663;
1664; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1665; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1666; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1667; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1668; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1669; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1670; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1671; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1672; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1673; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1674; GFX90A-TGSPLIT-NEXT:    s_endpgm
1675;
1676; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1677; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1678; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1679; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1680; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1681; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1682; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1683; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
1684; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1685; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1686; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1687;
1688; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1689; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1690; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1691; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1692; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1693; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1694; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1695; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
1696; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1697; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1698; GFX940-TGSPLIT-NEXT:    s_endpgm
1699;
1700; GFX11-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1701; GFX11-WGP:       ; %bb.0: ; %entry
1702; GFX11-WGP-NEXT:    s_clause 0x1
1703; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1704; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
1705; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1706; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1707; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1708; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
1709; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1710; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
1711; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1712; GFX11-WGP-NEXT:    s_endpgm
1713;
1714; GFX11-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1715; GFX11-CU:       ; %bb.0: ; %entry
1716; GFX11-CU-NEXT:    s_clause 0x1
1717; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1718; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
1719; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1720; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1721; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1722; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
1723; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1724; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
1725; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1726; GFX11-CU-NEXT:    s_endpgm
1727    i32* %out, i32 %in) {
1728entry:
  ; The store below consumes %val, which is why the returning swap form is expected.
1729  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire
1730  store i32 %val, i32* %out, align 4
1731  ret void
1732}
1733
; Returning atomic exchange on %out with syncscope("singlethread") and acq_rel
; ordering; the previous value is then stored back through %out. The checks
; expect the returning swap form (glc, or sc0 on the gfx940 runs), an s_waitcnt
; between the swap and the dependent flat store, and no cache-maintenance
; instruction. The tgsplit runs wait on vmcnt(0) only; the other runs wait on
; both vmcnt(0) and lgkmcnt(0).
1734define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
1735; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1736; GFX7:       ; %bb.0: ; %entry
1737; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1738; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1739; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1740; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1741; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1742; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1743; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1744; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1745; GFX7-NEXT:    flat_store_dword v[0:1], v2
1746; GFX7-NEXT:    s_endpgm
1747;
1748; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1749; GFX10-WGP:       ; %bb.0: ; %entry
1750; GFX10-WGP-NEXT:    s_clause 0x1
1751; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1752; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1753; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1754; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1755; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1756; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1757; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1758; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1759; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1760; GFX10-WGP-NEXT:    s_endpgm
1761;
1762; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1763; GFX10-CU:       ; %bb.0: ; %entry
1764; GFX10-CU-NEXT:    s_clause 0x1
1765; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1766; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1767; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1768; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1769; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1770; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1771; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1772; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1773; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1774; GFX10-CU-NEXT:    s_endpgm
1775;
1776; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1777; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1778; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1779; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
1780; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1781; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1782; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1783; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1784; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1785; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1786; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1787; SKIP-CACHE-INV-NEXT:    s_endpgm
1788;
1789; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1790; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1791; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1792; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1793; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1794; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1795; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1796; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1797; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1798; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1799; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1800;
1801; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1802; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1803; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1804; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1805; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1806; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1807; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1808; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1809; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1810; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1811; GFX90A-TGSPLIT-NEXT:    s_endpgm
1812;
1813; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1814; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1815; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1816; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1817; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1818; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1819; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1820; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
1821; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1822; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1823; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1824;
1825; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1826; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1827; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1828; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1829; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1830; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1831; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1832; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
1833; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1834; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1835; GFX940-TGSPLIT-NEXT:    s_endpgm
1836;
1837; GFX11-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1838; GFX11-WGP:       ; %bb.0: ; %entry
1839; GFX11-WGP-NEXT:    s_clause 0x1
1840; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1841; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
1842; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1843; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1844; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1845; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
1846; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1847; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
1848; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1849; GFX11-WGP-NEXT:    s_endpgm
1850;
1851; GFX11-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1852; GFX11-CU:       ; %bb.0: ; %entry
1853; GFX11-CU-NEXT:    s_clause 0x1
1854; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1855; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
1856; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1857; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1858; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1859; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
1860; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1861; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
1862; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1863; GFX11-CU-NEXT:    s_endpgm
1864    i32* %out, i32 %in) {
1865entry:
  ; The store below consumes %val, which is why the returning swap form is expected.
1866  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel
1867  store i32 %val, i32* %out, align 4
1868  ret void
1869}
1870
; Returning atomic exchange on %out with syncscope("singlethread") and seq_cst
; ordering; the previous value is then stored back through %out. The checks
; expect the returning swap form (glc, or sc0 on the gfx940 runs), an s_waitcnt
; between the swap and the dependent flat store, and no cache-maintenance
; instruction. The tgsplit runs wait on vmcnt(0) only; the other runs wait on
; both vmcnt(0) and lgkmcnt(0).
1871define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
1872; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1873; GFX7:       ; %bb.0: ; %entry
1874; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1875; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1876; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1877; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1878; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1879; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1880; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1881; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1882; GFX7-NEXT:    flat_store_dword v[0:1], v2
1883; GFX7-NEXT:    s_endpgm
1884;
1885; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1886; GFX10-WGP:       ; %bb.0: ; %entry
1887; GFX10-WGP-NEXT:    s_clause 0x1
1888; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1889; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1890; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1891; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1892; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1893; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1894; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1895; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1896; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1897; GFX10-WGP-NEXT:    s_endpgm
1898;
1899; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1900; GFX10-CU:       ; %bb.0: ; %entry
1901; GFX10-CU-NEXT:    s_clause 0x1
1902; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1903; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1904; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1905; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1906; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1907; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1908; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1909; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1910; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1911; GFX10-CU-NEXT:    s_endpgm
1912;
1913; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1914; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1915; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1916; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
1917; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1918; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1919; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1920; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1921; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1922; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1923; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1924; SKIP-CACHE-INV-NEXT:    s_endpgm
1925;
1926; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1927; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1928; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1929; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1930; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1931; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1932; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1933; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1934; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1935; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1936; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1937;
1938; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1939; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1940; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1941; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1942; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1943; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1944; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1945; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1946; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1947; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1948; GFX90A-TGSPLIT-NEXT:    s_endpgm
1949;
1950; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1951; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1952; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1953; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1954; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1955; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1956; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1957; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
1958; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1959; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1960; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1961;
1962; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1963; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1964; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1965; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
1966; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1967; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1968; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1969; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
1970; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1971; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1972; GFX940-TGSPLIT-NEXT:    s_endpgm
1973;
1974; GFX11-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1975; GFX11-WGP:       ; %bb.0: ; %entry
1976; GFX11-WGP-NEXT:    s_clause 0x1
1977; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1978; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
1979; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1980; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1981; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1982; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
1983; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1984; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
1985; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1986; GFX11-WGP-NEXT:    s_endpgm
1987;
1988; GFX11-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1989; GFX11-CU:       ; %bb.0: ; %entry
1990; GFX11-CU-NEXT:    s_clause 0x1
1991; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
1992; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
1993; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1994; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1995; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1996; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
1997; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1998; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
1999; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2000; GFX11-CU-NEXT:    s_endpgm
2001    i32* %out, i32 %in) {
2002entry:
  ; The store below consumes %val, which is why the returning swap form is expected.
2003  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst
2004  store i32 %val, i32* %out, align 4
2005  ret void
2006}
2007
; Kernel under test: a volatile cmpxchg on a flat i32 pointer, issued at
; syncscope("singlethread") with monotonic success ordering and monotonic
; failure ordering. The exchange targets element 4 of %out (byte offset 16)
; and its result is left unused.
;
; For every run line the expected code is the kernarg loads, the operand
; copies into VGPRs and a single flat_atomic_cmpswap; no wait or
; cache-maintenance instruction follows the atomic. The gfx700 and gfx1010
; runs add the 16 bytes with s_add_u32/s_addc_u32, while the gfx90a, gfx940
; and gfx1100 runs use the instruction's offset:16 field instead.
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Address of element 4 of %out, i.e. 16 bytes past the base pointer.
  %gep = getelementptr i32, i32* %out, i32 4
  ; Write %in if the current value equals %old; the result pair is not used.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
  ret void
}
2136
; Kernel under test: a volatile cmpxchg on a flat i32 pointer, issued at
; syncscope("singlethread") with acquire success ordering and monotonic
; failure ordering. The exchange targets element 4 of %out (byte offset 16)
; and its result is left unused.
;
; For every run line the expected code is the kernarg loads, the operand
; copies into VGPRs and a single flat_atomic_cmpswap; no wait or
; cache-maintenance instruction follows the atomic. The gfx700 and gfx1010
; runs add the 16 bytes with s_add_u32/s_addc_u32, while the gfx90a, gfx940
; and gfx1100 runs use the instruction's offset:16 field instead.
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Address of element 4 of %out, i.e. 16 bytes past the base pointer.
  %gep = getelementptr i32, i32* %out, i32 4
  ; Write %in if the current value equals %old; the result pair is not used.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
  ret void
}
2265
; Kernel under test: a volatile cmpxchg on a flat i32 pointer, issued at
; syncscope("singlethread") with release success ordering and monotonic
; failure ordering. The exchange targets element 4 of %out (byte offset 16)
; and its result is left unused.
;
; For every run line the expected code is the kernarg loads, the operand
; copies into VGPRs and a single flat_atomic_cmpswap; the only s_waitcnt is
; the one that follows the kernarg loads, and no cache-maintenance
; instruction appears. The gfx700 and gfx1010 runs add the 16 bytes with
; s_add_u32/s_addc_u32, while the gfx90a, gfx940 and gfx1100 runs use the
; instruction's offset:16 field instead.
define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Address of element 4 of %out, i.e. 16 bytes past the base pointer.
  %gep = getelementptr i32, i32* %out, i32 4
  ; Write %in if the current value equals %old; the result pair is not used.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
  ret void
}
2394
; Kernel under test: a volatile cmpxchg on a flat i32 pointer, issued at
; syncscope("singlethread") with acq_rel success ordering and monotonic
; failure ordering. The exchange targets element 4 of %out (byte offset 16)
; and its result is left unused.
;
; For every run line the expected code is the kernarg loads, the operand
; copies into VGPRs and a single flat_atomic_cmpswap; the only s_waitcnt is
; the one that follows the kernarg loads, and no cache-maintenance
; instruction appears. The gfx700 and gfx1010 runs add the 16 bytes with
; s_add_u32/s_addc_u32, while the gfx90a, gfx940 and gfx1100 runs use the
; instruction's offset:16 field instead.
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Address of element 4 of %out, i.e. 16 bytes past the base pointer.
  %gep = getelementptr i32, i32* %out, i32 4
  ; Write %in if the current value equals %old; the result pair is not used.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
  ret void
}
2523
; Kernel under test: a volatile cmpxchg on a flat i32 pointer, issued at
; syncscope("singlethread") with seq_cst success ordering and monotonic
; failure ordering. The exchange targets element 4 of %out (byte offset 16)
; and its result is left unused.
;
; For every run line the expected code is the kernarg loads, the operand
; copies into VGPRs and a single flat_atomic_cmpswap; the only s_waitcnt is
; the one that follows the kernarg loads, and no cache-maintenance
; instruction appears. The gfx700 and gfx1010 runs add the 16 bytes with
; s_add_u32/s_addc_u32, while the gfx90a, gfx940 and gfx1100 runs use the
; instruction's offset:16 field instead.
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Address of element 4 of %out, i.e. 16 bytes past the base pointer.
  %gep = getelementptr i32, i32* %out, i32 4
  ; Write %in if the current value equals %old; the result pair is not used.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
  ret void
}
2652
; Test kernel: cmpxchg on a default-address-space i32 pointer at
; syncscope("singlethread") with success ordering monotonic and failure
; ordering acquire.
;   %out - base pointer; the atomic targets i32 element 4 from it
;   %in  - value to store if the comparison succeeds
;   %old - value the memory location is compared against
; In every check block below the atomic is lowered to a flat_atomic_cmpswap
; that is the last instruction before s_endpgm (the GFX11 blocks additionally
; expect s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) in between).
2653define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
2654; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
2655; GFX7:       ; %bb.0: ; %entry
2656; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2657; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2658; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2659; GFX7-NEXT:    s_add_u32 s0, s0, 16
2660; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2661; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2662; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2663; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2664; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2665; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2666; GFX7-NEXT:    s_endpgm
2667;
2668; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
2669; GFX10-WGP:       ; %bb.0: ; %entry
2670; GFX10-WGP-NEXT:    s_clause 0x1
2671; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2672; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2673; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2674; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2675; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2676; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2677; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2678; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2679; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2680; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2681; GFX10-WGP-NEXT:    s_endpgm
2682;
2683; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
2684; GFX10-CU:       ; %bb.0: ; %entry
2685; GFX10-CU-NEXT:    s_clause 0x1
2686; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2687; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2688; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2689; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2690; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2691; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2692; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2693; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2694; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2695; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2696; GFX10-CU-NEXT:    s_endpgm
2697;
2698; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
2699; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2700; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
2701; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
2702; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2703; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2704; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2705; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2706; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2707; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2708; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2709; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2710; SKIP-CACHE-INV-NEXT:    s_endpgm
2711;
2712; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
2713; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2714; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2715; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2716; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2717; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2718; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2719; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2720; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2721;
2722; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
2723; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2724; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2725; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2726; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2727; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2728; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2729; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2730; GFX90A-TGSPLIT-NEXT:    s_endpgm
2731;
2732; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
2733; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2734; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
2735; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
2736; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2737; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
2738; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
2739; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2740; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2741;
2742; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
2743; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2744; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
2745; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
2746; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2747; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
2748; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
2749; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2750; GFX940-TGSPLIT-NEXT:    s_endpgm
2751;
2752; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
2753; GFX11-WGP:       ; %bb.0: ; %entry
2754; GFX11-WGP-NEXT:    s_clause 0x1
2755; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
2756; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
2757; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2758; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
2759; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
2760; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2761; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2762; GFX11-WGP-NEXT:    s_endpgm
2763;
2764; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
2765; GFX11-CU:       ; %bb.0: ; %entry
2766; GFX11-CU-NEXT:    s_clause 0x1
2767; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
2768; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
2769; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2770; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
2771; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
2772; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2773; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2774; GFX11-CU-NEXT:    s_endpgm
2775    i32* %out, i32 %in, i32 %old) {
2776entry:
  ; i32 element 4 is a 16-byte displacement. The checks above show it either
  ; as offset:16 on the atomic or as an s_add_u32/s_addc_u32 of 16 on the
  ; pointer before the atomic.
2777  %gep = getelementptr i32, i32* %out, i32 4
  ; %val is never read; the kernel returns right after the cmpxchg.
2778  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
2779  ret void
2780}
2781
; Test kernel: cmpxchg on a default-address-space i32 pointer at
; syncscope("singlethread") with success ordering acquire and failure
; ordering acquire.
;   %out - base pointer; the atomic targets i32 element 4 from it
;   %in  - value to store if the comparison succeeds
;   %old - value the memory location is compared against
; In every check block below the atomic is lowered to a flat_atomic_cmpswap
; that is the last instruction before s_endpgm (the GFX11 blocks additionally
; expect s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) in between).
2782define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
2783; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
2784; GFX7:       ; %bb.0: ; %entry
2785; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2786; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2787; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2788; GFX7-NEXT:    s_add_u32 s0, s0, 16
2789; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2790; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2791; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2792; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2793; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2794; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2795; GFX7-NEXT:    s_endpgm
2796;
2797; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
2798; GFX10-WGP:       ; %bb.0: ; %entry
2799; GFX10-WGP-NEXT:    s_clause 0x1
2800; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2801; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2802; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2803; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2804; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2805; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2806; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2807; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2808; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2809; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2810; GFX10-WGP-NEXT:    s_endpgm
2811;
2812; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
2813; GFX10-CU:       ; %bb.0: ; %entry
2814; GFX10-CU-NEXT:    s_clause 0x1
2815; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2816; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2817; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2818; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2819; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2820; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2821; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2822; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2823; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2824; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2825; GFX10-CU-NEXT:    s_endpgm
2826;
2827; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
2828; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2829; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
2830; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
2831; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2832; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2833; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2834; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2835; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2836; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2837; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2838; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2839; SKIP-CACHE-INV-NEXT:    s_endpgm
2840;
2841; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
2842; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2843; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2844; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2845; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2846; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2847; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2848; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2849; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2850;
2851; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
2852; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2853; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2854; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2855; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2856; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2857; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2858; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2859; GFX90A-TGSPLIT-NEXT:    s_endpgm
2860;
2861; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
2862; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2863; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
2864; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
2865; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2866; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
2867; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
2868; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2869; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2870;
2871; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
2872; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2873; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
2874; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
2875; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2876; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
2877; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
2878; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2879; GFX940-TGSPLIT-NEXT:    s_endpgm
2880;
2881; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
2882; GFX11-WGP:       ; %bb.0: ; %entry
2883; GFX11-WGP-NEXT:    s_clause 0x1
2884; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
2885; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
2886; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2887; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
2888; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
2889; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2890; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2891; GFX11-WGP-NEXT:    s_endpgm
2892;
2893; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
2894; GFX11-CU:       ; %bb.0: ; %entry
2895; GFX11-CU-NEXT:    s_clause 0x1
2896; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
2897; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
2898; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2899; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
2900; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
2901; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2902; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2903; GFX11-CU-NEXT:    s_endpgm
2904    i32* %out, i32 %in, i32 %old) {
2905entry:
  ; i32 element 4 is a 16-byte displacement. The checks above show it either
  ; as offset:16 on the atomic or as an s_add_u32/s_addc_u32 of 16 on the
  ; pointer before the atomic.
2906  %gep = getelementptr i32, i32* %out, i32 4
  ; %val is never read; the kernel returns right after the cmpxchg.
2907  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
2908  ret void
2909}
2910
; Test kernel: cmpxchg on a default-address-space i32 pointer at
; syncscope("singlethread") with success ordering release and failure
; ordering acquire.
;   %out - base pointer; the atomic targets i32 element 4 from it
;   %in  - value to store if the comparison succeeds
;   %old - value the memory location is compared against
; In every check block below the atomic is lowered to a flat_atomic_cmpswap
; that is the last instruction before s_endpgm (the GFX11 blocks additionally
; expect s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) in between).
2911define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
2912; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
2913; GFX7:       ; %bb.0: ; %entry
2914; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2915; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2916; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2917; GFX7-NEXT:    s_add_u32 s0, s0, 16
2918; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2919; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2920; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2921; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2922; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2923; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2924; GFX7-NEXT:    s_endpgm
2925;
2926; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
2927; GFX10-WGP:       ; %bb.0: ; %entry
2928; GFX10-WGP-NEXT:    s_clause 0x1
2929; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2930; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2931; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2932; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2933; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2934; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2935; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2936; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2937; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2938; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2939; GFX10-WGP-NEXT:    s_endpgm
2940;
2941; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
2942; GFX10-CU:       ; %bb.0: ; %entry
2943; GFX10-CU-NEXT:    s_clause 0x1
2944; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2945; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2946; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2947; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2948; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2949; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2950; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2951; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2952; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2953; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2954; GFX10-CU-NEXT:    s_endpgm
2955;
2956; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_cmpxchg:
2957; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2958; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
2959; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
2960; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2961; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2962; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2963; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2964; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2965; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2966; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2967; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2968; SKIP-CACHE-INV-NEXT:    s_endpgm
2969;
2970; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
2971; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2972; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2973; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2974; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2975; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2976; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2977; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2978; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2979;
2980; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
2981; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2982; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2983; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2984; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2985; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2986; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2987; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2988; GFX90A-TGSPLIT-NEXT:    s_endpgm
2989;
2990; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
2991; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2992; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
2993; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
2994; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2995; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
2996; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
2997; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2998; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2999;
3000; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
3001; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3002; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3003; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3004; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3005; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3006; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3007; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3008; GFX940-TGSPLIT-NEXT:    s_endpgm
3009;
3010; GFX11-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
3011; GFX11-WGP:       ; %bb.0: ; %entry
3012; GFX11-WGP-NEXT:    s_clause 0x1
3013; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3014; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3015; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3016; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3017; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3018; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3019; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3020; GFX11-WGP-NEXT:    s_endpgm
3021;
3022; GFX11-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
3023; GFX11-CU:       ; %bb.0: ; %entry
3024; GFX11-CU-NEXT:    s_clause 0x1
3025; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3026; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3027; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3028; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3029; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3030; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3031; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3032; GFX11-CU-NEXT:    s_endpgm
3033    i32* %out, i32 %in, i32 %old) {
3034entry:
  ; i32 element 4 is a 16-byte displacement. The checks above show it either
  ; as offset:16 on the atomic or as an s_add_u32/s_addc_u32 of 16 on the
  ; pointer before the atomic.
3035  %gep = getelementptr i32, i32* %out, i32 4
  ; %val is never read; the kernel returns right after the cmpxchg.
3036  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
3037  ret void
3038}
3039
; Test kernel: cmpxchg on a default-address-space i32 pointer at
; syncscope("singlethread") with success ordering acq_rel and failure
; ordering acquire.
;   %out - base pointer; the atomic targets i32 element 4 from it
;   %in  - value to store if the comparison succeeds
;   %old - value the memory location is compared against
; In every check block below the atomic is lowered to a flat_atomic_cmpswap
; that is the last instruction before s_endpgm (the GFX11 blocks additionally
; expect s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) in between).
3040define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
3041; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
3042; GFX7:       ; %bb.0: ; %entry
3043; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3044; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3045; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3046; GFX7-NEXT:    s_add_u32 s0, s0, 16
3047; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3048; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3049; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3050; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3051; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3052; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3053; GFX7-NEXT:    s_endpgm
3054;
3055; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
3056; GFX10-WGP:       ; %bb.0: ; %entry
3057; GFX10-WGP-NEXT:    s_clause 0x1
3058; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3059; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3060; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3061; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
3062; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
3063; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3064; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3065; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3066; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3067; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3068; GFX10-WGP-NEXT:    s_endpgm
3069;
3070; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
3071; GFX10-CU:       ; %bb.0: ; %entry
3072; GFX10-CU-NEXT:    s_clause 0x1
3073; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3074; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3075; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3076; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
3077; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
3078; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3079; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3080; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3081; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3082; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3083; GFX10-CU-NEXT:    s_endpgm
3084;
3085; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
3086; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3087; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3088; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
3089; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3090; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
3091; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
3092; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3093; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3094; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3095; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3096; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3097; SKIP-CACHE-INV-NEXT:    s_endpgm
3098;
3099; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
3100; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3101; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3102; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3103; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3104; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3105; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3106; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3107; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3108;
3109; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
3110; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3111; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3112; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3113; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3114; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3115; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3116; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3117; GFX90A-TGSPLIT-NEXT:    s_endpgm
3118;
3119; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
3120; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3121; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3122; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3123; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3124; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3125; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3126; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3127; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3128;
3129; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
3130; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3131; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3132; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3133; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3134; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3135; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3136; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3137; GFX940-TGSPLIT-NEXT:    s_endpgm
3138;
3139; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
3140; GFX11-WGP:       ; %bb.0: ; %entry
3141; GFX11-WGP-NEXT:    s_clause 0x1
3142; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3143; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3144; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3145; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3146; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3147; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3148; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3149; GFX11-WGP-NEXT:    s_endpgm
3150;
3151; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
3152; GFX11-CU:       ; %bb.0: ; %entry
3153; GFX11-CU-NEXT:    s_clause 0x1
3154; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3155; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3156; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3157; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3158; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3159; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3160; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3161; GFX11-CU-NEXT:    s_endpgm
3162    i32* %out, i32 %in, i32 %old) {
3163entry:
  ; i32 element 4 is a 16-byte displacement. The checks above show it either
  ; as offset:16 on the atomic or as an s_add_u32/s_addc_u32 of 16 on the
  ; pointer before the atomic.
3164  %gep = getelementptr i32, i32* %out, i32 4
  ; %val is never read; the kernel returns right after the cmpxchg.
3165  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
3166  ret void
3167}
3168
; Test kernel: cmpxchg on a default-address-space i32 pointer at
; syncscope("singlethread") with success ordering seq_cst and failure
; ordering acquire.
;   %out - base pointer; the atomic targets i32 element 4 from it
;   %in  - value to store if the comparison succeeds
;   %old - value the memory location is compared against
; In every check block below the atomic is lowered to a flat_atomic_cmpswap
; that is the last instruction before s_endpgm (the GFX11 blocks additionally
; expect s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) in between).
3169define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
3170; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
3171; GFX7:       ; %bb.0: ; %entry
3172; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3173; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3174; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3175; GFX7-NEXT:    s_add_u32 s0, s0, 16
3176; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3177; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3178; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3179; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3180; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3181; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3182; GFX7-NEXT:    s_endpgm
3183;
3184; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
3185; GFX10-WGP:       ; %bb.0: ; %entry
3186; GFX10-WGP-NEXT:    s_clause 0x1
3187; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3188; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3189; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3190; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
3191; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
3192; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3193; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3194; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3195; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3196; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3197; GFX10-WGP-NEXT:    s_endpgm
3198;
3199; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
3200; GFX10-CU:       ; %bb.0: ; %entry
3201; GFX10-CU-NEXT:    s_clause 0x1
3202; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3203; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3204; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3205; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
3206; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
3207; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3208; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3209; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3210; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3211; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3212; GFX10-CU-NEXT:    s_endpgm
3213;
3214; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
3215; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3216; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3217; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
3218; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3219; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
3220; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
3221; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3222; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3223; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3224; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3225; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3226; SKIP-CACHE-INV-NEXT:    s_endpgm
3227;
3228; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
3229; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3230; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3231; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3232; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3233; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3234; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3235; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3236; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3237;
3238; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
3239; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3240; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3241; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3242; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3243; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3244; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3245; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3246; GFX90A-TGSPLIT-NEXT:    s_endpgm
3247;
3248; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
3249; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3250; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3251; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3252; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3253; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3254; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3255; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3256; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3257;
3258; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
3259; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3260; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3261; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3262; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3263; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3264; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3265; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3266; GFX940-TGSPLIT-NEXT:    s_endpgm
3267;
3268; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
3269; GFX11-WGP:       ; %bb.0: ; %entry
3270; GFX11-WGP-NEXT:    s_clause 0x1
3271; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3272; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3273; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3274; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3275; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3276; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3277; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3278; GFX11-WGP-NEXT:    s_endpgm
3279;
3280; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
3281; GFX11-CU:       ; %bb.0: ; %entry
3282; GFX11-CU-NEXT:    s_clause 0x1
3283; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3284; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3285; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3286; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3287; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3288; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3289; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3290; GFX11-CU-NEXT:    s_endpgm
3291    i32* %out, i32 %in, i32 %old) {
3292entry:
  ; i32 element 4 is a 16-byte displacement. The checks above show it either
  ; as offset:16 on the atomic or as an s_add_u32/s_addc_u32 of 16 on the
  ; pointer before the atomic.
3293  %gep = getelementptr i32, i32* %out, i32 4
  ; %val is never read; the kernel returns right after the cmpxchg.
3294  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
3295  ret void
3296}
3297
; Test: volatile i32 cmpxchg at syncscope("singlethread") with success ordering
; monotonic and failure ordering seq_cst. The result (%val) is unused. Every
; check block below expects the compare-and-swap without a destination
; register, followed directly by the end of the program (the GFX11 blocks add
; only the MSG_DEALLOC_VGPRS s_sendmsg), with no wait or cache-maintenance
; instruction after the atomic.
; The check lines are produced by utils/update_llc_test_checks.py (see the
; file header); regenerate them rather than editing them by hand.
3298define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
3299; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
3300; GFX7:       ; %bb.0: ; %entry
3301; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3302; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3303; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3304; GFX7-NEXT:    s_add_u32 s0, s0, 16
3305; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3306; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3307; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3308; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3309; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3310; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3311; GFX7-NEXT:    s_endpgm
3312;
3313; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
3314; GFX10-WGP:       ; %bb.0: ; %entry
3315; GFX10-WGP-NEXT:    s_clause 0x1
3316; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3317; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3318; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3319; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
3320; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
3321; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3322; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3323; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3324; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3325; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3326; GFX10-WGP-NEXT:    s_endpgm
3327;
3328; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
3329; GFX10-CU:       ; %bb.0: ; %entry
3330; GFX10-CU-NEXT:    s_clause 0x1
3331; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3332; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3333; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3334; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
3335; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
3336; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3337; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3338; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3339; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3340; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3341; GFX10-CU-NEXT:    s_endpgm
3342;
3343; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
3344; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3345; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3346; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
3347; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3348; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
3349; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
3350; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3351; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3352; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3353; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3354; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3355; SKIP-CACHE-INV-NEXT:    s_endpgm
3356;
3357; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
3358; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3359; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3360; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3361; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3362; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3363; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3364; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3365; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3366;
3367; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
3368; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3369; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3370; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3371; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3372; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3373; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3374; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3375; GFX90A-TGSPLIT-NEXT:    s_endpgm
3376;
3377; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
3378; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3379; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3380; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3381; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3382; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3383; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3384; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3385; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3386;
3387; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
3388; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3389; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3390; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3391; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3392; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3393; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3394; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3395; GFX940-TGSPLIT-NEXT:    s_endpgm
3396;
3397; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
3398; GFX11-WGP:       ; %bb.0: ; %entry
3399; GFX11-WGP-NEXT:    s_clause 0x1
3400; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3401; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3402; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3403; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3404; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3405; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3406; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3407; GFX11-WGP-NEXT:    s_endpgm
3408;
3409; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
3410; GFX11-CU:       ; %bb.0: ; %entry
3411; GFX11-CU-NEXT:    s_clause 0x1
3412; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3413; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3414; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3415; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3416; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3417; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3418; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3419; GFX11-CU-NEXT:    s_endpgm
3420    i32* %out, i32 %in, i32 %old) {
3421entry:
  ; The atomic targets element 4 of %out, i.e. a 16-byte offset; the checks
  ; above show it either as an explicit 64-bit scalar add of 16 or as an
  ; immediate offset of 16 on the flat instruction.
3422  %gep = getelementptr i32, i32* %out, i32 4
  ; cmpxchg operand order is pointer, comparison value (%old), new value (%in);
  ; the two trailing orderings are success (monotonic) then failure (seq_cst).
3423  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
3424  ret void
3425}
3426
; Test: volatile i32 cmpxchg at syncscope("singlethread") with success ordering
; acquire and failure ordering seq_cst. The result (%val) is unused. Every
; check block below expects the compare-and-swap without a destination
; register, followed directly by the end of the program (the GFX11 blocks add
; only the MSG_DEALLOC_VGPRS s_sendmsg), with no wait or cache-maintenance
; instruction after the atomic.
; The check lines are produced by utils/update_llc_test_checks.py (see the
; file header); regenerate them rather than editing them by hand.
3427define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
3428; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
3429; GFX7:       ; %bb.0: ; %entry
3430; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3431; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3432; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3433; GFX7-NEXT:    s_add_u32 s0, s0, 16
3434; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3435; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3436; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3437; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3438; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3439; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3440; GFX7-NEXT:    s_endpgm
3441;
3442; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
3443; GFX10-WGP:       ; %bb.0: ; %entry
3444; GFX10-WGP-NEXT:    s_clause 0x1
3445; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3446; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3447; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3448; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
3449; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
3450; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3451; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3452; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3453; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3454; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3455; GFX10-WGP-NEXT:    s_endpgm
3456;
3457; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
3458; GFX10-CU:       ; %bb.0: ; %entry
3459; GFX10-CU-NEXT:    s_clause 0x1
3460; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3461; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3462; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3463; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
3464; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
3465; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3466; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3467; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3468; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3469; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3470; GFX10-CU-NEXT:    s_endpgm
3471;
3472; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
3473; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3474; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3475; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
3476; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3477; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
3478; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
3479; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3480; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3481; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3482; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3483; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3484; SKIP-CACHE-INV-NEXT:    s_endpgm
3485;
3486; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
3487; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3488; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3489; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3490; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3491; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3492; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3493; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3494; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3495;
3496; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
3497; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3498; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3499; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3500; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3501; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3502; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3503; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3504; GFX90A-TGSPLIT-NEXT:    s_endpgm
3505;
3506; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
3507; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3508; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3509; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3510; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3511; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3512; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3513; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3514; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3515;
3516; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
3517; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3518; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3519; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3520; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3521; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3522; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3523; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3524; GFX940-TGSPLIT-NEXT:    s_endpgm
3525;
3526; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
3527; GFX11-WGP:       ; %bb.0: ; %entry
3528; GFX11-WGP-NEXT:    s_clause 0x1
3529; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3530; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3531; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3532; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3533; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3534; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3535; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3536; GFX11-WGP-NEXT:    s_endpgm
3537;
3538; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
3539; GFX11-CU:       ; %bb.0: ; %entry
3540; GFX11-CU-NEXT:    s_clause 0x1
3541; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3542; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3543; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3544; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3545; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3546; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3547; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3548; GFX11-CU-NEXT:    s_endpgm
3549    i32* %out, i32 %in, i32 %old) {
3550entry:
  ; The atomic targets element 4 of %out, i.e. a 16-byte offset; the checks
  ; above show it either as an explicit 64-bit scalar add of 16 or as an
  ; immediate offset of 16 on the flat instruction.
3551  %gep = getelementptr i32, i32* %out, i32 4
  ; cmpxchg operand order is pointer, comparison value (%old), new value (%in);
  ; the two trailing orderings are success (acquire) then failure (seq_cst).
3552  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
3553  ret void
3554}
3555
; Test: volatile i32 cmpxchg at syncscope("singlethread") with success ordering
; release and failure ordering seq_cst. The result (%val) is unused. Every
; check block below expects the compare-and-swap without a destination
; register, followed directly by the end of the program (the GFX11 blocks add
; only the MSG_DEALLOC_VGPRS s_sendmsg), with no wait or cache-maintenance
; instruction after the atomic.
; The check lines are produced by utils/update_llc_test_checks.py (see the
; file header); regenerate them rather than editing them by hand.
3556define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
3557; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
3558; GFX7:       ; %bb.0: ; %entry
3559; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3560; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3561; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3562; GFX7-NEXT:    s_add_u32 s0, s0, 16
3563; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3564; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3565; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3566; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3567; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3568; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3569; GFX7-NEXT:    s_endpgm
3570;
3571; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
3572; GFX10-WGP:       ; %bb.0: ; %entry
3573; GFX10-WGP-NEXT:    s_clause 0x1
3574; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3575; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3576; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3577; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
3578; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
3579; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3580; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3581; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3582; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3583; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3584; GFX10-WGP-NEXT:    s_endpgm
3585;
3586; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
3587; GFX10-CU:       ; %bb.0: ; %entry
3588; GFX10-CU-NEXT:    s_clause 0x1
3589; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3590; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3591; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3592; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
3593; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
3594; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3595; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3596; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3597; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3598; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3599; GFX10-CU-NEXT:    s_endpgm
3600;
3601; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
3602; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3603; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3604; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
3605; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3606; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
3607; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
3608; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3609; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3610; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3611; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3612; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3613; SKIP-CACHE-INV-NEXT:    s_endpgm
3614;
3615; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
3616; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3617; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3618; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3619; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3620; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3621; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3622; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3623; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3624;
3625; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
3626; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3627; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3628; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3629; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3630; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3631; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3632; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3633; GFX90A-TGSPLIT-NEXT:    s_endpgm
3634;
3635; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
3636; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3637; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3638; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3639; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3640; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3641; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3642; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3643; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3644;
3645; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
3646; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3647; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3648; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3649; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3650; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3651; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3652; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3653; GFX940-TGSPLIT-NEXT:    s_endpgm
3654;
3655; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
3656; GFX11-WGP:       ; %bb.0: ; %entry
3657; GFX11-WGP-NEXT:    s_clause 0x1
3658; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3659; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3660; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3661; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3662; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3663; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3664; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3665; GFX11-WGP-NEXT:    s_endpgm
3666;
3667; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
3668; GFX11-CU:       ; %bb.0: ; %entry
3669; GFX11-CU-NEXT:    s_clause 0x1
3670; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3671; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3672; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3673; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3674; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3675; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3676; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3677; GFX11-CU-NEXT:    s_endpgm
3678    i32* %out, i32 %in, i32 %old) {
3679entry:
  ; The atomic targets element 4 of %out, i.e. a 16-byte offset; the checks
  ; above show it either as an explicit 64-bit scalar add of 16 or as an
  ; immediate offset of 16 on the flat instruction.
3680  %gep = getelementptr i32, i32* %out, i32 4
  ; cmpxchg operand order is pointer, comparison value (%old), new value (%in);
  ; the two trailing orderings are success (release) then failure (seq_cst).
3681  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
3682  ret void
3683}
3684
; Test: volatile i32 cmpxchg at syncscope("singlethread") with success ordering
; acq_rel and failure ordering seq_cst. The result (%val) is unused. Every
; check block below expects the compare-and-swap without a destination
; register, followed directly by the end of the program (the GFX11 blocks add
; only the MSG_DEALLOC_VGPRS s_sendmsg), with no wait or cache-maintenance
; instruction after the atomic.
; The check lines are produced by utils/update_llc_test_checks.py (see the
; file header); regenerate them rather than editing them by hand.
3685define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
3686; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
3687; GFX7:       ; %bb.0: ; %entry
3688; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3689; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3690; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3691; GFX7-NEXT:    s_add_u32 s0, s0, 16
3692; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3693; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3694; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3695; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3696; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3697; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3698; GFX7-NEXT:    s_endpgm
3699;
3700; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
3701; GFX10-WGP:       ; %bb.0: ; %entry
3702; GFX10-WGP-NEXT:    s_clause 0x1
3703; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3704; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3705; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3706; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
3707; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
3708; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3709; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3710; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3711; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3712; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3713; GFX10-WGP-NEXT:    s_endpgm
3714;
3715; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
3716; GFX10-CU:       ; %bb.0: ; %entry
3717; GFX10-CU-NEXT:    s_clause 0x1
3718; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3719; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3720; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3721; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
3722; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
3723; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3724; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3725; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3726; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3727; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3728; GFX10-CU-NEXT:    s_endpgm
3729;
3730; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
3731; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3732; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3733; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
3734; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3735; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
3736; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
3737; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3738; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3739; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3740; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3741; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3742; SKIP-CACHE-INV-NEXT:    s_endpgm
3743;
3744; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
3745; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3746; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3747; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3748; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3749; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3750; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3751; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3752; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3753;
3754; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
3755; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3756; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3757; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3758; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3759; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3760; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3761; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3762; GFX90A-TGSPLIT-NEXT:    s_endpgm
3763;
3764; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
3765; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3766; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3767; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3768; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3769; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3770; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3771; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3772; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3773;
3774; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
3775; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3776; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3777; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3778; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3779; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3780; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3781; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3782; GFX940-TGSPLIT-NEXT:    s_endpgm
3783;
3784; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
3785; GFX11-WGP:       ; %bb.0: ; %entry
3786; GFX11-WGP-NEXT:    s_clause 0x1
3787; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3788; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3789; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3790; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3791; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3792; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3793; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3794; GFX11-WGP-NEXT:    s_endpgm
3795;
3796; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
3797; GFX11-CU:       ; %bb.0: ; %entry
3798; GFX11-CU-NEXT:    s_clause 0x1
3799; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3800; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3801; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3802; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3803; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3804; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3805; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3806; GFX11-CU-NEXT:    s_endpgm
3807    i32* %out, i32 %in, i32 %old) {
3808entry:
  ; The atomic targets element 4 of %out, i.e. a 16-byte offset; the checks
  ; above show it either as an explicit 64-bit scalar add of 16 or as an
  ; immediate offset of 16 on the flat instruction.
3809  %gep = getelementptr i32, i32* %out, i32 4
  ; cmpxchg operand order is pointer, comparison value (%old), new value (%in);
  ; the two trailing orderings are success (acq_rel) then failure (seq_cst).
3810  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
3811  ret void
3812}
3813
; Test: volatile i32 cmpxchg at syncscope("singlethread") with both the success
; and the failure ordering set to seq_cst. The result (%val) is unused. Every
; check block below expects the compare-and-swap without a destination
; register, followed directly by the end of the program (the GFX11 blocks add
; only the MSG_DEALLOC_VGPRS s_sendmsg), with no wait or cache-maintenance
; instruction after the atomic.
; The check lines are produced by utils/update_llc_test_checks.py (see the
; file header); regenerate them rather than editing them by hand.
3814define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
3815; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
3816; GFX7:       ; %bb.0: ; %entry
3817; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3818; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3819; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3820; GFX7-NEXT:    s_add_u32 s0, s0, 16
3821; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3822; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3823; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3824; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3825; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3826; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3827; GFX7-NEXT:    s_endpgm
3828;
3829; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
3830; GFX10-WGP:       ; %bb.0: ; %entry
3831; GFX10-WGP-NEXT:    s_clause 0x1
3832; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3833; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3834; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3835; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
3836; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
3837; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3838; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3839; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3840; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3841; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3842; GFX10-WGP-NEXT:    s_endpgm
3843;
3844; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
3845; GFX10-CU:       ; %bb.0: ; %entry
3846; GFX10-CU-NEXT:    s_clause 0x1
3847; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3848; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3849; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3850; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
3851; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
3852; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3853; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3854; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3855; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3856; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3857; GFX10-CU-NEXT:    s_endpgm
3858;
3859; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
3860; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3861; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3862; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
3863; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3864; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
3865; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
3866; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3867; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3868; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3869; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3870; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3871; SKIP-CACHE-INV-NEXT:    s_endpgm
3872;
3873; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
3874; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3875; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3876; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3877; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3878; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3879; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3880; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3881; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3882;
3883; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
3884; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3885; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3886; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3887; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3888; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3889; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3890; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3891; GFX90A-TGSPLIT-NEXT:    s_endpgm
3892;
3893; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
3894; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3895; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3896; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3897; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3898; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3899; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3900; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3901; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3902;
3903; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
3904; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3905; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
3906; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
3907; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3908; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
3909; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
3910; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3911; GFX940-TGSPLIT-NEXT:    s_endpgm
3912;
3913; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
3914; GFX11-WGP:       ; %bb.0: ; %entry
3915; GFX11-WGP-NEXT:    s_clause 0x1
3916; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3917; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3918; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3919; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3920; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3921; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3922; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3923; GFX11-WGP-NEXT:    s_endpgm
3924;
3925; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
3926; GFX11-CU:       ; %bb.0: ; %entry
3927; GFX11-CU-NEXT:    s_clause 0x1
3928; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
3929; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
3930; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3931; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
3932; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3933; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3934; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3935; GFX11-CU-NEXT:    s_endpgm
3936    i32* %out, i32 %in, i32 %old) {
3937entry:
  ; The atomic targets element 4 of %out, i.e. a 16-byte offset; the checks
  ; above show it either as an explicit 64-bit scalar add of 16 or as an
  ; immediate offset of 16 on the flat instruction.
3938  %gep = getelementptr i32, i32* %out, i32 4
  ; cmpxchg operand order is pointer, comparison value (%old), new value (%in);
  ; the two trailing orderings are success (seq_cst) then failure (seq_cst).
3939  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
3940  ret void
3941}
3942
; Returning cmpxchg through a flat pointer at syncscope("singlethread"), with
; success ordering monotonic and failure ordering monotonic.
;
; The kernel receives (i32* %out, i32 %in, i32 %old). It runs a volatile cmpxchg
; on element 4 of %out (byte offset 16), comparing against %old and swapping in
; %in, then stores field 0 of the result (the value that was loaded) to %out,
; which keeps the return value of the atomic live.
;
; What the assertions below expect
;   - The gfx700 and gfx1010 runs add 16 to the base with s_add_u32/s_addc_u32
;     and issue flat_atomic_cmpswap without an offset operand; the gfx90a,
;     gfx940 and gfx1100 runs put offset 16 on the instruction instead.
;   - The cmpswap carries glc, except on gfx940 where it carries sc0.
;   - One s_waitcnt sits between the cmpswap and the dependent store. It waits
;     on vmcnt(0) lgkmcnt(0), or on vmcnt(0) alone in the +tgsplit runs.
;   - Apart from the s_waitcnt that follows the kernel-argument loads, no other
;     wait or cache-maintenance instruction appears.
;
; NOTE(review) - every line of this function starts with a decimal number fused
; to its text (e.g. "4096  %val = ..."); that looks like an extraction artifact
; rather than part of the test. Confirm against the upstream file.
3943define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
3944; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
3945; GFX7:       ; %bb.0: ; %entry
3946; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3947; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3948; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3949; GFX7-NEXT:    s_add_u32 s4, s0, 16
3950; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3951; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3952; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3953; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3954; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3955; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3956; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3957; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3958; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3959; GFX7-NEXT:    flat_store_dword v[0:1], v2
3960; GFX7-NEXT:    s_endpgm
3961;
3962; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
3963; GFX10-WGP:       ; %bb.0: ; %entry
3964; GFX10-WGP-NEXT:    s_clause 0x1
3965; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3966; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3967; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3968; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
3969; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
3970; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3971; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3972; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3973; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3974; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3975; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3976; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3977; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3978; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3979; GFX10-WGP-NEXT:    s_endpgm
3980;
3981; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
3982; GFX10-CU:       ; %bb.0: ; %entry
3983; GFX10-CU-NEXT:    s_clause 0x1
3984; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3985; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3986; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3987; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
3988; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
3989; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3990; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3991; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3992; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3993; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3994; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3995; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3996; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3997; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3998; GFX10-CU-NEXT:    s_endpgm
3999;
4000; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
4001; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4002; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4003; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
4004; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4005; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4006; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4007; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4008; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4009; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4010; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4011; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4012; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4013; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4014; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4015; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4016; SKIP-CACHE-INV-NEXT:    s_endpgm
4017;
4018; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
4019; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4020; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4021; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4022; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4023; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4024; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4025; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4026; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4027; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4028; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4029;
4030; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
4031; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4032; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4033; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4034; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4035; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4036; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4037; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4038; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4039; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4040; GFX90A-TGSPLIT-NEXT:    s_endpgm
4041;
4042; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
4043; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4044; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4045; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
4046; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4047; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
4048; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
4049; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
4050; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4051; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4052; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4053;
4054; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
4055; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4056; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4057; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
4058; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4059; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
4060; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
4061; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
4062; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4063; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4064; GFX940-TGSPLIT-NEXT:    s_endpgm
4065;
4066; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
4067; GFX11-WGP:       ; %bb.0: ; %entry
4068; GFX11-WGP-NEXT:    s_clause 0x1
4069; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
4070; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
4071; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4072; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
4073; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
4074; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
4075; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4076; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
4077; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4078; GFX11-WGP-NEXT:    s_endpgm
4079;
4080; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
4081; GFX11-CU:       ; %bb.0: ; %entry
4082; GFX11-CU-NEXT:    s_clause 0x1
4083; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
4084; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
4085; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4086; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
4087; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
4088; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
4089; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4090; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
4091; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4092; GFX11-CU-NEXT:    s_endpgm
4093    i32* %out, i32 %in, i32 %old) {
4094entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
4095  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, swap in %in; both orderings are monotonic.
4096  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
  ; Field 0 of the { i32, i1 } result is the loaded value; field 1 is not read.
4097  %val0 = extractvalue { i32, i1 } %val, 0
4098  store i32 %val0, i32* %out, align 4
4099  ret void
4100}
4101
; Returning cmpxchg through a flat pointer at syncscope("singlethread"), with
; success ordering acquire and failure ordering monotonic.
;
; The kernel receives (i32* %out, i32 %in, i32 %old). It runs a volatile cmpxchg
; on element 4 of %out (byte offset 16), comparing against %old and swapping in
; %in, then stores field 0 of the result (the value that was loaded) to %out,
; which keeps the return value of the atomic live.
;
; What the assertions below expect
;   - The gfx700 and gfx1010 runs add 16 to the base with s_add_u32/s_addc_u32
;     and issue flat_atomic_cmpswap without an offset operand; the gfx90a,
;     gfx940 and gfx1100 runs put offset 16 on the instruction instead.
;   - The cmpswap carries glc, except on gfx940 where it carries sc0.
;   - One s_waitcnt sits between the cmpswap and the dependent store. It waits
;     on vmcnt(0) lgkmcnt(0), or on vmcnt(0) alone in the +tgsplit runs.
;   - Apart from the s_waitcnt that follows the kernel-argument loads, no other
;     wait or cache-maintenance instruction appears. These are the same
;     sequences the file expects for
;     flat_singlethread_monotonic_monotonic_ret_cmpxchg.
;
; NOTE(review) - every line of this function starts with a decimal number fused
; to its text (e.g. "4255  %val = ..."); that looks like an extraction artifact
; rather than part of the test. Confirm against the upstream file.
4102define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
4103; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
4104; GFX7:       ; %bb.0: ; %entry
4105; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4106; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4107; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4108; GFX7-NEXT:    s_add_u32 s4, s0, 16
4109; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4110; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4111; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4112; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4113; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4114; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4115; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4116; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4117; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4118; GFX7-NEXT:    flat_store_dword v[0:1], v2
4119; GFX7-NEXT:    s_endpgm
4120;
4121; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
4122; GFX10-WGP:       ; %bb.0: ; %entry
4123; GFX10-WGP-NEXT:    s_clause 0x1
4124; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4125; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4126; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4127; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4128; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4129; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4130; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4131; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4132; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4133; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4134; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4135; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4136; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4137; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4138; GFX10-WGP-NEXT:    s_endpgm
4139;
4140; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
4141; GFX10-CU:       ; %bb.0: ; %entry
4142; GFX10-CU-NEXT:    s_clause 0x1
4143; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4144; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4145; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4146; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4147; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4148; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4149; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4150; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4151; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4152; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4153; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4154; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4155; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4156; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4157; GFX10-CU-NEXT:    s_endpgm
4158;
4159; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
4160; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4161; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4162; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
4163; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4164; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4165; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4166; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4167; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4168; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4169; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4170; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4171; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4172; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4173; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4174; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4175; SKIP-CACHE-INV-NEXT:    s_endpgm
4176;
4177; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
4178; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4179; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4180; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4181; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4182; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4183; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4184; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4185; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4186; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4187; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4188;
4189; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
4190; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4191; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4192; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4193; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4194; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4195; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4196; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4197; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4198; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4199; GFX90A-TGSPLIT-NEXT:    s_endpgm
4200;
4201; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
4202; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4203; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4204; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
4205; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4206; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
4207; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
4208; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
4209; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4210; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4211; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4212;
4213; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
4214; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4215; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4216; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
4217; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4218; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
4219; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
4220; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
4221; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4222; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4223; GFX940-TGSPLIT-NEXT:    s_endpgm
4224;
4225; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
4226; GFX11-WGP:       ; %bb.0: ; %entry
4227; GFX11-WGP-NEXT:    s_clause 0x1
4228; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
4229; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
4230; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4231; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
4232; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
4233; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
4234; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4235; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
4236; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4237; GFX11-WGP-NEXT:    s_endpgm
4238;
4239; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
4240; GFX11-CU:       ; %bb.0: ; %entry
4241; GFX11-CU-NEXT:    s_clause 0x1
4242; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
4243; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
4244; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4245; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
4246; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
4247; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
4248; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4249; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
4250; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4251; GFX11-CU-NEXT:    s_endpgm
4252    i32* %out, i32 %in, i32 %old) {
4253entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
4254  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, swap in %in; acquire on success, monotonic on failure.
4255  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
  ; Field 0 of the { i32, i1 } result is the loaded value; field 1 is not read.
4256  %val0 = extractvalue { i32, i1 } %val, 0
4257  store i32 %val0, i32* %out, align 4
4258  ret void
4259}
4260
; Returning cmpxchg through a flat pointer at syncscope("singlethread"), with
; success ordering release and failure ordering monotonic.
;
; The kernel receives (i32* %out, i32 %in, i32 %old). It runs a volatile cmpxchg
; on element 4 of %out (byte offset 16), comparing against %old and swapping in
; %in, then stores field 0 of the result (the value that was loaded) to %out,
; which keeps the return value of the atomic live.
;
; What the assertions below expect
;   - The gfx700 and gfx1010 runs add 16 to the base with s_add_u32/s_addc_u32
;     and issue flat_atomic_cmpswap without an offset operand; the gfx90a,
;     gfx940 and gfx1100 runs put offset 16 on the instruction instead.
;   - The cmpswap carries glc, except on gfx940 where it carries sc0.
;   - One s_waitcnt sits between the cmpswap and the dependent store. It waits
;     on vmcnt(0) lgkmcnt(0), or on vmcnt(0) alone in the +tgsplit runs.
;   - Nothing is expected between the register moves that set up the operands
;     and the cmpswap itself, and apart from the s_waitcnt that follows the
;     kernel-argument loads no other wait or cache-maintenance instruction
;     appears. These are the same sequences the file expects for
;     flat_singlethread_monotonic_monotonic_ret_cmpxchg.
;
; NOTE(review) - every line of this function starts with a decimal number fused
; to its text (e.g. "4414  %val = ..."); that looks like an extraction artifact
; rather than part of the test. Confirm against the upstream file.
4261define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
4262; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
4263; GFX7:       ; %bb.0: ; %entry
4264; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4265; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4266; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4267; GFX7-NEXT:    s_add_u32 s4, s0, 16
4268; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4269; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4270; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4271; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4272; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4273; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4274; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4275; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4276; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4277; GFX7-NEXT:    flat_store_dword v[0:1], v2
4278; GFX7-NEXT:    s_endpgm
4279;
4280; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
4281; GFX10-WGP:       ; %bb.0: ; %entry
4282; GFX10-WGP-NEXT:    s_clause 0x1
4283; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4284; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4285; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4286; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4287; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4288; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4289; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4290; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4291; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4292; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4293; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4294; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4295; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4296; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4297; GFX10-WGP-NEXT:    s_endpgm
4298;
4299; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
4300; GFX10-CU:       ; %bb.0: ; %entry
4301; GFX10-CU-NEXT:    s_clause 0x1
4302; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4303; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4304; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4305; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4306; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4307; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4308; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4309; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4310; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4311; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4312; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4313; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4314; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4315; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4316; GFX10-CU-NEXT:    s_endpgm
4317;
4318; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
4319; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4320; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4321; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
4322; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4323; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4324; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4325; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4326; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4327; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4328; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4329; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4330; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4331; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4332; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4333; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4334; SKIP-CACHE-INV-NEXT:    s_endpgm
4335;
4336; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
4337; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4338; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4339; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4340; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4341; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4342; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4343; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4344; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4345; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4346; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4347;
4348; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
4349; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4350; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4351; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4352; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4353; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4354; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4355; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4356; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4357; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4358; GFX90A-TGSPLIT-NEXT:    s_endpgm
4359;
4360; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
4361; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4362; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4363; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
4364; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4365; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
4366; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
4367; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
4368; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4369; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4370; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4371;
4372; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
4373; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4374; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4375; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
4376; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4377; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
4378; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
4379; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
4380; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4381; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4382; GFX940-TGSPLIT-NEXT:    s_endpgm
4383;
4384; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
4385; GFX11-WGP:       ; %bb.0: ; %entry
4386; GFX11-WGP-NEXT:    s_clause 0x1
4387; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
4388; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
4389; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4390; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
4391; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
4392; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
4393; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4394; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
4395; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4396; GFX11-WGP-NEXT:    s_endpgm
4397;
4398; GFX11-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
4399; GFX11-CU:       ; %bb.0: ; %entry
4400; GFX11-CU-NEXT:    s_clause 0x1
4401; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
4402; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
4403; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4404; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
4405; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
4406; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
4407; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4408; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
4409; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4410; GFX11-CU-NEXT:    s_endpgm
4411    i32* %out, i32 %in, i32 %old) {
4412entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
4413  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, swap in %in; release on success, monotonic on failure.
4414  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
  ; Field 0 of the { i32, i1 } result is the loaded value; field 1 is not read.
4415  %val0 = extractvalue { i32, i1 } %val, 0
4416  store i32 %val0, i32* %out, align 4
4417  ret void
4418}
4419
; Returning cmpxchg through a flat pointer at syncscope("singlethread"), with
; success ordering acq_rel and failure ordering monotonic.
;
; The kernel receives (i32* %out, i32 %in, i32 %old). It runs a volatile cmpxchg
; on element 4 of %out (byte offset 16), comparing against %old and swapping in
; %in, then stores field 0 of the result (the value that was loaded) to %out,
; which keeps the return value of the atomic live.
;
; What the assertions below expect
;   - The gfx700 and gfx1010 runs add 16 to the base with s_add_u32/s_addc_u32
;     and issue flat_atomic_cmpswap without an offset operand; the gfx90a,
;     gfx940 and gfx1100 runs put offset 16 on the instruction instead.
;   - The cmpswap carries glc, except on gfx940 where it carries sc0.
;   - One s_waitcnt sits between the cmpswap and the dependent store. It waits
;     on vmcnt(0) lgkmcnt(0), or on vmcnt(0) alone in the +tgsplit runs.
;   - Nothing is expected between the register moves that set up the operands
;     and the cmpswap itself, and apart from the s_waitcnt that follows the
;     kernel-argument loads no other wait or cache-maintenance instruction
;     appears. These are the same sequences the file expects for
;     flat_singlethread_monotonic_monotonic_ret_cmpxchg.
;
; NOTE(review) - every line of this function starts with a decimal number fused
; to its text (e.g. "4573  %val = ..."); that looks like an extraction artifact
; rather than part of the test. Confirm against the upstream file.
4420define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
4421; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
4422; GFX7:       ; %bb.0: ; %entry
4423; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4424; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4425; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4426; GFX7-NEXT:    s_add_u32 s4, s0, 16
4427; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4428; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4429; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4430; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4431; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4432; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4433; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4434; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4435; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4436; GFX7-NEXT:    flat_store_dword v[0:1], v2
4437; GFX7-NEXT:    s_endpgm
4438;
4439; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
4440; GFX10-WGP:       ; %bb.0: ; %entry
4441; GFX10-WGP-NEXT:    s_clause 0x1
4442; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4443; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4444; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4445; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4446; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4447; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4448; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4449; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4450; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4451; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4452; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4453; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4454; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4455; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4456; GFX10-WGP-NEXT:    s_endpgm
4457;
4458; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
4459; GFX10-CU:       ; %bb.0: ; %entry
4460; GFX10-CU-NEXT:    s_clause 0x1
4461; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4462; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4463; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4464; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4465; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4466; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4467; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4468; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4469; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4470; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4471; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4472; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4473; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4474; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4475; GFX10-CU-NEXT:    s_endpgm
4476;
4477; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
4478; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4479; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4480; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
4481; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4482; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4483; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4484; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4485; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4486; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4487; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4488; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4489; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4490; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4491; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4492; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4493; SKIP-CACHE-INV-NEXT:    s_endpgm
4494;
4495; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
4496; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4497; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4498; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4499; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4500; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4501; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4502; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4503; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4504; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4505; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4506;
4507; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
4508; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4509; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4510; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4511; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4512; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4513; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4514; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4515; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4516; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4517; GFX90A-TGSPLIT-NEXT:    s_endpgm
4518;
4519; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
4520; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4521; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4522; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
4523; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4524; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
4525; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
4526; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
4527; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4528; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4529; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4530;
4531; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
4532; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4533; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4534; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
4535; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4536; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
4537; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
4538; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
4539; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4540; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4541; GFX940-TGSPLIT-NEXT:    s_endpgm
4542;
4543; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
4544; GFX11-WGP:       ; %bb.0: ; %entry
4545; GFX11-WGP-NEXT:    s_clause 0x1
4546; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
4547; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
4548; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4549; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
4550; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
4551; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
4552; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4553; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
4554; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4555; GFX11-WGP-NEXT:    s_endpgm
4556;
4557; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
4558; GFX11-CU:       ; %bb.0: ; %entry
4559; GFX11-CU-NEXT:    s_clause 0x1
4560; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
4561; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
4562; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4563; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
4564; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
4565; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
4566; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4567; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
4568; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4569; GFX11-CU-NEXT:    s_endpgm
4570    i32* %out, i32 %in, i32 %old) {
4571entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
4572  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, swap in %in; acq_rel on success, monotonic on failure.
4573  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
  ; Field 0 of the { i32, i1 } result is the loaded value; field 1 is not read.
4574  %val0 = extractvalue { i32, i1 } %val, 0
4575  store i32 %val0, i32* %out, align 4
4576  ret void
4577}
4578
; Test case: a returning, volatile cmpxchg through a default-address-space
; pointer at syncscope("singlethread"), success ordering seq_cst and failure
; ordering monotonic. The expected output for every run line is a flat
; compare-swap that returns its result, a wait, and the store of that result.
; The assertion lines inside the signature are autogenerated (see the note on
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
4579define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
4580; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
4581; GFX7:       ; %bb.0: ; %entry
4582; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4583; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4584; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4585; GFX7-NEXT:    s_add_u32 s4, s0, 16
4586; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4587; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4588; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4589; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4590; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4591; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4592; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4593; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4594; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4595; GFX7-NEXT:    flat_store_dword v[0:1], v2
4596; GFX7-NEXT:    s_endpgm
4597;
4598; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
4599; GFX10-WGP:       ; %bb.0: ; %entry
4600; GFX10-WGP-NEXT:    s_clause 0x1
4601; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4602; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4603; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4604; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4605; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4606; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4607; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4608; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4609; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4610; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4611; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4612; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4613; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4614; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4615; GFX10-WGP-NEXT:    s_endpgm
4616;
4617; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
4618; GFX10-CU:       ; %bb.0: ; %entry
4619; GFX10-CU-NEXT:    s_clause 0x1
4620; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4621; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4622; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4623; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4624; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4625; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4626; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4627; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4628; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4629; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4630; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4631; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4632; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4633; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4634; GFX10-CU-NEXT:    s_endpgm
4635;
4636; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
4637; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4638; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4639; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
4640; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4641; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4642; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4643; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4644; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4645; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4646; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4647; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4648; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4649; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4650; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4651; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4652; SKIP-CACHE-INV-NEXT:    s_endpgm
4653;
4654; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
4655; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4656; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4657; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4658; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4659; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4660; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4661; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4662; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4663; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4664; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4665;
4666; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
4667; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4668; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4669; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4670; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4671; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4672; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4673; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4674; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4675; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4676; GFX90A-TGSPLIT-NEXT:    s_endpgm
4677;
4678; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
4679; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4680; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4681; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
4682; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4683; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
4684; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
4685; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
4686; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4687; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4688; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4689;
4690; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
4691; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4692; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4693; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
4694; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4695; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
4696; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
4697; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
4698; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4699; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4700; GFX940-TGSPLIT-NEXT:    s_endpgm
4701;
4702; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
4703; GFX11-WGP:       ; %bb.0: ; %entry
4704; GFX11-WGP-NEXT:    s_clause 0x1
4705; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
4706; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
4707; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4708; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
4709; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
4710; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
4711; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4712; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
4713; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4714; GFX11-WGP-NEXT:    s_endpgm
4715;
4716; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
4717; GFX11-CU:       ; %bb.0: ; %entry
4718; GFX11-CU-NEXT:    s_clause 0x1
4719; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
4720; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
4721; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4722; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
4723; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
4724; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
4725; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4726; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
4727; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4728; GFX11-CU-NEXT:    s_endpgm
4729    i32* %out, i32 %in, i32 %old) {
4730entry:
  ; Address of element 4 of %out (16 bytes past the base); the expectations
  ; above show this either as an offset:16 operand or as an explicit 16-byte add.
4731  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, write %in on a match. The two orderings are listed
  ; as success ordering first, failure ordering second.
4732  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
  ; Field 0 of the { i32, i1 } pair is the i32 value the cmpxchg read.
4733  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing it to %out keeps the atomic's returned value live.
4734  store i32 %val0, i32* %out, align 4
4735  ret void
4736}
4737
; Test case: a returning, volatile cmpxchg through a default-address-space
; pointer at syncscope("singlethread"), success ordering monotonic and failure
; ordering acquire. The expected output for every run line is a flat
; compare-swap that returns its result, a wait, and the store of that result.
; The assertion lines inside the signature are autogenerated (see the note on
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
4738define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
4739; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
4740; GFX7:       ; %bb.0: ; %entry
4741; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4742; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4743; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4744; GFX7-NEXT:    s_add_u32 s4, s0, 16
4745; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4746; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4747; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4748; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4749; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4750; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4751; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4752; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4753; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4754; GFX7-NEXT:    flat_store_dword v[0:1], v2
4755; GFX7-NEXT:    s_endpgm
4756;
4757; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
4758; GFX10-WGP:       ; %bb.0: ; %entry
4759; GFX10-WGP-NEXT:    s_clause 0x1
4760; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4761; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4762; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4763; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4764; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4765; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4766; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4767; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4768; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4769; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4770; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4771; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4772; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4773; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4774; GFX10-WGP-NEXT:    s_endpgm
4775;
4776; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
4777; GFX10-CU:       ; %bb.0: ; %entry
4778; GFX10-CU-NEXT:    s_clause 0x1
4779; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4780; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4781; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4782; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4783; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4784; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4785; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4786; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4787; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4788; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4789; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4790; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4791; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4792; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4793; GFX10-CU-NEXT:    s_endpgm
4794;
4795; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
4796; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4797; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4798; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
4799; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4800; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4801; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4802; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4803; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4804; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4805; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4806; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4807; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4808; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4809; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4810; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4811; SKIP-CACHE-INV-NEXT:    s_endpgm
4812;
4813; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
4814; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4815; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4816; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4817; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4818; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4819; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4820; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4821; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4822; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4823; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4824;
4825; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
4826; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4827; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4828; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4829; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4830; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4831; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4832; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4833; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4834; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4835; GFX90A-TGSPLIT-NEXT:    s_endpgm
4836;
4837; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
4838; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4839; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4840; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
4841; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4842; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
4843; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
4844; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
4845; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4846; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4847; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4848;
4849; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
4850; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4851; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4852; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
4853; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4854; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
4855; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
4856; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
4857; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4858; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4859; GFX940-TGSPLIT-NEXT:    s_endpgm
4860;
4861; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
4862; GFX11-WGP:       ; %bb.0: ; %entry
4863; GFX11-WGP-NEXT:    s_clause 0x1
4864; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
4865; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
4866; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4867; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
4868; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
4869; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
4870; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4871; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
4872; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4873; GFX11-WGP-NEXT:    s_endpgm
4874;
4875; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
4876; GFX11-CU:       ; %bb.0: ; %entry
4877; GFX11-CU-NEXT:    s_clause 0x1
4878; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
4879; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
4880; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4881; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
4882; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
4883; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
4884; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4885; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
4886; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4887; GFX11-CU-NEXT:    s_endpgm
4888    i32* %out, i32 %in, i32 %old) {
4889entry:
  ; Address of element 4 of %out (16 bytes past the base); the expectations
  ; above show this either as an offset:16 operand or as an explicit 16-byte add.
4890  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, write %in on a match. The two orderings are listed
  ; as success ordering first, failure ordering second.
4891  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
  ; Field 0 of the { i32, i1 } pair is the i32 value the cmpxchg read.
4892  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing it to %out keeps the atomic's returned value live.
4893  store i32 %val0, i32* %out, align 4
4894  ret void
4895}
4896
; Test case: a returning, volatile cmpxchg through a default-address-space
; pointer at syncscope("singlethread"), with acquire as both the success and
; the failure ordering. The expected output for every run line is a flat
; compare-swap that returns its result, a wait, and the store of that result.
; The assertion lines inside the signature are autogenerated (see the note on
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
4897define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
4898; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
4899; GFX7:       ; %bb.0: ; %entry
4900; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4901; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4902; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4903; GFX7-NEXT:    s_add_u32 s4, s0, 16
4904; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4905; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4906; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4907; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4908; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4909; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4910; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4911; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4912; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4913; GFX7-NEXT:    flat_store_dword v[0:1], v2
4914; GFX7-NEXT:    s_endpgm
4915;
4916; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
4917; GFX10-WGP:       ; %bb.0: ; %entry
4918; GFX10-WGP-NEXT:    s_clause 0x1
4919; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4920; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4921; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4922; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4923; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4924; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4925; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4926; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4927; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4928; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4929; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4930; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4931; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4932; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4933; GFX10-WGP-NEXT:    s_endpgm
4934;
4935; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
4936; GFX10-CU:       ; %bb.0: ; %entry
4937; GFX10-CU-NEXT:    s_clause 0x1
4938; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4939; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4940; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4941; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4942; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4943; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4944; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4945; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4946; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4947; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4948; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4949; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4950; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4951; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4952; GFX10-CU-NEXT:    s_endpgm
4953;
4954; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
4955; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4956; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4957; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
4958; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4959; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4960; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4961; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4962; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4963; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4964; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4965; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4966; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4967; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4968; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4969; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4970; SKIP-CACHE-INV-NEXT:    s_endpgm
4971;
4972; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
4973; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4974; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4975; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4976; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4977; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4978; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4979; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4980; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4981; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4982; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4983;
4984; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
4985; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4986; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4987; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4988; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4989; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4990; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4991; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4992; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4993; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4994; GFX90A-TGSPLIT-NEXT:    s_endpgm
4995;
4996; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
4997; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4998; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
4999; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
5000; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5001; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
5002; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
5003; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5004; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5005; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5006; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5007;
5008; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
5009; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5010; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5011; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
5012; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5013; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
5014; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
5015; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5016; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5017; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5018; GFX940-TGSPLIT-NEXT:    s_endpgm
5019;
5020; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
5021; GFX11-WGP:       ; %bb.0: ; %entry
5022; GFX11-WGP-NEXT:    s_clause 0x1
5023; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
5024; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
5025; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5026; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
5027; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
5028; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5029; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5030; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
5031; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5032; GFX11-WGP-NEXT:    s_endpgm
5033;
5034; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
5035; GFX11-CU:       ; %bb.0: ; %entry
5036; GFX11-CU-NEXT:    s_clause 0x1
5037; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
5038; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
5039; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5040; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
5041; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
5042; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5043; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5044; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
5045; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5046; GFX11-CU-NEXT:    s_endpgm
5047    i32* %out, i32 %in, i32 %old) {
5048entry:
  ; Address of element 4 of %out (16 bytes past the base); the expectations
  ; above show this either as an offset:16 operand or as an explicit 16-byte add.
5049  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, write %in on a match. The two orderings are listed
  ; as success ordering first, failure ordering second.
5050  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
  ; Field 0 of the { i32, i1 } pair is the i32 value the cmpxchg read.
5051  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing it to %out keeps the atomic's returned value live.
5052  store i32 %val0, i32* %out, align 4
5053  ret void
5054}
5055
; Test case: a returning, volatile cmpxchg through a default-address-space
; pointer at syncscope("singlethread"), success ordering release and failure
; ordering acquire. The expected output for every run line is a flat
; compare-swap that returns its result, a wait, and the store of that result.
; The assertion lines inside the signature are autogenerated (see the note on
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
5056define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
5057; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
5058; GFX7:       ; %bb.0: ; %entry
5059; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5060; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5061; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5062; GFX7-NEXT:    s_add_u32 s4, s0, 16
5063; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5064; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5065; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5066; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5067; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5068; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5069; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5070; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5071; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5072; GFX7-NEXT:    flat_store_dword v[0:1], v2
5073; GFX7-NEXT:    s_endpgm
5074;
5075; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
5076; GFX10-WGP:       ; %bb.0: ; %entry
5077; GFX10-WGP-NEXT:    s_clause 0x1
5078; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5079; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5080; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5081; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
5082; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
5083; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5084; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5085; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5086; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5087; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5088; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5089; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5090; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5091; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5092; GFX10-WGP-NEXT:    s_endpgm
5093;
5094; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
5095; GFX10-CU:       ; %bb.0: ; %entry
5096; GFX10-CU-NEXT:    s_clause 0x1
5097; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5098; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5099; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5100; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
5101; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
5102; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5103; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5104; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5105; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5106; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5107; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5108; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5109; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5110; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5111; GFX10-CU-NEXT:    s_endpgm
5112;
5113; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
5114; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5115; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5116; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
5117; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5118; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
5119; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
5120; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
5121; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5122; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
5123; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5124; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5125; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5126; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5127; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5128; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5129; SKIP-CACHE-INV-NEXT:    s_endpgm
5130;
5131; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
5132; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5133; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5134; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5135; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5136; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5137; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5138; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5139; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5140; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5141; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5142;
5143; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
5144; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5145; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5146; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5147; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5148; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5149; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5150; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5151; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5152; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5153; GFX90A-TGSPLIT-NEXT:    s_endpgm
5154;
5155; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
5156; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5157; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5158; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
5159; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5160; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
5161; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
5162; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5163; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5164; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5165; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5166;
5167; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
5168; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5169; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5170; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
5171; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5172; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
5173; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
5174; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5175; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5176; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5177; GFX940-TGSPLIT-NEXT:    s_endpgm
5178;
5179; GFX11-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
5180; GFX11-WGP:       ; %bb.0: ; %entry
5181; GFX11-WGP-NEXT:    s_clause 0x1
5182; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
5183; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
5184; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5185; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
5186; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
5187; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5188; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5189; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
5190; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5191; GFX11-WGP-NEXT:    s_endpgm
5192;
5193; GFX11-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
5194; GFX11-CU:       ; %bb.0: ; %entry
5195; GFX11-CU-NEXT:    s_clause 0x1
5196; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
5197; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
5198; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5199; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
5200; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
5201; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5202; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5203; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
5204; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5205; GFX11-CU-NEXT:    s_endpgm
5206    i32* %out, i32 %in, i32 %old) {
5207entry:
  ; Address of element 4 of %out (16 bytes past the base); the expectations
  ; above show this either as an offset:16 operand or as an explicit 16-byte add.
5208  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, write %in on a match. The two orderings are listed
  ; as success ordering first, failure ordering second.
5209  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
  ; Field 0 of the { i32, i1 } pair is the i32 value the cmpxchg read.
5210  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing it to %out keeps the atomic's returned value live.
5211  store i32 %val0, i32* %out, align 4
5212  ret void
5213}
5214
; Exercises a value-returning flat cmpxchg at syncscope("singlethread") with
; acq_rel success ordering and acquire failure ordering.
; The kernel runs a volatile cmpxchg on the i32 located 4 elements past %out
; (compare against %old, swap in %in) and stores the i32 field of the result
; back to %out.
; The per-target check lines that sit between the opening of the signature and
; its parameter list are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
5215define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
5216; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
5217; GFX7:       ; %bb.0: ; %entry
5218; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5219; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5220; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5221; GFX7-NEXT:    s_add_u32 s4, s0, 16
5222; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5223; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5224; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5225; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5226; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5227; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5228; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5229; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5230; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5231; GFX7-NEXT:    flat_store_dword v[0:1], v2
5232; GFX7-NEXT:    s_endpgm
5233;
5234; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
5235; GFX10-WGP:       ; %bb.0: ; %entry
5236; GFX10-WGP-NEXT:    s_clause 0x1
5237; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5238; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5239; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5240; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
5241; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
5242; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5243; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5244; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5245; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5246; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5247; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5248; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5249; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5250; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5251; GFX10-WGP-NEXT:    s_endpgm
5252;
5253; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
5254; GFX10-CU:       ; %bb.0: ; %entry
5255; GFX10-CU-NEXT:    s_clause 0x1
5256; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5257; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5258; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5259; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
5260; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
5261; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5262; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5263; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5264; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5265; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5266; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5267; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5268; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5269; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5270; GFX10-CU-NEXT:    s_endpgm
5271;
5272; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
5273; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5274; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5275; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
5276; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5277; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
5278; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
5279; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
5280; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5281; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
5282; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5283; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5284; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5285; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5286; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5287; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5288; SKIP-CACHE-INV-NEXT:    s_endpgm
5289;
5290; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
5291; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5292; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5293; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5294; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5295; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5296; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5297; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5298; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5299; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5300; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5301;
5302; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
5303; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5304; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5305; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5306; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5307; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5308; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5309; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5310; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5311; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5312; GFX90A-TGSPLIT-NEXT:    s_endpgm
5313;
5314; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
5315; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5316; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5317; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
5318; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5319; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
5320; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
5321; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5322; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5323; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5324; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5325;
5326; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
5327; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5328; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5329; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
5330; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5331; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
5332; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
5333; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5334; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5335; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5336; GFX940-TGSPLIT-NEXT:    s_endpgm
5337;
5338; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
5339; GFX11-WGP:       ; %bb.0: ; %entry
5340; GFX11-WGP-NEXT:    s_clause 0x1
5341; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
5342; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
5343; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5344; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
5345; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
5346; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5347; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5348; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
5349; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5350; GFX11-WGP-NEXT:    s_endpgm
5351;
5352; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
5353; GFX11-CU:       ; %bb.0: ; %entry
5354; GFX11-CU-NEXT:    s_clause 0x1
5355; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
5356; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
5357; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5358; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
5359; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
5360; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5361; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5362; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
5363; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5364; GFX11-CU-NEXT:    s_endpgm
5365    i32* %out, i32 %in, i32 %old) {
5366entry:
  ; Element 4 of an i32 array is byte offset 16; the expected output above
  ; shows it either as offset:16 on the atomic or as an s_add_u32 of 16.
5367  %gep = getelementptr i32, i32* %out, i32 4
  ; Success ordering acq_rel, failure ordering acquire, single-thread scope.
5368  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
  ; Only the i32 field (index 0) of the { i32, i1 } result is used; the i1
  ; field is never read.
5369  %val0 = extractvalue { i32, i1 } %val, 0
5370  store i32 %val0, i32* %out, align 4
5371  ret void
5372}
5373
; Exercises a value-returning flat cmpxchg at syncscope("singlethread") with
; seq_cst success ordering and acquire failure ordering.
; The kernel runs a volatile cmpxchg on the i32 located 4 elements past %out
; (compare against %old, swap in %in) and stores the i32 field of the result
; back to %out.
; The per-target check lines that sit between the opening of the signature and
; its parameter list are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
5374define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
5375; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
5376; GFX7:       ; %bb.0: ; %entry
5377; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5378; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5379; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5380; GFX7-NEXT:    s_add_u32 s4, s0, 16
5381; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5382; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5383; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5384; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5385; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5386; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5387; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5388; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5389; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5390; GFX7-NEXT:    flat_store_dword v[0:1], v2
5391; GFX7-NEXT:    s_endpgm
5392;
5393; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
5394; GFX10-WGP:       ; %bb.0: ; %entry
5395; GFX10-WGP-NEXT:    s_clause 0x1
5396; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5397; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5398; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5399; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
5400; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
5401; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5402; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5403; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5404; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5405; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5406; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5407; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5408; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5409; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5410; GFX10-WGP-NEXT:    s_endpgm
5411;
5412; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
5413; GFX10-CU:       ; %bb.0: ; %entry
5414; GFX10-CU-NEXT:    s_clause 0x1
5415; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5416; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5417; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5418; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
5419; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
5420; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5421; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5422; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5423; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5424; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5425; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5426; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5427; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5428; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5429; GFX10-CU-NEXT:    s_endpgm
5430;
5431; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
5432; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5433; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5434; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
5435; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5436; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
5437; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
5438; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
5439; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5440; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
5441; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5442; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5443; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5444; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5445; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5446; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5447; SKIP-CACHE-INV-NEXT:    s_endpgm
5448;
5449; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
5450; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5451; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5452; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5453; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5454; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5455; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5456; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5457; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5458; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5459; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5460;
5461; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
5462; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5463; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5464; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5465; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5466; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5467; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5468; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5469; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5470; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5471; GFX90A-TGSPLIT-NEXT:    s_endpgm
5472;
5473; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
5474; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5475; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5476; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
5477; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5478; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
5479; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
5480; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5481; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5482; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5483; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5484;
5485; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
5486; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5487; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5488; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
5489; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5490; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
5491; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
5492; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5493; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5494; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5495; GFX940-TGSPLIT-NEXT:    s_endpgm
5496;
5497; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
5498; GFX11-WGP:       ; %bb.0: ; %entry
5499; GFX11-WGP-NEXT:    s_clause 0x1
5500; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
5501; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
5502; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5503; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
5504; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
5505; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5506; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5507; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
5508; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5509; GFX11-WGP-NEXT:    s_endpgm
5510;
5511; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
5512; GFX11-CU:       ; %bb.0: ; %entry
5513; GFX11-CU-NEXT:    s_clause 0x1
5514; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
5515; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
5516; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5517; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
5518; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
5519; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5520; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5521; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
5522; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5523; GFX11-CU-NEXT:    s_endpgm
5524    i32* %out, i32 %in, i32 %old) {
5525entry:
  ; Element 4 of an i32 array is byte offset 16; the expected output above
  ; shows it either as offset:16 on the atomic or as an s_add_u32 of 16.
5526  %gep = getelementptr i32, i32* %out, i32 4
  ; Success ordering seq_cst, failure ordering acquire, single-thread scope.
5527  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
  ; Only the i32 field (index 0) of the { i32, i1 } result is used; the i1
  ; field is never read.
5528  %val0 = extractvalue { i32, i1 } %val, 0
5529  store i32 %val0, i32* %out, align 4
5530  ret void
5531}
5532
; Exercises a value-returning flat cmpxchg at syncscope("singlethread") with
; monotonic success ordering and seq_cst failure ordering.
; The kernel runs a volatile cmpxchg on the i32 located 4 elements past %out
; (compare against %old, swap in %in) and stores the i32 field of the result
; back to %out.
; The per-target check lines that sit between the opening of the signature and
; its parameter list are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
5533define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
5534; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
5535; GFX7:       ; %bb.0: ; %entry
5536; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5537; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5538; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5539; GFX7-NEXT:    s_add_u32 s4, s0, 16
5540; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5541; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5542; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5543; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5544; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5545; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5546; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5547; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5548; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5549; GFX7-NEXT:    flat_store_dword v[0:1], v2
5550; GFX7-NEXT:    s_endpgm
5551;
5552; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
5553; GFX10-WGP:       ; %bb.0: ; %entry
5554; GFX10-WGP-NEXT:    s_clause 0x1
5555; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5556; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5557; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5558; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
5559; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
5560; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5561; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5562; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5563; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5564; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5565; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5566; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5567; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5568; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5569; GFX10-WGP-NEXT:    s_endpgm
5570;
5571; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
5572; GFX10-CU:       ; %bb.0: ; %entry
5573; GFX10-CU-NEXT:    s_clause 0x1
5574; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5575; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5576; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5577; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
5578; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
5579; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5580; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5581; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5582; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5583; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5584; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5585; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5586; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5587; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5588; GFX10-CU-NEXT:    s_endpgm
5589;
5590; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
5591; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5592; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5593; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
5594; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5595; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
5596; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
5597; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
5598; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5599; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
5600; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5601; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5602; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5603; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5604; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5605; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5606; SKIP-CACHE-INV-NEXT:    s_endpgm
5607;
5608; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
5609; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5610; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5611; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5612; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5613; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5614; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5615; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5616; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5617; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5618; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5619;
5620; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
5621; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5622; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5623; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5624; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5625; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5626; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5627; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5628; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5629; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5630; GFX90A-TGSPLIT-NEXT:    s_endpgm
5631;
5632; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
5633; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5634; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5635; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
5636; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5637; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
5638; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
5639; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5640; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5641; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5642; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5643;
5644; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
5645; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5646; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5647; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
5648; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5649; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
5650; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
5651; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5652; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5653; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5654; GFX940-TGSPLIT-NEXT:    s_endpgm
5655;
5656; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
5657; GFX11-WGP:       ; %bb.0: ; %entry
5658; GFX11-WGP-NEXT:    s_clause 0x1
5659; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
5660; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
5661; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5662; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
5663; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
5664; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5665; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5666; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
5667; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5668; GFX11-WGP-NEXT:    s_endpgm
5669;
5670; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
5671; GFX11-CU:       ; %bb.0: ; %entry
5672; GFX11-CU-NEXT:    s_clause 0x1
5673; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
5674; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
5675; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5676; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
5677; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
5678; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5679; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5680; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
5681; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5682; GFX11-CU-NEXT:    s_endpgm
5683    i32* %out, i32 %in, i32 %old) {
5684entry:
  ; Element 4 of an i32 array is byte offset 16; the expected output above
  ; shows it either as offset:16 on the atomic or as an s_add_u32 of 16.
5685  %gep = getelementptr i32, i32* %out, i32 4
  ; Success ordering monotonic, failure ordering seq_cst, single-thread scope.
5686  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
  ; Only the i32 field (index 0) of the { i32, i1 } result is used; the i1
  ; field is never read.
5687  %val0 = extractvalue { i32, i1 } %val, 0
5688  store i32 %val0, i32* %out, align 4
5689  ret void
5690}
5691
5692define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
5693; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
5694; GFX7:       ; %bb.0: ; %entry
5695; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5696; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5697; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5698; GFX7-NEXT:    s_add_u32 s4, s0, 16
5699; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5700; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5701; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5702; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5703; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5704; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5705; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5706; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5707; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5708; GFX7-NEXT:    flat_store_dword v[0:1], v2
5709; GFX7-NEXT:    s_endpgm
5710;
5711; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
5712; GFX10-WGP:       ; %bb.0: ; %entry
5713; GFX10-WGP-NEXT:    s_clause 0x1
5714; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5715; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5716; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5717; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
5718; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
5719; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5720; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5721; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5722; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5723; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5724; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5725; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5726; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5727; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5728; GFX10-WGP-NEXT:    s_endpgm
5729;
5730; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
5731; GFX10-CU:       ; %bb.0: ; %entry
5732; GFX10-CU-NEXT:    s_clause 0x1
5733; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5734; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5735; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5736; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
5737; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
5738; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5739; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5740; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5741; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5742; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5743; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5744; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5745; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5746; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5747; GFX10-CU-NEXT:    s_endpgm
5748;
5749; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
5750; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5751; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5752; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
5753; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5754; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
5755; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
5756; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
5757; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5758; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
5759; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5760; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5761; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5762; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5763; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5764; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5765; SKIP-CACHE-INV-NEXT:    s_endpgm
5766;
5767; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
5768; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5769; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5770; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5771; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5772; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5773; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5774; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5775; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5776; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5777; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5778;
5779; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
5780; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5781; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5782; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5783; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5784; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5785; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5786; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5787; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5788; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5789; GFX90A-TGSPLIT-NEXT:    s_endpgm
5790;
5791; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
5792; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5793; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5794; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
5795; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5796; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
5797; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
5798; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5799; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5800; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5801; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5802;
5803; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
5804; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5805; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5806; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
5807; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5808; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
5809; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
5810; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5811; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5812; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5813; GFX940-TGSPLIT-NEXT:    s_endpgm
5814;
5815; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
5816; GFX11-WGP:       ; %bb.0: ; %entry
5817; GFX11-WGP-NEXT:    s_clause 0x1
5818; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
5819; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
5820; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5821; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
5822; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
5823; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5824; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5825; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
5826; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5827; GFX11-WGP-NEXT:    s_endpgm
5828;
5829; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
5830; GFX11-CU:       ; %bb.0: ; %entry
5831; GFX11-CU-NEXT:    s_clause 0x1
5832; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
5833; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
5834; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5835; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
5836; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
5837; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5838; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5839; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
5840; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5841; GFX11-CU-NEXT:    s_endpgm
5842    i32* %out, i32 %in, i32 %old) {
5843entry:
5844  %gep = getelementptr i32, i32* %out, i32 4
5845  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
5846  %val0 = extractvalue { i32, i1 } %val, 0
5847  store i32 %val0, i32* %out, align 4
5848  ret void
5849}
5850
; Checks code generation for a volatile cmpxchg through a flat i32 pointer at
; syncscope("singlethread") with success ordering release and failure ordering
; seq_cst, where the value returned by the cmpxchg is stored through %out.
;
; In every expected sequence below the atomic is a flat_atomic_cmpswap carrying
; glc (sc0 on the gfx940 run lines), and the only instructions between it and
; the dependent store are register moves and a single s_waitcnt. The tgsplit
; run lines wait on vmcnt(0) only; all other run lines wait on
; vmcnt(0) lgkmcnt(0).
;
; The check lines are autogenerated (see the note at the top of the file), so
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
5851define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
5852; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
5853; GFX7:       ; %bb.0: ; %entry
5854; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5855; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5856; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5857; GFX7-NEXT:    s_add_u32 s4, s0, 16
5858; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5859; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5860; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5861; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5862; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5863; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5864; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5865; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5866; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5867; GFX7-NEXT:    flat_store_dword v[0:1], v2
5868; GFX7-NEXT:    s_endpgm
5869;
5870; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
5871; GFX10-WGP:       ; %bb.0: ; %entry
5872; GFX10-WGP-NEXT:    s_clause 0x1
5873; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5874; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5875; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5876; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
5877; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
5878; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5879; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5880; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5881; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5882; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5883; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5884; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5885; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5886; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5887; GFX10-WGP-NEXT:    s_endpgm
5888;
5889; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
5890; GFX10-CU:       ; %bb.0: ; %entry
5891; GFX10-CU-NEXT:    s_clause 0x1
5892; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5893; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5894; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5895; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
5896; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
5897; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5898; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5899; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5900; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5901; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5902; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5903; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5904; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5905; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5906; GFX10-CU-NEXT:    s_endpgm
5907;
5908; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
5909; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5910; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5911; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
5912; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5913; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
5914; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
5915; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
5916; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5917; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
5918; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5919; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5920; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5921; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5922; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5923; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5924; SKIP-CACHE-INV-NEXT:    s_endpgm
5925;
5926; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
5927; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5928; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5929; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5930; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5931; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5932; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5933; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5934; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5935; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5936; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5937;
5938; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
5939; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5940; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5941; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5942; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5943; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5944; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5945; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5946; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5947; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5948; GFX90A-TGSPLIT-NEXT:    s_endpgm
5949;
5950; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
5951; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5952; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5953; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
5954; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5955; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
5956; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
5957; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5958; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5959; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5960; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5961;
5962; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
5963; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5964; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
5965; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
5966; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5967; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
5968; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
5969; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5970; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5971; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5972; GFX940-TGSPLIT-NEXT:    s_endpgm
5973;
5974; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
5975; GFX11-WGP:       ; %bb.0: ; %entry
5976; GFX11-WGP-NEXT:    s_clause 0x1
5977; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
5978; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
5979; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5980; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
5981; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
5982; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5983; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5984; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
5985; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5986; GFX11-WGP-NEXT:    s_endpgm
5987;
5988; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
5989; GFX11-CU:       ; %bb.0: ; %entry
5990; GFX11-CU-NEXT:    s_clause 0x1
5991; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
5992; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
5993; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5994; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
5995; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
5996; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5997; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5998; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
5999; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6000; GFX11-CU-NEXT:    s_endpgm
6001    i32* %out, i32 %in, i32 %old) {
6002entry:
  ; Element 4 of an i32 array is 16 bytes past %out. The expected sequences
  ; show this either as an offset of 16 on the atomic or as an explicit
  ; s_add_u32/s_addc_u32 pair on the run lines that do not fold it.
6003  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and write %in on a match.
6004  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
  ; Field 0 of the cmpxchg result is the value that was read from memory; the
  ; i1 success flag in field 1 is not used by this test.
6005  %val0 = extractvalue { i32, i1 } %val, 0
  ; The store targets %out itself, not the offset address used by the atomic.
6006  store i32 %val0, i32* %out, align 4
6007  ret void
6008}
6009
; Checks code generation for a volatile cmpxchg through a flat i32 pointer at
; syncscope("singlethread") with success ordering acq_rel and failure ordering
; seq_cst, where the value returned by the cmpxchg is stored through %out.
;
; In every expected sequence below the atomic is a flat_atomic_cmpswap carrying
; glc (sc0 on the gfx940 run lines), and the only instructions between it and
; the dependent store are register moves and a single s_waitcnt. The tgsplit
; run lines wait on vmcnt(0) only; all other run lines wait on
; vmcnt(0) lgkmcnt(0).
;
; The check lines are autogenerated (see the note at the top of the file), so
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
6010define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
6011; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
6012; GFX7:       ; %bb.0: ; %entry
6013; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6014; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6015; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6016; GFX7-NEXT:    s_add_u32 s4, s0, 16
6017; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6018; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6019; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6020; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6021; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6022; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6023; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6024; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6025; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6026; GFX7-NEXT:    flat_store_dword v[0:1], v2
6027; GFX7-NEXT:    s_endpgm
6028;
6029; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
6030; GFX10-WGP:       ; %bb.0: ; %entry
6031; GFX10-WGP-NEXT:    s_clause 0x1
6032; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6033; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6034; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6035; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
6036; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
6037; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6038; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6039; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6040; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6041; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6042; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6043; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6044; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6045; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6046; GFX10-WGP-NEXT:    s_endpgm
6047;
6048; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
6049; GFX10-CU:       ; %bb.0: ; %entry
6050; GFX10-CU-NEXT:    s_clause 0x1
6051; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6052; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6053; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6054; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
6055; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
6056; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6057; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6058; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6059; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6060; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6061; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6062; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6063; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6064; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6065; GFX10-CU-NEXT:    s_endpgm
6066;
6067; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
6068; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6069; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
6070; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
6071; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6072; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
6073; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
6074; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
6075; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6076; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
6077; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6078; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6079; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6080; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6081; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6082; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6083; SKIP-CACHE-INV-NEXT:    s_endpgm
6084;
6085; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
6086; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6087; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6088; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6089; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6090; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6091; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6092; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6093; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6094; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6095; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6096;
6097; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
6098; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6099; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6100; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6101; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6102; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6103; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6104; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6105; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6106; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6107; GFX90A-TGSPLIT-NEXT:    s_endpgm
6108;
6109; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
6110; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6111; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
6112; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
6113; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6114; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
6115; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
6116; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6117; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6118; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6119; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6120;
6121; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
6122; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6123; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
6124; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
6125; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6126; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
6127; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
6128; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6129; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6130; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6131; GFX940-TGSPLIT-NEXT:    s_endpgm
6132;
6133; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
6134; GFX11-WGP:       ; %bb.0: ; %entry
6135; GFX11-WGP-NEXT:    s_clause 0x1
6136; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
6137; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
6138; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6139; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
6140; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
6141; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6142; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6143; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
6144; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6145; GFX11-WGP-NEXT:    s_endpgm
6146;
6147; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
6148; GFX11-CU:       ; %bb.0: ; %entry
6149; GFX11-CU-NEXT:    s_clause 0x1
6150; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
6151; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
6152; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6153; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
6154; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
6155; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6156; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6157; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
6158; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6159; GFX11-CU-NEXT:    s_endpgm
6160    i32* %out, i32 %in, i32 %old) {
6161entry:
  ; Element 4 of an i32 array is 16 bytes past %out. The expected sequences
  ; show this either as an offset of 16 on the atomic or as an explicit
  ; s_add_u32/s_addc_u32 pair on the run lines that do not fold it.
6162  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and write %in on a match.
6163  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
  ; Field 0 of the cmpxchg result is the value that was read from memory; the
  ; i1 success flag in field 1 is not used by this test.
6164  %val0 = extractvalue { i32, i1 } %val, 0
  ; The store targets %out itself, not the offset address used by the atomic.
6165  store i32 %val0, i32* %out, align 4
6166  ret void
6167}
6168
; Checks code generation for a volatile cmpxchg through a flat i32 pointer at
; syncscope("singlethread") with seq_cst as both the success and the failure
; ordering, where the value returned by the cmpxchg is stored through %out.
;
; In every expected sequence below the atomic is a flat_atomic_cmpswap carrying
; glc (sc0 on the gfx940 run lines), and the only instructions between it and
; the dependent store are register moves and a single s_waitcnt. The tgsplit
; run lines wait on vmcnt(0) only; all other run lines wait on
; vmcnt(0) lgkmcnt(0).
;
; The check lines are autogenerated (see the note at the top of the file), so
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
6169define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
6170; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
6171; GFX7:       ; %bb.0: ; %entry
6172; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6173; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6174; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6175; GFX7-NEXT:    s_add_u32 s4, s0, 16
6176; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6177; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6178; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6179; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6180; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6181; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6182; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6183; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6184; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6185; GFX7-NEXT:    flat_store_dword v[0:1], v2
6186; GFX7-NEXT:    s_endpgm
6187;
6188; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
6189; GFX10-WGP:       ; %bb.0: ; %entry
6190; GFX10-WGP-NEXT:    s_clause 0x1
6191; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6192; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6193; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6194; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
6195; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
6196; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6197; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6198; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6199; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6200; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6201; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6202; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6203; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6204; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6205; GFX10-WGP-NEXT:    s_endpgm
6206;
6207; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
6208; GFX10-CU:       ; %bb.0: ; %entry
6209; GFX10-CU-NEXT:    s_clause 0x1
6210; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6211; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6212; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6213; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
6214; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
6215; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6216; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6217; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6218; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6219; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6220; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6221; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6222; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6223; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6224; GFX10-CU-NEXT:    s_endpgm
6225;
6226; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
6227; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6228; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
6229; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
6230; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6231; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
6232; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
6233; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
6234; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6235; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
6236; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6237; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6238; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6239; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6240; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6241; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6242; SKIP-CACHE-INV-NEXT:    s_endpgm
6243;
6244; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
6245; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6246; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6247; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6248; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6249; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6250; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6251; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6252; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6253; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6254; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6255;
6256; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
6257; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6258; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6259; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6260; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6261; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6262; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6263; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6264; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6265; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6266; GFX90A-TGSPLIT-NEXT:    s_endpgm
6267;
6268; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
6269; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6270; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
6271; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
6272; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6273; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
6274; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
6275; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6276; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6277; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6278; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6279;
6280; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
6281; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6282; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
6283; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
6284; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6285; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
6286; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
6287; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6288; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6289; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6290; GFX940-TGSPLIT-NEXT:    s_endpgm
6291;
6292; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
6293; GFX11-WGP:       ; %bb.0: ; %entry
6294; GFX11-WGP-NEXT:    s_clause 0x1
6295; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
6296; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
6297; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6298; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
6299; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
6300; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6301; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6302; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
6303; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6304; GFX11-WGP-NEXT:    s_endpgm
6305;
6306; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
6307; GFX11-CU:       ; %bb.0: ; %entry
6308; GFX11-CU-NEXT:    s_clause 0x1
6309; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
6310; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
6311; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6312; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
6313; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
6314; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6315; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6316; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
6317; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6318; GFX11-CU-NEXT:    s_endpgm
6319    i32* %out, i32 %in, i32 %old) {
6320entry:
  ; Element 4 of an i32 array is 16 bytes past %out. The expected sequences
  ; show this either as an offset of 16 on the atomic or as an explicit
  ; s_add_u32/s_addc_u32 pair on the run lines that do not fold it.
6321  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and write %in on a match.
6322  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
  ; Field 0 of the cmpxchg result is the value that was read from memory; the
  ; i1 success flag in field 1 is not used by this test.
6323  %val0 = extractvalue { i32, i1 } %val, 0
  ; The store targets %out itself, not the offset address used by the atomic.
6324  store i32 %val0, i32* %out, align 4
6325  ret void
6326}
6327
; Checks code generation for an atomic i32 load through the flat pointer %in
; with syncscope("singlethread-one-as") and unordered ordering, whose result is
; then stored through %out with an ordinary (non-atomic) store.
;
; In every expected sequence below the load is a flat_load_dword (flat_load_b32
; on the gfx1100 run lines) with no trailing modifiers, and the only
; instructions between it and the dependent store are register moves and a
; single s_waitcnt. The tgsplit run lines wait on vmcnt(0) only; all other run
; lines wait on vmcnt(0) lgkmcnt(0).
;
; The check lines are autogenerated (see the note at the top of the file), so
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
6328define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
6329; GFX7-LABEL: flat_singlethread_one_as_unordered_load:
6330; GFX7:       ; %bb.0: ; %entry
6331; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
6332; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6333; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6334; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6335; GFX7-NEXT:    flat_load_dword v2, v[0:1]
6336; GFX7-NEXT:    v_mov_b32_e32 v0, s2
6337; GFX7-NEXT:    v_mov_b32_e32 v1, s3
6338; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6339; GFX7-NEXT:    flat_store_dword v[0:1], v2
6340; GFX7-NEXT:    s_endpgm
6341;
6342; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load:
6343; GFX10-WGP:       ; %bb.0: ; %entry
6344; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
6345; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6346; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6347; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6348; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
6349; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
6350; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
6351; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6352; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6353; GFX10-WGP-NEXT:    s_endpgm
6354;
6355; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load:
6356; GFX10-CU:       ; %bb.0: ; %entry
6357; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
6358; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6359; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6360; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6361; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
6362; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
6363; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
6364; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6365; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6366; GFX10-CU-NEXT:    s_endpgm
6367;
6368; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_load:
6369; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6370; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
6371; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6372; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6373; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6374; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
6375; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6376; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6377; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6378; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6379; SKIP-CACHE-INV-NEXT:    s_endpgm
6380;
6381; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
6382; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6383; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
6384; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6385; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
6386; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
6387; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
6388; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6389; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
6390; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6391; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6392; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6393;
6394; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
6395; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6396; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
6397; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6398; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
6399; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
6400; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
6401; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6402; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
6403; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6404; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6405; GFX90A-TGSPLIT-NEXT:    s_endpgm
6406;
6407; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
6408; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6409; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
6410; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6411; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
6412; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
6413; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
6414; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6415; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
6416; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6417; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6418; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6419;
6420; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
6421; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6422; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
6423; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6424; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
6425; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
6426; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
6427; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6428; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
6429; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6430; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6431; GFX940-TGSPLIT-NEXT:    s_endpgm
6432;
6433; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_load:
6434; GFX11-WGP:       ; %bb.0: ; %entry
6435; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
6436; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6437; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
6438; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
6439; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
6440; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6441; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
6442; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6443; GFX11-WGP-NEXT:    s_endpgm
6444;
6445; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_load:
6446; GFX11-CU:       ; %bb.0: ; %entry
6447; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
6448; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6449; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
6450; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
6451; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
6452; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6453; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
6454; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6455; GFX11-CU-NEXT:    s_endpgm
6456    i32* %in, i32* %out) {
6457entry:
  ; The atomic operation under test; only the load is atomic.
6458  %val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4
  ; Storing the loaded value through %out gives the load an observable use in
  ; the generated code.
6459  store i32 %val, i32* %out
6460  ret void
6461}
6462
; Kernel under test: an atomic i32 load from %in with syncscope
; "singlethread-one-as" and monotonic ordering, whose result is written to
; %out by an ordinary (non-atomic) store.
; In every configuration the expected code below is: the kernel-argument
; load, moves of the addresses into VGPRs, one flat load, an s_waitcnt ahead
; of the store that consumes the loaded value, and one flat store.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_load_dword v2, v[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_load:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT:    s_endpgm
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4
  store i32 %val, i32* %out
  ret void
}
6597
; Kernel under test: an atomic i32 load from %in with syncscope
; "singlethread-one-as" and acquire ordering, whose result is written to
; %out by an ordinary (non-atomic) store.
; In every configuration the expected code below places nothing between the
; flat load and the flat store except the address moves and the s_waitcnt
; that the store's data dependency needs.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX7-LABEL: flat_singlethread_one_as_acquire_load:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_load_dword v2, v[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_load:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_load:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_load:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT:    s_endpgm
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4
  store i32 %val, i32* %out
  ret void
}
6732
; Kernel under test: an atomic i32 load from %in with syncscope
; "singlethread-one-as" and seq_cst ordering, whose result is written to
; %out by an ordinary (non-atomic) store.
; In every configuration the expected code below has only the
; kernel-argument wait and address moves ahead of the flat load, and only
; address moves plus one s_waitcnt between the flat load and the flat store.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_load_dword v2, v[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_load:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT:    s_endpgm
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4
  store i32 %val, i32* %out
  ret void
}
6867
; Kernel under test: an atomic i32 store of the kernel argument %in to the
; pointer %out with syncscope "singlethread-one-as" and unordered ordering.
; In every configuration the expected code below is: scalar loads of the two
; kernel arguments, a wait for them, moves of the address and the value into
; VGPRs, and a single flat store with nothing after it before program end
; (apart from the VGPR-dealloc message in the two gfx1100 runs).
define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX7-LABEL: flat_singlethread_one_as_unordered_store:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_store:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_store:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_store:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT:    s_endpgm
    i32 %in, i32* %out) {
entry:
  store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4
  ret void
}
6983
; Kernel under test: an atomic i32 store of the kernel argument %in to the
; pointer %out with syncscope "singlethread-one-as" and monotonic ordering.
; In every configuration the expected code below is: scalar loads of the two
; kernel arguments, a wait for them, moves of the address and the value into
; VGPRs, and a single flat store with nothing after it before program end
; (apart from the VGPR-dealloc message in the two gfx1100 runs).
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_store:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT:    s_endpgm
    i32 %in, i32* %out) {
entry:
  store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4
  ret void
}
7099
; Release-ordered atomic i32 store of %in to %out at syncscope
; "singlethread-one-as".
; For every check prefix below the expected code is a single flat store
; (flat_store_dword, or flat_store_b32 under the gfx11 prefixes) followed
; directly by s_endpgm, with only the gfx11 s_sendmsg in between. The one
; s_waitcnt in each expected sequence sits between the s_load instructions
; and the v_mov instructions that read their results.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
7100define amdgpu_kernel void @flat_singlethread_one_as_release_store(
7101; GFX7-LABEL: flat_singlethread_one_as_release_store:
7102; GFX7:       ; %bb.0: ; %entry
7103; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
7104; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
7105; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7106; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7107; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7108; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7109; GFX7-NEXT:    flat_store_dword v[0:1], v2
7110; GFX7-NEXT:    s_endpgm
7111;
7112; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store:
7113; GFX10-WGP:       ; %bb.0: ; %entry
7114; GFX10-WGP-NEXT:    s_clause 0x1
7115; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7116; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
7117; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7118; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7119; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7120; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7121; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7122; GFX10-WGP-NEXT:    s_endpgm
7123;
7124; GFX10-CU-LABEL: flat_singlethread_one_as_release_store:
7125; GFX10-CU:       ; %bb.0: ; %entry
7126; GFX10-CU-NEXT:    s_clause 0x1
7127; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7128; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
7129; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7130; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7131; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7132; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7133; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7134; GFX10-CU-NEXT:    s_endpgm
7135;
7136; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_store:
7137; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7138; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
7139; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
7140; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7141; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7142; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7143; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7144; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7145; SKIP-CACHE-INV-NEXT:    s_endpgm
7146;
7147; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
7148; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7149; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7150; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
7151; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7152; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7153; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7154; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7155; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7156;
7157; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
7158; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7159; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7160; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
7161; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7162; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7163; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7164; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7165; GFX90A-TGSPLIT-NEXT:    s_endpgm
7166;
7167; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
7168; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7169; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
7170; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
7171; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7172; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7173; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7174; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7175; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7176;
7177; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
7178; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7179; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
7180; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
7181; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7182; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7183; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7184; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7185; GFX940-TGSPLIT-NEXT:    s_endpgm
7186;
7187; GFX11-WGP-LABEL: flat_singlethread_one_as_release_store:
7188; GFX11-WGP:       ; %bb.0: ; %entry
7189; GFX11-WGP-NEXT:    s_clause 0x1
7190; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
7191; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
7192; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7193; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
7194; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
7195; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
7196; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7197; GFX11-WGP-NEXT:    s_endpgm
7198;
7199; GFX11-CU-LABEL: flat_singlethread_one_as_release_store:
7200; GFX11-CU:       ; %bb.0: ; %entry
7201; GFX11-CU-NEXT:    s_clause 0x1
7202; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
7203; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
7204; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7205; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
7206; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
7207; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
7208; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7209; GFX11-CU-NEXT:    s_endpgm
7210    i32 %in, i32* %out) {
7211entry:
7212  store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4
7213  ret void
7214}
7215
; Sequentially-consistent (seq_cst) atomic i32 store of %in to %out at
; syncscope "singlethread-one-as".
; For every check prefix below the expected code is a single flat store
; (flat_store_dword, or flat_store_b32 under the gfx11 prefixes) followed
; directly by s_endpgm, with only the gfx11 s_sendmsg in between. The one
; s_waitcnt in each expected sequence sits between the s_load instructions
; and the v_mov instructions that read their results.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
7216define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
7217; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store:
7218; GFX7:       ; %bb.0: ; %entry
7219; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
7220; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
7221; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7222; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7223; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7224; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7225; GFX7-NEXT:    flat_store_dword v[0:1], v2
7226; GFX7-NEXT:    s_endpgm
7227;
7228; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
7229; GFX10-WGP:       ; %bb.0: ; %entry
7230; GFX10-WGP-NEXT:    s_clause 0x1
7231; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7232; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
7233; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7234; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7235; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7236; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7237; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7238; GFX10-WGP-NEXT:    s_endpgm
7239;
7240; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
7241; GFX10-CU:       ; %bb.0: ; %entry
7242; GFX10-CU-NEXT:    s_clause 0x1
7243; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7244; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
7245; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7246; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7247; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7248; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7249; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7250; GFX10-CU-NEXT:    s_endpgm
7251;
7252; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_store:
7253; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7254; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
7255; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
7256; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7257; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7258; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7259; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7260; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7261; SKIP-CACHE-INV-NEXT:    s_endpgm
7262;
7263; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
7264; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7265; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7266; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
7267; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7268; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7269; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7270; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7271; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7272;
7273; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
7274; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7275; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7276; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
7277; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7278; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7279; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7280; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7281; GFX90A-TGSPLIT-NEXT:    s_endpgm
7282;
7283; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
7284; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7285; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
7286; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
7287; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7288; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7289; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7290; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7291; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7292;
7293; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
7294; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7295; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
7296; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
7297; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7298; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7299; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7300; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7301; GFX940-TGSPLIT-NEXT:    s_endpgm
7302;
7303; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
7304; GFX11-WGP:       ; %bb.0: ; %entry
7305; GFX11-WGP-NEXT:    s_clause 0x1
7306; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
7307; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
7308; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7309; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
7310; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
7311; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
7312; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7313; GFX11-WGP-NEXT:    s_endpgm
7314;
7315; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
7316; GFX11-CU:       ; %bb.0: ; %entry
7317; GFX11-CU-NEXT:    s_clause 0x1
7318; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
7319; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
7320; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7321; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
7322; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
7323; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
7324; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7325; GFX11-CU-NEXT:    s_endpgm
7326    i32 %in, i32* %out) {
7327entry:
7328  store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4
7329  ret void
7330}
7331
; Monotonic volatile atomicrmw xchg of %in into %out at syncscope
; "singlethread-one-as". The result %val has no users; the function only
; returns void after the exchange.
; For every check prefix below the expected code is a single
; `flat_atomic_swap v[0:1], v2` (flat_atomic_swap_b32 under the gfx11
; prefixes) followed directly by s_endpgm, with only the gfx11 s_sendmsg in
; between. The one s_waitcnt in each expected sequence sits between the
; s_load instructions and the v_mov instructions that read their results.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
7332define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
7333; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
7334; GFX7:       ; %bb.0: ; %entry
7335; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7336; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
7337; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7338; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7339; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7340; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7341; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
7342; GFX7-NEXT:    s_endpgm
7343;
7344; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
7345; GFX10-WGP:       ; %bb.0: ; %entry
7346; GFX10-WGP-NEXT:    s_clause 0x1
7347; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7348; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
7349; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7350; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7351; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7352; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7353; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
7354; GFX10-WGP-NEXT:    s_endpgm
7355;
7356; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
7357; GFX10-CU:       ; %bb.0: ; %entry
7358; GFX10-CU-NEXT:    s_clause 0x1
7359; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7360; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
7361; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7362; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7363; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7364; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7365; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
7366; GFX10-CU-NEXT:    s_endpgm
7367;
7368; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
7369; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7370; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7371; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
7372; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7373; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7374; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7375; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7376; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
7377; SKIP-CACHE-INV-NEXT:    s_endpgm
7378;
7379; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
7380; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7381; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7382; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
7383; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7384; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7385; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7386; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7387; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7388;
7389; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
7390; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7391; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7392; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
7393; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7394; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7395; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7396; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7397; GFX90A-TGSPLIT-NEXT:    s_endpgm
7398;
7399; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
7400; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7401; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7402; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
7403; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7404; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7405; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7406; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7407; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7408;
7409; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
7410; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7411; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7412; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
7413; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7414; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7415; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7416; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7417; GFX940-TGSPLIT-NEXT:    s_endpgm
7418;
7419; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
7420; GFX11-WGP:       ; %bb.0: ; %entry
7421; GFX11-WGP-NEXT:    s_clause 0x1
7422; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
7423; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
7424; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7425; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
7426; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
7427; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
7428; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7429; GFX11-WGP-NEXT:    s_endpgm
7430;
7431; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
7432; GFX11-CU:       ; %bb.0: ; %entry
7433; GFX11-CU-NEXT:    s_clause 0x1
7434; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
7435; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
7436; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7437; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
7438; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
7439; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
7440; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7441; GFX11-CU-NEXT:    s_endpgm
7442    i32* %out, i32 %in) {
7443entry:
7444  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic
7445  ret void
7446}
7447
; Acquire-ordered volatile atomicrmw xchg of %in into %out at syncscope
; "singlethread-one-as". The result %val has no users; the function only
; returns void after the exchange.
; For every check prefix below the expected code is a single
; `flat_atomic_swap v[0:1], v2` (flat_atomic_swap_b32 under the gfx11
; prefixes) followed directly by s_endpgm, with only the gfx11 s_sendmsg in
; between. The one s_waitcnt in each expected sequence sits between the
; s_load instructions and the v_mov instructions that read their results.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
7448define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
7449; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
7450; GFX7:       ; %bb.0: ; %entry
7451; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7452; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
7453; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7454; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7455; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7456; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7457; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
7458; GFX7-NEXT:    s_endpgm
7459;
7460; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
7461; GFX10-WGP:       ; %bb.0: ; %entry
7462; GFX10-WGP-NEXT:    s_clause 0x1
7463; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7464; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
7465; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7466; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7467; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7468; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7469; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
7470; GFX10-WGP-NEXT:    s_endpgm
7471;
7472; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
7473; GFX10-CU:       ; %bb.0: ; %entry
7474; GFX10-CU-NEXT:    s_clause 0x1
7475; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7476; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
7477; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7478; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7479; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7480; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7481; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
7482; GFX10-CU-NEXT:    s_endpgm
7483;
7484; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
7485; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7486; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7487; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
7488; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7489; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7490; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7491; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7492; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
7493; SKIP-CACHE-INV-NEXT:    s_endpgm
7494;
7495; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
7496; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7497; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7498; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
7499; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7500; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7501; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7502; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7503; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7504;
7505; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
7506; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7507; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7508; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
7509; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7510; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7511; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7512; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7513; GFX90A-TGSPLIT-NEXT:    s_endpgm
7514;
7515; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
7516; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7517; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7518; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
7519; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7520; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7521; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7522; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7523; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7524;
7525; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
7526; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7527; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7528; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
7529; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7530; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7531; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7532; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7533; GFX940-TGSPLIT-NEXT:    s_endpgm
7534;
7535; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
7536; GFX11-WGP:       ; %bb.0: ; %entry
7537; GFX11-WGP-NEXT:    s_clause 0x1
7538; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
7539; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
7540; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7541; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
7542; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
7543; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
7544; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7545; GFX11-WGP-NEXT:    s_endpgm
7546;
7547; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
7548; GFX11-CU:       ; %bb.0: ; %entry
7549; GFX11-CU-NEXT:    s_clause 0x1
7550; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
7551; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
7552; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7553; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
7554; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
7555; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
7556; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7557; GFX11-CU-NEXT:    s_endpgm
7558    i32* %out, i32 %in) {
7559entry:
7560  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire
7561  ret void
7562}
7563
; Release-ordered volatile atomicrmw xchg of %in into %out at syncscope
; "singlethread-one-as". The result %val has no users; the function only
; returns void after the exchange.
; For every check prefix below the expected code is a single
; `flat_atomic_swap v[0:1], v2` (flat_atomic_swap_b32 under the gfx11
; prefixes) followed directly by s_endpgm, with only the gfx11 s_sendmsg in
; between. The one s_waitcnt in each expected sequence sits between the
; s_load instructions and the v_mov instructions that read their results.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
7564define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
7565; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw:
7566; GFX7:       ; %bb.0: ; %entry
7567; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7568; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
7569; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7570; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7571; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7572; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7573; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
7574; GFX7-NEXT:    s_endpgm
7575;
7576; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
7577; GFX10-WGP:       ; %bb.0: ; %entry
7578; GFX10-WGP-NEXT:    s_clause 0x1
7579; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7580; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
7581; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7582; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7583; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7584; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7585; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
7586; GFX10-WGP-NEXT:    s_endpgm
7587;
7588; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
7589; GFX10-CU:       ; %bb.0: ; %entry
7590; GFX10-CU-NEXT:    s_clause 0x1
7591; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7592; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
7593; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7594; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7595; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7596; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7597; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
7598; GFX10-CU-NEXT:    s_endpgm
7599;
7600; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_atomicrmw:
7601; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7602; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7603; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
7604; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7605; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7606; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7607; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7608; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
7609; SKIP-CACHE-INV-NEXT:    s_endpgm
7610;
7611; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
7612; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7613; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7614; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
7615; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7616; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7617; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7618; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7619; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7620;
7621; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
7622; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7623; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7624; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
7625; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7626; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7627; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7628; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7629; GFX90A-TGSPLIT-NEXT:    s_endpgm
7630;
7631; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
7632; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7633; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7634; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
7635; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7636; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7637; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7638; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7639; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7640;
7641; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
7642; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7643; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7644; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
7645; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7646; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7647; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7648; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7649; GFX940-TGSPLIT-NEXT:    s_endpgm
7650;
7651; GFX11-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
7652; GFX11-WGP:       ; %bb.0: ; %entry
7653; GFX11-WGP-NEXT:    s_clause 0x1
7654; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
7655; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
7656; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7657; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
7658; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
7659; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
7660; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7661; GFX11-WGP-NEXT:    s_endpgm
7662;
7663; GFX11-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
7664; GFX11-CU:       ; %bb.0: ; %entry
7665; GFX11-CU-NEXT:    s_clause 0x1
7666; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
7667; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
7668; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7669; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
7670; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
7671; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
7672; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7673; GFX11-CU-NEXT:    s_endpgm
7674    i32* %out, i32 %in) {
7675entry:
7676  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release
7677  ret void
7678}
7679
; Acquire-release (acq_rel) volatile atomicrmw xchg of %in into %out at
; syncscope "singlethread-one-as". The result %val has no users; the function
; only returns void after the exchange.
; For every check prefix below the expected code is a single
; `flat_atomic_swap v[0:1], v2` (flat_atomic_swap_b32 under the gfx11
; prefixes) followed directly by s_endpgm, with only the gfx11 s_sendmsg in
; between. The one s_waitcnt in each expected sequence sits between the
; s_load instructions and the v_mov instructions that read their results.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
7680define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
7681; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
7682; GFX7:       ; %bb.0: ; %entry
7683; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7684; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
7685; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7686; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7687; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7688; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7689; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
7690; GFX7-NEXT:    s_endpgm
7691;
7692; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
7693; GFX10-WGP:       ; %bb.0: ; %entry
7694; GFX10-WGP-NEXT:    s_clause 0x1
7695; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7696; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
7697; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7698; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7699; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7700; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7701; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
7702; GFX10-WGP-NEXT:    s_endpgm
7703;
7704; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
7705; GFX10-CU:       ; %bb.0: ; %entry
7706; GFX10-CU-NEXT:    s_clause 0x1
7707; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7708; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
7709; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7710; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7711; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7712; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7713; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
7714; GFX10-CU-NEXT:    s_endpgm
7715;
7716; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
7717; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7718; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7719; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
7720; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7721; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7722; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7723; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7724; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
7725; SKIP-CACHE-INV-NEXT:    s_endpgm
7726;
7727; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
7728; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7729; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7730; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
7731; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7732; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7733; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7734; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7735; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7736;
7737; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
7738; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7739; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7740; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
7741; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7742; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7743; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7744; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7745; GFX90A-TGSPLIT-NEXT:    s_endpgm
7746;
7747; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
7748; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7749; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7750; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
7751; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7752; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7753; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7754; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7755; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7756;
7757; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
7758; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7759; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7760; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
7761; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7762; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7763; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7764; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7765; GFX940-TGSPLIT-NEXT:    s_endpgm
7766;
7767; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
7768; GFX11-WGP:       ; %bb.0: ; %entry
7769; GFX11-WGP-NEXT:    s_clause 0x1
7770; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
7771; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
7772; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7773; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
7774; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
7775; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
7776; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7777; GFX11-WGP-NEXT:    s_endpgm
7778;
7779; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
7780; GFX11-CU:       ; %bb.0: ; %entry
7781; GFX11-CU-NEXT:    s_clause 0x1
7782; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
7783; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
7784; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7785; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
7786; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
7787; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
7788; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7789; GFX11-CU-NEXT:    s_endpgm
7790    i32* %out, i32 %in) {
7791entry:
7792  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel
7793  ret void
7794}
7795
; Non-returning exchange with syncscope("singlethread-one-as") and seq_cst
; ordering. The atomicrmw result is never used, and every run line expects the
; same shape: load the kernel arguments, wait for them, copy them to VGPRs and
; issue one flat atomic swap without a glc/sc0 bit. Nothing follows the swap
; except s_endpgm (preceded by the VGPR dealloc message in the gfx1100 runs).
7796define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
7797; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
7798; GFX7:       ; %bb.0: ; %entry
7799; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7800; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
7801; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7802; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7803; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7804; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7805; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
7806; GFX7-NEXT:    s_endpgm
7807;
7808; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
7809; GFX10-WGP:       ; %bb.0: ; %entry
7810; GFX10-WGP-NEXT:    s_clause 0x1
7811; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7812; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
7813; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7814; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7815; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7816; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7817; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
7818; GFX10-WGP-NEXT:    s_endpgm
7819;
7820; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
7821; GFX10-CU:       ; %bb.0: ; %entry
7822; GFX10-CU-NEXT:    s_clause 0x1
7823; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7824; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
7825; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7826; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7827; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7828; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7829; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
7830; GFX10-CU-NEXT:    s_endpgm
7831;
7832; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
7833; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7834; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7835; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
7836; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7837; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7838; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7839; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7840; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
7841; SKIP-CACHE-INV-NEXT:    s_endpgm
7842;
7843; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
7844; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7845; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7846; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
7847; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7848; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7849; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7850; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7851; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7852;
7853; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
7854; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7855; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7856; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
7857; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7858; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7859; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7860; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7861; GFX90A-TGSPLIT-NEXT:    s_endpgm
7862;
7863; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
7864; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7865; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7866; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
7867; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7868; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7869; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7870; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7871; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7872;
7873; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
7874; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7875; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7876; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
7877; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7878; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7879; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7880; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
7881; GFX940-TGSPLIT-NEXT:    s_endpgm
7882;
7883; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
7884; GFX11-WGP:       ; %bb.0: ; %entry
7885; GFX11-WGP-NEXT:    s_clause 0x1
7886; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
7887; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
7888; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7889; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
7890; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
7891; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
7892; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7893; GFX11-WGP-NEXT:    s_endpgm
7894;
7895; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
7896; GFX11-CU:       ; %bb.0: ; %entry
7897; GFX11-CU-NEXT:    s_clause 0x1
7898; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
7899; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
7900; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7901; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
7902; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
7903; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
7904; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7905; GFX11-CU-NEXT:    s_endpgm
7906    i32* %out, i32 %in) {
7907entry:
; %val is deliberately left unused so the checks above cover the form of the
; swap that does not return a value.
7908  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst
7909  ret void
7910}
7911
; Returning exchange with syncscope("singlethread-one-as") and acquire
; ordering. The previous memory value is stored back through %out, so the swap
; has to return data: the checks expect the glc form (sc0 in the gfx940 runs)
; followed by an s_waitcnt and then the dependent flat store, with no other
; instruction in between. The two tgsplit runs expect a wait on vmcnt(0) only;
; every other run expects vmcnt(0) lgkmcnt(0).
7912define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
7913; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
7914; GFX7:       ; %bb.0: ; %entry
7915; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7916; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
7917; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7918; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7919; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7920; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7921; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
7922; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7923; GFX7-NEXT:    flat_store_dword v[0:1], v2
7924; GFX7-NEXT:    s_endpgm
7925;
7926; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
7927; GFX10-WGP:       ; %bb.0: ; %entry
7928; GFX10-WGP-NEXT:    s_clause 0x1
7929; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7930; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
7931; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7932; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7933; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7934; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7935; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
7936; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7937; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7938; GFX10-WGP-NEXT:    s_endpgm
7939;
7940; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
7941; GFX10-CU:       ; %bb.0: ; %entry
7942; GFX10-CU-NEXT:    s_clause 0x1
7943; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7944; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
7945; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7946; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7947; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7948; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7949; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
7950; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7951; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7952; GFX10-CU-NEXT:    s_endpgm
7953;
7954; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
7955; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7956; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7957; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
7958; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7959; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7960; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7961; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7962; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
7963; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7964; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7965; SKIP-CACHE-INV-NEXT:    s_endpgm
7966;
7967; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
7968; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7969; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7970; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
7971; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7972; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7973; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7974; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
7975; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7976; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7977; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7978;
7979; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
7980; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7981; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7982; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
7983; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7984; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7985; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
7986; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
7987; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7988; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7989; GFX90A-TGSPLIT-NEXT:    s_endpgm
7990;
7991; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
7992; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7993; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
7994; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
7995; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7996; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
7997; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
7998; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
7999; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8000; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8001; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8002;
8003; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
8004; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8005; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
8006; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
8007; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8008; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
8009; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
8010; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
8011; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8012; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8013; GFX940-TGSPLIT-NEXT:    s_endpgm
8014;
8015; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
8016; GFX11-WGP:       ; %bb.0: ; %entry
8017; GFX11-WGP-NEXT:    s_clause 0x1
8018; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
8019; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
8020; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8021; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
8022; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
8023; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
8024; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8025; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
8026; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
8027; GFX11-WGP-NEXT:    s_endpgm
8028;
8029; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
8030; GFX11-CU:       ; %bb.0: ; %entry
8031; GFX11-CU-NEXT:    s_clause 0x1
8032; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
8033; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
8034; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8035; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
8036; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
8037; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
8038; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8039; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
8040; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
8041; GFX11-CU-NEXT:    s_endpgm
8042    i32* %out, i32 %in) {
8043entry:
8044  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire
; Storing %val makes the exchanged value live, which is what selects the
; returning form of the swap in the checks above.
8045  store i32 %val, i32* %out, align 4
8046  ret void
8047}
8048
; Returning exchange with syncscope("singlethread-one-as") and acq_rel
; ordering. The previous memory value is stored back through %out. The checks
; expect the returning swap (glc, or sc0 in the gfx940 runs), then an s_waitcnt,
; then the dependent flat store, with no other instruction in between. The two
; tgsplit runs expect a wait on vmcnt(0) only; every other run expects
; vmcnt(0) lgkmcnt(0).
8049define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
8050; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
8051; GFX7:       ; %bb.0: ; %entry
8052; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8053; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
8054; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8055; GFX7-NEXT:    v_mov_b32_e32 v0, s0
8056; GFX7-NEXT:    v_mov_b32_e32 v1, s1
8057; GFX7-NEXT:    v_mov_b32_e32 v2, s2
8058; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
8059; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8060; GFX7-NEXT:    flat_store_dword v[0:1], v2
8061; GFX7-NEXT:    s_endpgm
8062;
8063; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
8064; GFX10-WGP:       ; %bb.0: ; %entry
8065; GFX10-WGP-NEXT:    s_clause 0x1
8066; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8067; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
8068; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8069; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
8070; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
8071; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
8072; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
8073; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8074; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8075; GFX10-WGP-NEXT:    s_endpgm
8076;
8077; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
8078; GFX10-CU:       ; %bb.0: ; %entry
8079; GFX10-CU-NEXT:    s_clause 0x1
8080; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8081; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
8082; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8083; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
8084; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
8085; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
8086; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
8087; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8088; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8089; GFX10-CU-NEXT:    s_endpgm
8090;
8091; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
8092; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8093; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
8094; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
8095; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8096; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8097; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8098; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
8099; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
8100; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8101; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8102; SKIP-CACHE-INV-NEXT:    s_endpgm
8103;
8104; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
8105; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8106; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8107; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
8108; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8109; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8110; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
8111; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
8112; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8113; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8114; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8115;
8116; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
8117; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8118; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8119; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
8120; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8121; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8122; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
8123; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
8124; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8125; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8126; GFX90A-TGSPLIT-NEXT:    s_endpgm
8127;
8128; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
8129; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8130; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
8131; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
8132; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8133; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
8134; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
8135; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
8136; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8137; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8138; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8139;
8140; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
8141; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8142; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
8143; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
8144; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8145; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
8146; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
8147; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
8148; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8149; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8150; GFX940-TGSPLIT-NEXT:    s_endpgm
8151;
8152; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
8153; GFX11-WGP:       ; %bb.0: ; %entry
8154; GFX11-WGP-NEXT:    s_clause 0x1
8155; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
8156; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
8157; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8158; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
8159; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
8160; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
8161; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8162; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
8163; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
8164; GFX11-WGP-NEXT:    s_endpgm
8165;
8166; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
8167; GFX11-CU:       ; %bb.0: ; %entry
8168; GFX11-CU-NEXT:    s_clause 0x1
8169; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
8170; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
8171; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8172; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
8173; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
8174; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
8175; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8176; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
8177; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
8178; GFX11-CU-NEXT:    s_endpgm
8179    i32* %out, i32 %in) {
8180entry:
8181  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel
; Storing %val makes the exchanged value live, which is what selects the
; returning form of the swap in the checks above.
8182  store i32 %val, i32* %out, align 4
8183  ret void
8184}
8185
; Returning exchange with syncscope("singlethread-one-as") and seq_cst
; ordering. The previous memory value is stored back through %out. The checks
; expect the returning swap (glc, or sc0 in the gfx940 runs), then an s_waitcnt,
; then the dependent flat store, with no other instruction in between. The two
; tgsplit runs expect a wait on vmcnt(0) only; every other run expects
; vmcnt(0) lgkmcnt(0).
8186define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
8187; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
8188; GFX7:       ; %bb.0: ; %entry
8189; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8190; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
8191; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8192; GFX7-NEXT:    v_mov_b32_e32 v0, s0
8193; GFX7-NEXT:    v_mov_b32_e32 v1, s1
8194; GFX7-NEXT:    v_mov_b32_e32 v2, s2
8195; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
8196; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8197; GFX7-NEXT:    flat_store_dword v[0:1], v2
8198; GFX7-NEXT:    s_endpgm
8199;
8200; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
8201; GFX10-WGP:       ; %bb.0: ; %entry
8202; GFX10-WGP-NEXT:    s_clause 0x1
8203; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8204; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
8205; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8206; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
8207; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
8208; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
8209; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
8210; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8211; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8212; GFX10-WGP-NEXT:    s_endpgm
8213;
8214; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
8215; GFX10-CU:       ; %bb.0: ; %entry
8216; GFX10-CU-NEXT:    s_clause 0x1
8217; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8218; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
8219; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8220; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
8221; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
8222; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
8223; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
8224; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8225; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8226; GFX10-CU-NEXT:    s_endpgm
8227;
8228; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
8229; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8230; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
8231; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
8232; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8233; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8234; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8235; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
8236; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
8237; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8238; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8239; SKIP-CACHE-INV-NEXT:    s_endpgm
8240;
8241; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
8242; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8243; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8244; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
8245; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8246; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8247; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
8248; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
8249; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8250; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8251; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8252;
8253; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
8254; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8255; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8256; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
8257; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8258; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8259; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
8260; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
8261; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8262; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8263; GFX90A-TGSPLIT-NEXT:    s_endpgm
8264;
8265; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
8266; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8267; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
8268; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
8269; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8270; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
8271; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
8272; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
8273; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8274; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8275; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8276;
8277; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
8278; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8279; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
8280; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
8281; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8282; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
8283; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
8284; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
8285; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8286; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8287; GFX940-TGSPLIT-NEXT:    s_endpgm
8288;
8289; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
8290; GFX11-WGP:       ; %bb.0: ; %entry
8291; GFX11-WGP-NEXT:    s_clause 0x1
8292; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
8293; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
8294; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8295; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
8296; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
8297; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
8298; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8299; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
8300; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
8301; GFX11-WGP-NEXT:    s_endpgm
8302;
8303; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
8304; GFX11-CU:       ; %bb.0: ; %entry
8305; GFX11-CU-NEXT:    s_clause 0x1
8306; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
8307; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
8308; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8309; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
8310; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
8311; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
8312; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8313; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
8314; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
8315; GFX11-CU-NEXT:    s_endpgm
8316    i32* %out, i32 %in) {
8317entry:
8318  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst
; Storing %val makes the exchanged value live, which is what selects the
; returning form of the swap in the checks above.
8319  store i32 %val, i32* %out, align 4
8320  ret void
8321}
8322
; Non-returning compare-and-swap with syncscope("singlethread-one-as") and
; monotonic success/failure orderings, performed on the i32 located four
; elements (16 bytes) past %out. The cmpxchg result is never used, and every
; run line expects a single flat_atomic_cmpswap without a glc/sc0 bit and
; nothing after it except s_endpgm (preceded by the VGPR dealloc message in the
; gfx1100 runs). The runs differ in how the 16-byte displacement is formed: the
; gfx700, gfx1010 and amdpal runs add it to the address with
; s_add_u32/s_addc_u32, while the gfx90a, gfx940 and gfx1100 runs carry it as
; offset:16 on the atomic itself.
8323define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
8324; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
8325; GFX7:       ; %bb.0: ; %entry
8326; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8327; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
8328; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8329; GFX7-NEXT:    s_add_u32 s0, s0, 16
8330; GFX7-NEXT:    s_addc_u32 s1, s1, 0
8331; GFX7-NEXT:    v_mov_b32_e32 v0, s0
8332; GFX7-NEXT:    v_mov_b32_e32 v2, s2
8333; GFX7-NEXT:    v_mov_b32_e32 v1, s1
8334; GFX7-NEXT:    v_mov_b32_e32 v3, s3
8335; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
8336; GFX7-NEXT:    s_endpgm
8337;
8338; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
8339; GFX10-WGP:       ; %bb.0: ; %entry
8340; GFX10-WGP-NEXT:    s_clause 0x1
8341; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8342; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8343; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8344; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
8345; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
8346; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
8347; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
8348; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
8349; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
8350; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
8351; GFX10-WGP-NEXT:    s_endpgm
8352;
8353; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
8354; GFX10-CU:       ; %bb.0: ; %entry
8355; GFX10-CU-NEXT:    s_clause 0x1
8356; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8357; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8358; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8359; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
8360; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
8361; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
8362; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
8363; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
8364; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
8365; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
8366; GFX10-CU-NEXT:    s_endpgm
8367;
8368; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
8369; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8370; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
8371; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
8372; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8373; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
8374; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
8375; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8376; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
8377; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8378; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
8379; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
8380; SKIP-CACHE-INV-NEXT:    s_endpgm
8381;
8382; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
8383; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8384; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8385; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8386; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8387; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8388; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8389; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
8390; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8391;
8392; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
8393; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8394; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8395; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8396; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8397; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8398; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8399; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
8400; GFX90A-TGSPLIT-NEXT:    s_endpgm
8401;
8402; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
8403; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8404; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
8405; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
8406; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8407; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
8408; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
8409; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
8410; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8411;
8412; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
8413; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8414; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
8415; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
8416; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8417; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
8418; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
8419; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
8420; GFX940-TGSPLIT-NEXT:    s_endpgm
8421;
8422; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
8423; GFX11-WGP:       ; %bb.0: ; %entry
8424; GFX11-WGP-NEXT:    s_clause 0x1
8425; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
8426; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
8427; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8428; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
8429; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
8430; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
8431; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
8432; GFX11-WGP-NEXT:    s_endpgm
8433;
8434; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
8435; GFX11-CU:       ; %bb.0: ; %entry
8436; GFX11-CU-NEXT:    s_clause 0x1
8437; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
8438; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
8439; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8440; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
8441; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
8442; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
8443; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
8444; GFX11-CU-NEXT:    s_endpgm
8445    i32* %out, i32 %in, i32 %old) {
8446entry:
; Index 4 of an i32 pointer is a 16-byte displacement; this is the constant
; that shows up as "16" in the checks above.
8447  %gep = getelementptr i32, i32* %out, i32 4
; %old is the expected value and %in the replacement. %val is left unused so
; the checks cover the non-returning form of the compare-and-swap.
8448  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
8449  ret void
8450}
8451
; Volatile i32 cmpxchg through %out at element index 4, using
; syncscope("singlethread-one-as") with success ordering acquire and failure
; ordering monotonic. The result (%val) is unused. In every checked
; configuration the only wait is the s_waitcnt lgkmcnt(0) that follows the two
; scalar argument loads; the atomic itself is a bare flat cmpswap with no
; additional wait or cache-invalidate instruction before or after it.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; i32 index 4 is byte offset 16; the checks show it either as an explicit
  ; s_add_u32/s_addc_u32 of 16 or folded into the instruction as offset:16.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
  ret void
}
8580
; Volatile i32 cmpxchg through %out at element index 4, using
; syncscope("singlethread-one-as") with success ordering release and failure
; ordering monotonic. The result (%val) is unused. In every checked
; configuration the only wait is the s_waitcnt lgkmcnt(0) that follows the two
; scalar argument loads; the atomic itself is a bare flat cmpswap with no
; additional wait or cache-writeback instruction before or after it.
define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; i32 index 4 is byte offset 16; the checks show it either as an explicit
  ; s_add_u32/s_addc_u32 of 16 or folded into the instruction as offset:16.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
  ret void
}
8709
; Volatile i32 cmpxchg through %out at element index 4, using
; syncscope("singlethread-one-as") with success ordering acq_rel and failure
; ordering monotonic. The result (%val) is unused. In every checked
; configuration the only wait is the s_waitcnt lgkmcnt(0) that follows the two
; scalar argument loads; the atomic itself is a bare flat cmpswap with no
; additional wait or cache-maintenance instruction before or after it.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; i32 index 4 is byte offset 16; the checks show it either as an explicit
  ; s_add_u32/s_addc_u32 of 16 or folded into the instruction as offset:16.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
  ret void
}
8838
; Volatile i32 cmpxchg through %out at element index 4, using
; syncscope("singlethread-one-as") with success ordering seq_cst and failure
; ordering monotonic. The result (%val) is unused. In every checked
; configuration the only wait is the s_waitcnt lgkmcnt(0) that follows the two
; scalar argument loads; the atomic itself is a bare flat cmpswap with no
; additional wait or cache-maintenance instruction before or after it.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; i32 index 4 is byte offset 16; the checks show it either as an explicit
  ; s_add_u32/s_addc_u32 of 16 or folded into the instruction as offset:16.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
  ret void
}
8967
8968define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
8969; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
8970; GFX7:       ; %bb.0: ; %entry
8971; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8972; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
8973; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8974; GFX7-NEXT:    s_add_u32 s0, s0, 16
8975; GFX7-NEXT:    s_addc_u32 s1, s1, 0
8976; GFX7-NEXT:    v_mov_b32_e32 v0, s0
8977; GFX7-NEXT:    v_mov_b32_e32 v2, s2
8978; GFX7-NEXT:    v_mov_b32_e32 v1, s1
8979; GFX7-NEXT:    v_mov_b32_e32 v3, s3
8980; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
8981; GFX7-NEXT:    s_endpgm
8982;
8983; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
8984; GFX10-WGP:       ; %bb.0: ; %entry
8985; GFX10-WGP-NEXT:    s_clause 0x1
8986; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8987; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8988; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8989; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
8990; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
8991; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
8992; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
8993; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
8994; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
8995; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
8996; GFX10-WGP-NEXT:    s_endpgm
8997;
8998; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
8999; GFX10-CU:       ; %bb.0: ; %entry
9000; GFX10-CU-NEXT:    s_clause 0x1
9001; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9002; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9003; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9004; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
9005; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
9006; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
9007; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
9008; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
9009; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
9010; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9011; GFX10-CU-NEXT:    s_endpgm
9012;
9013; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
9014; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9015; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9016; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
9017; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9018; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
9019; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
9020; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9021; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
9022; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9023; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
9024; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9025; SKIP-CACHE-INV-NEXT:    s_endpgm
9026;
9027; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
9028; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9029; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9030; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9031; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9032; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9033; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9034; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9035; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9036;
9037; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
9038; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9039; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9040; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9041; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9042; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9043; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9044; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9045; GFX90A-TGSPLIT-NEXT:    s_endpgm
9046;
9047; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
9048; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9049; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9050; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9051; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9052; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9053; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9054; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9055; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9056;
9057; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
9058; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9059; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9060; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9061; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9062; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9063; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9064; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9065; GFX940-TGSPLIT-NEXT:    s_endpgm
9066;
9067; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
9068; GFX11-WGP:       ; %bb.0: ; %entry
9069; GFX11-WGP-NEXT:    s_clause 0x1
9070; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9071; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9072; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9073; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9074; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9075; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9076; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9077; GFX11-WGP-NEXT:    s_endpgm
9078;
9079; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
9080; GFX11-CU:       ; %bb.0: ; %entry
9081; GFX11-CU-NEXT:    s_clause 0x1
9082; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9083; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9084; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9085; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9086; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9087; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9088; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9089; GFX11-CU-NEXT:    s_endpgm
9090    i32* %out, i32 %in, i32 %old) {
9091entry:
9092  %gep = getelementptr i32, i32* %out, i32 4
9093  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
9094  ret void
9095}
9096
; Kernel under test: a volatile i32 cmpxchg through an i32* that carries no
; address-space qualifier, addressed at element 4 of %out, using syncscope
; "singlethread-one-as" with success ordering acquire and failure ordering
; acquire. The cmpxchg result is never used.
; The autogenerated expectations below show a bare flat atomic compare-swap in
; every run configuration: the only s_waitcnt precedes the atomic and follows
; the scalar argument loads, and nothing comes after the atomic except the end
; of the program (preceded by the VGPR dealloc message in the gfx1100 runs).
; The 16-byte displacement is added with s_add_u32/s_addc_u32 in the gfx700 and
; gfx1010 runs and appears as the instruction's offset field in the others.
9097define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
9098; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
9099; GFX7:       ; %bb.0: ; %entry
9100; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9101; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
9102; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9103; GFX7-NEXT:    s_add_u32 s0, s0, 16
9104; GFX7-NEXT:    s_addc_u32 s1, s1, 0
9105; GFX7-NEXT:    v_mov_b32_e32 v0, s0
9106; GFX7-NEXT:    v_mov_b32_e32 v2, s2
9107; GFX7-NEXT:    v_mov_b32_e32 v1, s1
9108; GFX7-NEXT:    v_mov_b32_e32 v3, s3
9109; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9110; GFX7-NEXT:    s_endpgm
9111;
9112; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
9113; GFX10-WGP:       ; %bb.0: ; %entry
9114; GFX10-WGP-NEXT:    s_clause 0x1
9115; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9116; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9117; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9118; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
9119; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
9120; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
9121; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
9122; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
9123; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
9124; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9125; GFX10-WGP-NEXT:    s_endpgm
9126;
9127; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
9128; GFX10-CU:       ; %bb.0: ; %entry
9129; GFX10-CU-NEXT:    s_clause 0x1
9130; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9131; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9132; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9133; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
9134; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
9135; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
9136; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
9137; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
9138; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
9139; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9140; GFX10-CU-NEXT:    s_endpgm
9141;
9142; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
9143; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9144; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9145; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
9146; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9147; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
9148; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
9149; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9150; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
9151; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9152; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
9153; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9154; SKIP-CACHE-INV-NEXT:    s_endpgm
9155;
9156; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
9157; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9158; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9159; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9160; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9161; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9162; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9163; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9164; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9165;
9166; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
9167; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9168; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9169; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9170; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9171; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9172; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9173; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9174; GFX90A-TGSPLIT-NEXT:    s_endpgm
9175;
9176; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
9177; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9178; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9179; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9180; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9181; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9182; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9183; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9184; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9185;
9186; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
9187; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9188; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9189; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9190; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9191; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9192; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9193; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9194; GFX940-TGSPLIT-NEXT:    s_endpgm
9195;
9196; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
9197; GFX11-WGP:       ; %bb.0: ; %entry
9198; GFX11-WGP-NEXT:    s_clause 0x1
9199; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9200; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9201; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9202; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9203; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9204; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9205; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9206; GFX11-WGP-NEXT:    s_endpgm
9207;
9208; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
9209; GFX11-CU:       ; %bb.0: ; %entry
9210; GFX11-CU-NEXT:    s_clause 0x1
9211; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9212; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9213; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9214; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9215; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9216; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9217; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9218; GFX11-CU-NEXT:    s_endpgm
9219    i32* %out, i32 %in, i32 %old) {
9220entry:
  ; Address of element 4 of %out (4 x i32, i.e. the 16-byte displacement seen
  ; in the expected output above).
9221  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare the memory word against %old and write %in on a match; the
  ; returned pair is intentionally left unused.
9222  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
9223  ret void
9224}
9225
; Kernel under test: a volatile i32 cmpxchg through an i32* that carries no
; address-space qualifier, addressed at element 4 of %out, using syncscope
; "singlethread-one-as" with success ordering release and failure ordering
; acquire. The cmpxchg result is never used.
; The autogenerated expectations below show a bare flat atomic compare-swap in
; every run configuration: the only s_waitcnt precedes the atomic and follows
; the scalar argument loads, and nothing comes after the atomic except the end
; of the program (preceded by the VGPR dealloc message in the gfx1100 runs).
; The 16-byte displacement is added with s_add_u32/s_addc_u32 in the gfx700 and
; gfx1010 runs and appears as the instruction's offset field in the others.
9226define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
9227; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
9228; GFX7:       ; %bb.0: ; %entry
9229; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9230; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
9231; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9232; GFX7-NEXT:    s_add_u32 s0, s0, 16
9233; GFX7-NEXT:    s_addc_u32 s1, s1, 0
9234; GFX7-NEXT:    v_mov_b32_e32 v0, s0
9235; GFX7-NEXT:    v_mov_b32_e32 v2, s2
9236; GFX7-NEXT:    v_mov_b32_e32 v1, s1
9237; GFX7-NEXT:    v_mov_b32_e32 v3, s3
9238; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9239; GFX7-NEXT:    s_endpgm
9240;
9241; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
9242; GFX10-WGP:       ; %bb.0: ; %entry
9243; GFX10-WGP-NEXT:    s_clause 0x1
9244; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9245; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9246; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9247; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
9248; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
9249; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
9250; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
9251; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
9252; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
9253; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9254; GFX10-WGP-NEXT:    s_endpgm
9255;
9256; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
9257; GFX10-CU:       ; %bb.0: ; %entry
9258; GFX10-CU-NEXT:    s_clause 0x1
9259; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9260; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9261; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9262; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
9263; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
9264; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
9265; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
9266; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
9267; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
9268; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9269; GFX10-CU-NEXT:    s_endpgm
9270;
9271; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
9272; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9273; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9274; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
9275; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9276; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
9277; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
9278; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9279; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
9280; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9281; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
9282; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9283; SKIP-CACHE-INV-NEXT:    s_endpgm
9284;
9285; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
9286; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9287; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9288; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9289; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9290; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9291; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9292; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9293; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9294;
9295; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
9296; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9297; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9298; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9299; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9300; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9301; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9302; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9303; GFX90A-TGSPLIT-NEXT:    s_endpgm
9304;
9305; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
9306; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9307; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9308; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9309; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9310; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9311; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9312; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9313; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9314;
9315; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
9316; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9317; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9318; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9319; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9320; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9321; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9322; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9323; GFX940-TGSPLIT-NEXT:    s_endpgm
9324;
9325; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
9326; GFX11-WGP:       ; %bb.0: ; %entry
9327; GFX11-WGP-NEXT:    s_clause 0x1
9328; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9329; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9330; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9331; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9332; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9333; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9334; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9335; GFX11-WGP-NEXT:    s_endpgm
9336;
9337; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
9338; GFX11-CU:       ; %bb.0: ; %entry
9339; GFX11-CU-NEXT:    s_clause 0x1
9340; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9341; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9342; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9343; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9344; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9345; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9346; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9347; GFX11-CU-NEXT:    s_endpgm
9348    i32* %out, i32 %in, i32 %old) {
9349entry:
  ; Address of element 4 of %out (4 x i32, i.e. the 16-byte displacement seen
  ; in the expected output above).
9350  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare the memory word against %old and write %in on a match; the
  ; returned pair is intentionally left unused.
9351  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
9352  ret void
9353}
9354
; Kernel under test: a volatile i32 cmpxchg through an i32* that carries no
; address-space qualifier, addressed at element 4 of %out, using syncscope
; "singlethread-one-as" with success ordering acq_rel and failure ordering
; acquire. The cmpxchg result is never used.
; The autogenerated expectations below show a bare flat atomic compare-swap in
; every run configuration: the only s_waitcnt precedes the atomic and follows
; the scalar argument loads, and nothing comes after the atomic except the end
; of the program (preceded by the VGPR dealloc message in the gfx1100 runs).
; The 16-byte displacement is added with s_add_u32/s_addc_u32 in the gfx700 and
; gfx1010 runs and appears as the instruction's offset field in the others.
9355define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
9356; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
9357; GFX7:       ; %bb.0: ; %entry
9358; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9359; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
9360; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9361; GFX7-NEXT:    s_add_u32 s0, s0, 16
9362; GFX7-NEXT:    s_addc_u32 s1, s1, 0
9363; GFX7-NEXT:    v_mov_b32_e32 v0, s0
9364; GFX7-NEXT:    v_mov_b32_e32 v2, s2
9365; GFX7-NEXT:    v_mov_b32_e32 v1, s1
9366; GFX7-NEXT:    v_mov_b32_e32 v3, s3
9367; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9368; GFX7-NEXT:    s_endpgm
9369;
9370; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
9371; GFX10-WGP:       ; %bb.0: ; %entry
9372; GFX10-WGP-NEXT:    s_clause 0x1
9373; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9374; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9375; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9376; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
9377; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
9378; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
9379; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
9380; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
9381; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
9382; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9383; GFX10-WGP-NEXT:    s_endpgm
9384;
9385; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
9386; GFX10-CU:       ; %bb.0: ; %entry
9387; GFX10-CU-NEXT:    s_clause 0x1
9388; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9389; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9390; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9391; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
9392; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
9393; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
9394; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
9395; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
9396; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
9397; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9398; GFX10-CU-NEXT:    s_endpgm
9399;
9400; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
9401; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9402; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9403; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
9404; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9405; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
9406; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
9407; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9408; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
9409; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9410; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
9411; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9412; SKIP-CACHE-INV-NEXT:    s_endpgm
9413;
9414; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
9415; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9416; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9417; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9418; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9419; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9420; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9421; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9422; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9423;
9424; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
9425; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9426; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9427; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9428; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9429; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9430; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9431; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9432; GFX90A-TGSPLIT-NEXT:    s_endpgm
9433;
9434; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
9435; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9436; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9437; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9438; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9439; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9440; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9441; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9442; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9443;
9444; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
9445; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9446; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9447; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9448; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9449; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9450; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9451; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9452; GFX940-TGSPLIT-NEXT:    s_endpgm
9453;
9454; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
9455; GFX11-WGP:       ; %bb.0: ; %entry
9456; GFX11-WGP-NEXT:    s_clause 0x1
9457; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9458; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9459; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9460; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9461; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9462; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9463; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9464; GFX11-WGP-NEXT:    s_endpgm
9465;
9466; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
9467; GFX11-CU:       ; %bb.0: ; %entry
9468; GFX11-CU-NEXT:    s_clause 0x1
9469; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9470; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9471; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9472; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9473; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9474; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9475; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9476; GFX11-CU-NEXT:    s_endpgm
9477    i32* %out, i32 %in, i32 %old) {
9478entry:
  ; Address of element 4 of %out (4 x i32, i.e. the 16-byte displacement seen
  ; in the expected output above).
9479  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare the memory word against %old and write %in on a match; the
  ; returned pair is intentionally left unused.
9480  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
9481  ret void
9482}
9483
; Kernel under test: a volatile i32 cmpxchg through an i32* that carries no
; address-space qualifier, addressed at element 4 of %out, using syncscope
; "singlethread-one-as" with success ordering seq_cst and failure ordering
; acquire. The cmpxchg result is never used.
; The autogenerated expectations below show a bare flat atomic compare-swap in
; every run configuration: the only s_waitcnt precedes the atomic and follows
; the scalar argument loads, and nothing comes after the atomic except the end
; of the program (preceded by the VGPR dealloc message in the gfx1100 runs).
; The 16-byte displacement is added with s_add_u32/s_addc_u32 in the gfx700 and
; gfx1010 runs and appears as the instruction's offset field in the others.
9484define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
9485; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
9486; GFX7:       ; %bb.0: ; %entry
9487; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9488; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
9489; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9490; GFX7-NEXT:    s_add_u32 s0, s0, 16
9491; GFX7-NEXT:    s_addc_u32 s1, s1, 0
9492; GFX7-NEXT:    v_mov_b32_e32 v0, s0
9493; GFX7-NEXT:    v_mov_b32_e32 v2, s2
9494; GFX7-NEXT:    v_mov_b32_e32 v1, s1
9495; GFX7-NEXT:    v_mov_b32_e32 v3, s3
9496; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9497; GFX7-NEXT:    s_endpgm
9498;
9499; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
9500; GFX10-WGP:       ; %bb.0: ; %entry
9501; GFX10-WGP-NEXT:    s_clause 0x1
9502; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9503; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9504; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9505; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
9506; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
9507; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
9508; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
9509; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
9510; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
9511; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9512; GFX10-WGP-NEXT:    s_endpgm
9513;
9514; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
9515; GFX10-CU:       ; %bb.0: ; %entry
9516; GFX10-CU-NEXT:    s_clause 0x1
9517; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9518; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9519; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9520; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
9521; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
9522; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
9523; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
9524; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
9525; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
9526; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9527; GFX10-CU-NEXT:    s_endpgm
9528;
9529; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
9530; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9531; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9532; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
9533; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9534; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
9535; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
9536; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9537; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
9538; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9539; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
9540; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9541; SKIP-CACHE-INV-NEXT:    s_endpgm
9542;
9543; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
9544; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9545; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9546; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9547; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9548; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9549; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9550; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9551; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9552;
9553; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
9554; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9555; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9556; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9557; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9558; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9559; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9560; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9561; GFX90A-TGSPLIT-NEXT:    s_endpgm
9562;
9563; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
9564; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9565; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9566; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9567; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9568; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9569; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9570; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9571; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9572;
9573; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
9574; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9575; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9576; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9577; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9578; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9579; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9580; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9581; GFX940-TGSPLIT-NEXT:    s_endpgm
9582;
9583; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
9584; GFX11-WGP:       ; %bb.0: ; %entry
9585; GFX11-WGP-NEXT:    s_clause 0x1
9586; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9587; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9588; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9589; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9590; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9591; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9592; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9593; GFX11-WGP-NEXT:    s_endpgm
9594;
9595; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
9596; GFX11-CU:       ; %bb.0: ; %entry
9597; GFX11-CU-NEXT:    s_clause 0x1
9598; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9599; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9600; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9601; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9602; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9603; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9604; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9605; GFX11-CU-NEXT:    s_endpgm
9606    i32* %out, i32 %in, i32 %old) {
9607entry:
  ; Address of element 4 of %out (4 x i32, i.e. the 16-byte displacement seen
  ; in the expected output above).
9608  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare the memory word against %old and write %in on a match; the
  ; returned pair is intentionally left unused.
9609  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
9610  ret void
9611}
9612
9613define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
9614; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
9615; GFX7:       ; %bb.0: ; %entry
9616; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9617; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
9618; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9619; GFX7-NEXT:    s_add_u32 s0, s0, 16
9620; GFX7-NEXT:    s_addc_u32 s1, s1, 0
9621; GFX7-NEXT:    v_mov_b32_e32 v0, s0
9622; GFX7-NEXT:    v_mov_b32_e32 v2, s2
9623; GFX7-NEXT:    v_mov_b32_e32 v1, s1
9624; GFX7-NEXT:    v_mov_b32_e32 v3, s3
9625; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9626; GFX7-NEXT:    s_endpgm
9627;
9628; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
9629; GFX10-WGP:       ; %bb.0: ; %entry
9630; GFX10-WGP-NEXT:    s_clause 0x1
9631; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9632; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9633; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9634; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
9635; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
9636; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
9637; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
9638; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
9639; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
9640; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9641; GFX10-WGP-NEXT:    s_endpgm
9642;
9643; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
9644; GFX10-CU:       ; %bb.0: ; %entry
9645; GFX10-CU-NEXT:    s_clause 0x1
9646; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9647; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9648; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9649; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
9650; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
9651; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
9652; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
9653; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
9654; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
9655; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9656; GFX10-CU-NEXT:    s_endpgm
9657;
9658; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
9659; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9660; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9661; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
9662; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9663; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
9664; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
9665; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9666; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
9667; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9668; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
9669; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9670; SKIP-CACHE-INV-NEXT:    s_endpgm
9671;
9672; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
9673; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9674; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9675; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9676; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9677; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9678; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9679; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9680; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9681;
9682; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
9683; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9684; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9685; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9686; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9687; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9688; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9689; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9690; GFX90A-TGSPLIT-NEXT:    s_endpgm
9691;
9692; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
9693; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9694; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9695; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9696; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9697; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9698; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9699; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9700; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9701;
9702; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
9703; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9704; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9705; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9706; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9707; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9708; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9709; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9710; GFX940-TGSPLIT-NEXT:    s_endpgm
9711;
9712; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
9713; GFX11-WGP:       ; %bb.0: ; %entry
9714; GFX11-WGP-NEXT:    s_clause 0x1
9715; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9716; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9717; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9718; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9719; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9720; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9721; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9722; GFX11-WGP-NEXT:    s_endpgm
9723;
9724; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
9725; GFX11-CU:       ; %bb.0: ; %entry
9726; GFX11-CU-NEXT:    s_clause 0x1
9727; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9728; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9729; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9730; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9731; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9732; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9733; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9734; GFX11-CU-NEXT:    s_endpgm
9735    i32* %out, i32 %in, i32 %old) {
9736entry:
9737  %gep = getelementptr i32, i32* %out, i32 4
9738  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
9739  ret void
9740}
9741
; Test: volatile i32 cmpxchg at syncscope("singlethread-one-as") with success
; ordering acquire and failure ordering seq_cst. The address is %out advanced by
; four i32 elements (the 16-byte offset / "+16" add seen in the checks); %old is
; the compare value, %in the new value, and the result %val is never used.
; For every prefix the checks expect the flat_atomic_cmpswap to be followed only
; by s_endpgm (preceded by s_sendmsg MSG_DEALLOC_VGPRS on GFX11), i.e. no
; s_waitcnt or cache-invalidate instruction after the atomic.
9742define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
9743; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
9744; GFX7:       ; %bb.0: ; %entry
9745; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9746; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
9747; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9748; GFX7-NEXT:    s_add_u32 s0, s0, 16
9749; GFX7-NEXT:    s_addc_u32 s1, s1, 0
9750; GFX7-NEXT:    v_mov_b32_e32 v0, s0
9751; GFX7-NEXT:    v_mov_b32_e32 v2, s2
9752; GFX7-NEXT:    v_mov_b32_e32 v1, s1
9753; GFX7-NEXT:    v_mov_b32_e32 v3, s3
9754; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9755; GFX7-NEXT:    s_endpgm
9756;
9757; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
9758; GFX10-WGP:       ; %bb.0: ; %entry
9759; GFX10-WGP-NEXT:    s_clause 0x1
9760; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9761; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9762; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9763; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
9764; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
9765; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
9766; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
9767; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
9768; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
9769; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9770; GFX10-WGP-NEXT:    s_endpgm
9771;
9772; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
9773; GFX10-CU:       ; %bb.0: ; %entry
9774; GFX10-CU-NEXT:    s_clause 0x1
9775; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9776; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9777; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9778; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
9779; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
9780; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
9781; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
9782; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
9783; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
9784; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9785; GFX10-CU-NEXT:    s_endpgm
9786;
9787; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
9788; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9789; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9790; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
9791; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9792; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
9793; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
9794; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9795; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
9796; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9797; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
9798; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9799; SKIP-CACHE-INV-NEXT:    s_endpgm
9800;
9801; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
9802; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9803; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9804; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9805; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9806; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9807; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9808; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9809; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9810;
9811; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
9812; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9813; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9814; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9815; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9816; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9817; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9818; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9819; GFX90A-TGSPLIT-NEXT:    s_endpgm
9820;
9821; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
9822; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9823; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9824; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9825; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9826; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9827; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9828; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9829; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9830;
9831; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
9832; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9833; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9834; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9835; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9836; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9837; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9838; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9839; GFX940-TGSPLIT-NEXT:    s_endpgm
9840;
9841; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
9842; GFX11-WGP:       ; %bb.0: ; %entry
9843; GFX11-WGP-NEXT:    s_clause 0x1
9844; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9845; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9846; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9847; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9848; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9849; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9850; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9851; GFX11-WGP-NEXT:    s_endpgm
9852;
9853; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
9854; GFX11-CU:       ; %bb.0: ; %entry
9855; GFX11-CU-NEXT:    s_clause 0x1
9856; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9857; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9858; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9859; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9860; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9861; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9862; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9863; GFX11-CU-NEXT:    s_endpgm
9864    i32* %out, i32 %in, i32 %old) {
9865entry:
9866  %gep = getelementptr i32, i32* %out, i32 4
9867  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
9868  ret void
9869}
9870
; Test: volatile i32 cmpxchg at syncscope("singlethread-one-as") with success
; ordering release and failure ordering seq_cst. The address is %out advanced by
; four i32 elements (the 16-byte offset / "+16" add seen in the checks); %old is
; the compare value, %in the new value, and the result %val is never used.
; For every prefix the checks expect the flat_atomic_cmpswap to be followed only
; by s_endpgm (preceded by s_sendmsg MSG_DEALLOC_VGPRS on GFX11), i.e. no
; s_waitcnt or cache-invalidate instruction after the atomic.
9871define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
9872; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
9873; GFX7:       ; %bb.0: ; %entry
9874; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9875; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
9876; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9877; GFX7-NEXT:    s_add_u32 s0, s0, 16
9878; GFX7-NEXT:    s_addc_u32 s1, s1, 0
9879; GFX7-NEXT:    v_mov_b32_e32 v0, s0
9880; GFX7-NEXT:    v_mov_b32_e32 v2, s2
9881; GFX7-NEXT:    v_mov_b32_e32 v1, s1
9882; GFX7-NEXT:    v_mov_b32_e32 v3, s3
9883; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9884; GFX7-NEXT:    s_endpgm
9885;
9886; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
9887; GFX10-WGP:       ; %bb.0: ; %entry
9888; GFX10-WGP-NEXT:    s_clause 0x1
9889; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9890; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9891; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9892; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
9893; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
9894; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
9895; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
9896; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
9897; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
9898; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9899; GFX10-WGP-NEXT:    s_endpgm
9900;
9901; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
9902; GFX10-CU:       ; %bb.0: ; %entry
9903; GFX10-CU-NEXT:    s_clause 0x1
9904; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9905; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9906; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9907; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
9908; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
9909; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
9910; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
9911; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
9912; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
9913; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9914; GFX10-CU-NEXT:    s_endpgm
9915;
9916; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
9917; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9918; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9919; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
9920; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9921; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
9922; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
9923; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9924; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
9925; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9926; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
9927; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
9928; SKIP-CACHE-INV-NEXT:    s_endpgm
9929;
9930; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
9931; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9932; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9933; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9934; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9935; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9936; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9937; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9938; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9939;
9940; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
9941; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9942; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9943; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9944; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9945; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9946; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9947; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9948; GFX90A-TGSPLIT-NEXT:    s_endpgm
9949;
9950; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
9951; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9952; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9953; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9954; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9955; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9956; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9957; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9958; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9959;
9960; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
9961; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9962; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9963; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
9964; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9965; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9966; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
9967; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
9968; GFX940-TGSPLIT-NEXT:    s_endpgm
9969;
9970; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
9971; GFX11-WGP:       ; %bb.0: ; %entry
9972; GFX11-WGP-NEXT:    s_clause 0x1
9973; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9974; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9975; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9976; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9977; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9978; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9979; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9980; GFX11-WGP-NEXT:    s_endpgm
9981;
9982; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
9983; GFX11-CU:       ; %bb.0: ; %entry
9984; GFX11-CU-NEXT:    s_clause 0x1
9985; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
9986; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
9987; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9988; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
9989; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
9990; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
9991; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
9992; GFX11-CU-NEXT:    s_endpgm
9993    i32* %out, i32 %in, i32 %old) {
9994entry:
9995  %gep = getelementptr i32, i32* %out, i32 4
9996  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
9997  ret void
9998}
9999
; Test: volatile i32 cmpxchg at syncscope("singlethread-one-as") with success
; ordering acq_rel and failure ordering seq_cst. The address is %out advanced by
; four i32 elements (the 16-byte offset / "+16" add seen in the checks); %old is
; the compare value, %in the new value, and the result %val is never used.
; For every prefix the checks expect the flat_atomic_cmpswap to be followed only
; by s_endpgm (preceded by s_sendmsg MSG_DEALLOC_VGPRS on GFX11), i.e. no
; s_waitcnt or cache-invalidate instruction after the atomic.
10000define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
10001; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
10002; GFX7:       ; %bb.0: ; %entry
10003; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10004; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10005; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10006; GFX7-NEXT:    s_add_u32 s0, s0, 16
10007; GFX7-NEXT:    s_addc_u32 s1, s1, 0
10008; GFX7-NEXT:    v_mov_b32_e32 v0, s0
10009; GFX7-NEXT:    v_mov_b32_e32 v2, s2
10010; GFX7-NEXT:    v_mov_b32_e32 v1, s1
10011; GFX7-NEXT:    v_mov_b32_e32 v3, s3
10012; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
10013; GFX7-NEXT:    s_endpgm
10014;
10015; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
10016; GFX10-WGP:       ; %bb.0: ; %entry
10017; GFX10-WGP-NEXT:    s_clause 0x1
10018; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10019; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10020; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10021; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
10022; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
10023; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
10024; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
10025; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
10026; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
10027; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
10028; GFX10-WGP-NEXT:    s_endpgm
10029;
10030; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
10031; GFX10-CU:       ; %bb.0: ; %entry
10032; GFX10-CU-NEXT:    s_clause 0x1
10033; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10034; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10035; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10036; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
10037; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
10038; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
10039; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
10040; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
10041; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
10042; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
10043; GFX10-CU-NEXT:    s_endpgm
10044;
10045; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
10046; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10047; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10048; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
10049; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10050; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
10051; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
10052; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10053; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
10054; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10055; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10056; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
10057; SKIP-CACHE-INV-NEXT:    s_endpgm
10058;
10059; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
10060; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10061; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10062; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10063; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10064; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10065; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
10066; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
10067; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10068;
10069; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
10070; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10071; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10072; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10073; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10074; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10075; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
10076; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
10077; GFX90A-TGSPLIT-NEXT:    s_endpgm
10078;
10079; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
10080; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10081; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10082; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
10083; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10084; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10085; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
10086; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
10087; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10088;
10089; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
10090; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10091; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10092; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
10093; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10094; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10095; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
10096; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
10097; GFX940-TGSPLIT-NEXT:    s_endpgm
10098;
10099; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
10100; GFX11-WGP:       ; %bb.0: ; %entry
10101; GFX11-WGP-NEXT:    s_clause 0x1
10102; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
10103; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
10104; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10105; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
10106; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
10107; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
10108; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10109; GFX11-WGP-NEXT:    s_endpgm
10110;
10111; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
10112; GFX11-CU:       ; %bb.0: ; %entry
10113; GFX11-CU-NEXT:    s_clause 0x1
10114; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
10115; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
10116; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10117; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
10118; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
10119; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
10120; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10121; GFX11-CU-NEXT:    s_endpgm
10122    i32* %out, i32 %in, i32 %old) {
10123entry:
10124  %gep = getelementptr i32, i32* %out, i32 4
10125  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
10126  ret void
10127}
10128
; Test: volatile i32 cmpxchg at syncscope("singlethread-one-as") with success
; ordering seq_cst and failure ordering seq_cst. The address is %out advanced by
; four i32 elements (the 16-byte offset / "+16" add seen in the checks); %old is
; the compare value, %in the new value, and the result %val is never used.
; For every prefix the checks expect the flat_atomic_cmpswap to be followed only
; by s_endpgm (preceded by s_sendmsg MSG_DEALLOC_VGPRS on GFX11), i.e. no
; s_waitcnt or cache-invalidate instruction after the atomic.
10129define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
10130; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
10131; GFX7:       ; %bb.0: ; %entry
10132; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10133; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10134; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10135; GFX7-NEXT:    s_add_u32 s0, s0, 16
10136; GFX7-NEXT:    s_addc_u32 s1, s1, 0
10137; GFX7-NEXT:    v_mov_b32_e32 v0, s0
10138; GFX7-NEXT:    v_mov_b32_e32 v2, s2
10139; GFX7-NEXT:    v_mov_b32_e32 v1, s1
10140; GFX7-NEXT:    v_mov_b32_e32 v3, s3
10141; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
10142; GFX7-NEXT:    s_endpgm
10143;
10144; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
10145; GFX10-WGP:       ; %bb.0: ; %entry
10146; GFX10-WGP-NEXT:    s_clause 0x1
10147; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10148; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10149; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10150; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
10151; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
10152; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
10153; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
10154; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
10155; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
10156; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
10157; GFX10-WGP-NEXT:    s_endpgm
10158;
10159; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
10160; GFX10-CU:       ; %bb.0: ; %entry
10161; GFX10-CU-NEXT:    s_clause 0x1
10162; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10163; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10164; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10165; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
10166; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
10167; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
10168; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
10169; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
10170; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
10171; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
10172; GFX10-CU-NEXT:    s_endpgm
10173;
10174; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
10175; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10176; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10177; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
10178; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10179; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
10180; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
10181; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10182; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
10183; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10184; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10185; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
10186; SKIP-CACHE-INV-NEXT:    s_endpgm
10187;
10188; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
10189; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10190; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10191; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10192; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10193; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10194; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
10195; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
10196; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10197;
10198; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
10199; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10200; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10201; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10202; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10203; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10204; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
10205; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
10206; GFX90A-TGSPLIT-NEXT:    s_endpgm
10207;
10208; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
10209; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10210; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10211; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
10212; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10213; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10214; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
10215; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
10216; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10217;
10218; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
10219; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10220; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10221; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
10222; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10223; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10224; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
10225; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
10226; GFX940-TGSPLIT-NEXT:    s_endpgm
10227;
10228; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
10229; GFX11-WGP:       ; %bb.0: ; %entry
10230; GFX11-WGP-NEXT:    s_clause 0x1
10231; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
10232; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
10233; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10234; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
10235; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
10236; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
10237; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10238; GFX11-WGP-NEXT:    s_endpgm
10239;
10240; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
10241; GFX11-CU:       ; %bb.0: ; %entry
10242; GFX11-CU-NEXT:    s_clause 0x1
10243; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
10244; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
10245; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10246; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
10247; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
10248; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
10249; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10250; GFX11-CU-NEXT:    s_endpgm
10251    i32* %out, i32 %in, i32 %old) {
10252entry:
10253  %gep = getelementptr i32, i32* %out, i32 4
10254  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
10255  ret void
10256}
10257
10258define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
10259; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
10260; GFX7:       ; %bb.0: ; %entry
10261; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10262; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10263; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10264; GFX7-NEXT:    s_add_u32 s4, s0, 16
10265; GFX7-NEXT:    s_addc_u32 s5, s1, 0
10266; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10267; GFX7-NEXT:    v_mov_b32_e32 v2, s2
10268; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10269; GFX7-NEXT:    v_mov_b32_e32 v3, s3
10270; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10271; GFX7-NEXT:    v_mov_b32_e32 v0, s0
10272; GFX7-NEXT:    v_mov_b32_e32 v1, s1
10273; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10274; GFX7-NEXT:    flat_store_dword v[0:1], v2
10275; GFX7-NEXT:    s_endpgm
10276;
10277; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
10278; GFX10-WGP:       ; %bb.0: ; %entry
10279; GFX10-WGP-NEXT:    s_clause 0x1
10280; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10281; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10282; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10283; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
10284; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
10285; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
10286; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
10287; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
10288; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
10289; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10290; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
10291; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
10292; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10293; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10294; GFX10-WGP-NEXT:    s_endpgm
10295;
10296; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
10297; GFX10-CU:       ; %bb.0: ; %entry
10298; GFX10-CU-NEXT:    s_clause 0x1
10299; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10300; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10301; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10302; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
10303; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
10304; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10305; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
10306; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10307; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
10308; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10309; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
10310; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
10311; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10312; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10313; GFX10-CU-NEXT:    s_endpgm
10314;
10315; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
10316; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10317; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10318; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
10319; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10320; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
10321; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
10322; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
10323; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
10324; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
10325; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10326; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10327; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10328; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10329; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10330; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10331; SKIP-CACHE-INV-NEXT:    s_endpgm
10332;
10333; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
10334; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10335; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10336; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10337; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10338; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10339; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
10340; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10341; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10342; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10343; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10344;
10345; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
10346; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10347; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10348; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10349; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10350; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10351; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
10352; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10353; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10354; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10355; GFX90A-TGSPLIT-NEXT:    s_endpgm
10356;
10357; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
10358; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10359; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10360; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
10361; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10362; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10363; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
10364; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
10365; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10366; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10367; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10368;
10369; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
10370; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10371; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10372; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
10373; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10374; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10375; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
10376; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
10377; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10378; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10379; GFX940-TGSPLIT-NEXT:    s_endpgm
10380;
10381; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
10382; GFX11-WGP:       ; %bb.0: ; %entry
10383; GFX11-WGP-NEXT:    s_clause 0x1
10384; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
10385; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
10386; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10387; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
10388; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
10389; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
10390; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10391; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10392; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10393; GFX11-WGP-NEXT:    s_endpgm
10394;
10395; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
10396; GFX11-CU:       ; %bb.0: ; %entry
10397; GFX11-CU-NEXT:    s_clause 0x1
10398; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
10399; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
10400; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10401; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
10402; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
10403; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
10404; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10405; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10406; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10407; GFX11-CU-NEXT:    s_endpgm
10408    i32* %out, i32 %in, i32 %old) {
10409entry:
10410  %gep = getelementptr i32, i32* %out, i32 4
10411  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
10412  %val0 = extractvalue { i32, i1 } %val, 0
10413  store i32 %val0, i32* %out, align 4
10414  ret void
10415}
10416
; Kernel under test: a volatile i32 cmpxchg through %out at element index 4
; (byte offset 16 in the expected output), syncscope "singlethread-one-as",
; success ordering acquire, failure ordering monotonic. The value read by the
; cmpxchg is then stored to %out.
; In the expected code for every llc configuration, the returning cmpswap (glc,
; or sc0 on gfx940) is followed only by address moves, a single s_waitcnt and
; the store; the tgsplit configurations wait on vmcnt(0) only, all others on
; vmcnt(0) and lgkmcnt(0).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
10417define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
10418; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
10419; GFX7:       ; %bb.0: ; %entry
10420; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10421; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10422; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10423; GFX7-NEXT:    s_add_u32 s4, s0, 16
10424; GFX7-NEXT:    s_addc_u32 s5, s1, 0
10425; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10426; GFX7-NEXT:    v_mov_b32_e32 v2, s2
10427; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10428; GFX7-NEXT:    v_mov_b32_e32 v3, s3
10429; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10430; GFX7-NEXT:    v_mov_b32_e32 v0, s0
10431; GFX7-NEXT:    v_mov_b32_e32 v1, s1
10432; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10433; GFX7-NEXT:    flat_store_dword v[0:1], v2
10434; GFX7-NEXT:    s_endpgm
10435;
10436; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
10437; GFX10-WGP:       ; %bb.0: ; %entry
10438; GFX10-WGP-NEXT:    s_clause 0x1
10439; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10440; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10441; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10442; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
10443; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
10444; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
10445; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
10446; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
10447; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
10448; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10449; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
10450; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
10451; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10452; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10453; GFX10-WGP-NEXT:    s_endpgm
10454;
10455; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
10456; GFX10-CU:       ; %bb.0: ; %entry
10457; GFX10-CU-NEXT:    s_clause 0x1
10458; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10459; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10460; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10461; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
10462; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
10463; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10464; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
10465; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10466; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
10467; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10468; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
10469; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
10470; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10471; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10472; GFX10-CU-NEXT:    s_endpgm
10473;
10474; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
10475; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10476; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10477; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
10478; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10479; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
10480; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
10481; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
10482; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
10483; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
10484; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10485; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10486; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10487; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10488; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10489; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10490; SKIP-CACHE-INV-NEXT:    s_endpgm
10491;
10492; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
10493; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10494; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10495; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10496; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10497; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10498; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
10499; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10500; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10501; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10502; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10503;
10504; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
10505; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10506; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10507; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10508; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10509; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10510; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
10511; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10512; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10513; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10514; GFX90A-TGSPLIT-NEXT:    s_endpgm
10515;
10516; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
10517; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10518; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10519; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
10520; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10521; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10522; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
10523; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
10524; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10525; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10526; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10527;
10528; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
10529; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10530; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10531; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
10532; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10533; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10534; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
10535; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
10536; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10537; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10538; GFX940-TGSPLIT-NEXT:    s_endpgm
10539;
10540; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
10541; GFX11-WGP:       ; %bb.0: ; %entry
10542; GFX11-WGP-NEXT:    s_clause 0x1
10543; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
10544; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
10545; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10546; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
10547; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
10548; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
10549; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10550; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10551; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10552; GFX11-WGP-NEXT:    s_endpgm
10553;
10554; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
10555; GFX11-CU:       ; %bb.0: ; %entry
10556; GFX11-CU-NEXT:    s_clause 0x1
10557; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
10558; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
10559; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10560; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
10561; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
10562; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
10563; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10564; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10565; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10566; GFX11-CU-NEXT:    s_endpgm
10567    i32* %out, i32 %in, i32 %old) {
10568entry:
  ; Address of the i32 four elements past %out.
10569  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare the memory value against %old and, on a match, write %in.
10570  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
  ; Field 0 of the { i32, i1 } result is the value read from memory.
10571  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the result forces the returning form of the atomic in the output.
10572  store i32 %val0, i32* %out, align 4
10573  ret void
10574}
10575
; Kernel under test: a volatile i32 cmpxchg through %out at element index 4
; (byte offset 16 in the expected output), syncscope "singlethread-one-as",
; success ordering release, failure ordering monotonic. The value read by the
; cmpxchg is then stored to %out.
; In the expected code for every llc configuration, nothing but address setup
; precedes the returning cmpswap (glc, or sc0 on gfx940), and it is followed
; only by address moves, a single s_waitcnt and the store; the tgsplit
; configurations wait on vmcnt(0) only, all others on vmcnt(0) and lgkmcnt(0).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
10576define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg(
10577; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
10578; GFX7:       ; %bb.0: ; %entry
10579; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10580; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10581; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10582; GFX7-NEXT:    s_add_u32 s4, s0, 16
10583; GFX7-NEXT:    s_addc_u32 s5, s1, 0
10584; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10585; GFX7-NEXT:    v_mov_b32_e32 v2, s2
10586; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10587; GFX7-NEXT:    v_mov_b32_e32 v3, s3
10588; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10589; GFX7-NEXT:    v_mov_b32_e32 v0, s0
10590; GFX7-NEXT:    v_mov_b32_e32 v1, s1
10591; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10592; GFX7-NEXT:    flat_store_dword v[0:1], v2
10593; GFX7-NEXT:    s_endpgm
10594;
10595; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
10596; GFX10-WGP:       ; %bb.0: ; %entry
10597; GFX10-WGP-NEXT:    s_clause 0x1
10598; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10599; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10600; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10601; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
10602; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
10603; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
10604; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
10605; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
10606; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
10607; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10608; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
10609; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
10610; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10611; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10612; GFX10-WGP-NEXT:    s_endpgm
10613;
10614; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
10615; GFX10-CU:       ; %bb.0: ; %entry
10616; GFX10-CU-NEXT:    s_clause 0x1
10617; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10618; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10619; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10620; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
10621; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
10622; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10623; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
10624; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10625; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
10626; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10627; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
10628; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
10629; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10630; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10631; GFX10-CU-NEXT:    s_endpgm
10632;
10633; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
10634; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10635; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10636; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
10637; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10638; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
10639; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
10640; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
10641; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
10642; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
10643; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10644; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10645; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10646; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10647; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10648; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10649; SKIP-CACHE-INV-NEXT:    s_endpgm
10650;
10651; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
10652; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10653; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10654; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10655; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10656; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10657; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
10658; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10659; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10660; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10661; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10662;
10663; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
10664; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10665; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10666; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10667; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10668; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10669; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
10670; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10671; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10672; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10673; GFX90A-TGSPLIT-NEXT:    s_endpgm
10674;
10675; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
10676; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10677; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10678; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
10679; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10680; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10681; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
10682; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
10683; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10684; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10685; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10686;
10687; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
10688; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10689; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10690; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
10691; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10692; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10693; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
10694; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
10695; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10696; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10697; GFX940-TGSPLIT-NEXT:    s_endpgm
10698;
10699; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
10700; GFX11-WGP:       ; %bb.0: ; %entry
10701; GFX11-WGP-NEXT:    s_clause 0x1
10702; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
10703; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
10704; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10705; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
10706; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
10707; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
10708; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10709; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10710; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10711; GFX11-WGP-NEXT:    s_endpgm
10712;
10713; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
10714; GFX11-CU:       ; %bb.0: ; %entry
10715; GFX11-CU-NEXT:    s_clause 0x1
10716; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
10717; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
10718; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10719; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
10720; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
10721; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
10722; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10723; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10724; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10725; GFX11-CU-NEXT:    s_endpgm
10726    i32* %out, i32 %in, i32 %old) {
10727entry:
  ; Address of the i32 four elements past %out.
10728  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare the memory value against %old and, on a match, write %in.
10729  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
  ; Field 0 of the { i32, i1 } result is the value read from memory.
10730  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the result forces the returning form of the atomic in the output.
10731  store i32 %val0, i32* %out, align 4
10732  ret void
10733}
10734
; Kernel under test: a volatile i32 cmpxchg through %out at element index 4
; (byte offset 16 in the expected output), syncscope "singlethread-one-as",
; success ordering acq_rel, failure ordering monotonic. The value read by the
; cmpxchg is then stored to %out.
; In the expected code for every llc configuration, nothing but address setup
; precedes the returning cmpswap (glc, or sc0 on gfx940), and it is followed
; only by address moves, a single s_waitcnt and the store; the tgsplit
; configurations wait on vmcnt(0) only, all others on vmcnt(0) and lgkmcnt(0).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
10735define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
10736; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
10737; GFX7:       ; %bb.0: ; %entry
10738; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10739; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10740; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10741; GFX7-NEXT:    s_add_u32 s4, s0, 16
10742; GFX7-NEXT:    s_addc_u32 s5, s1, 0
10743; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10744; GFX7-NEXT:    v_mov_b32_e32 v2, s2
10745; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10746; GFX7-NEXT:    v_mov_b32_e32 v3, s3
10747; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10748; GFX7-NEXT:    v_mov_b32_e32 v0, s0
10749; GFX7-NEXT:    v_mov_b32_e32 v1, s1
10750; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10751; GFX7-NEXT:    flat_store_dword v[0:1], v2
10752; GFX7-NEXT:    s_endpgm
10753;
10754; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
10755; GFX10-WGP:       ; %bb.0: ; %entry
10756; GFX10-WGP-NEXT:    s_clause 0x1
10757; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10758; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10759; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10760; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
10761; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
10762; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
10763; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
10764; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
10765; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
10766; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10767; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
10768; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
10769; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10770; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10771; GFX10-WGP-NEXT:    s_endpgm
10772;
10773; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
10774; GFX10-CU:       ; %bb.0: ; %entry
10775; GFX10-CU-NEXT:    s_clause 0x1
10776; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10777; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10778; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10779; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
10780; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
10781; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10782; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
10783; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10784; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
10785; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10786; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
10787; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
10788; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10789; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10790; GFX10-CU-NEXT:    s_endpgm
10791;
10792; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
10793; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10794; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10795; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
10796; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10797; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
10798; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
10799; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
10800; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
10801; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
10802; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10803; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10804; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10805; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10806; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10807; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10808; SKIP-CACHE-INV-NEXT:    s_endpgm
10809;
10810; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
10811; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10812; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10813; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10814; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10815; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10816; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
10817; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10818; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10819; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10820; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10821;
10822; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
10823; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10824; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10825; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10826; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10827; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10828; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
10829; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10830; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10831; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10832; GFX90A-TGSPLIT-NEXT:    s_endpgm
10833;
10834; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
10835; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10836; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10837; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
10838; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10839; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10840; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
10841; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
10842; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10843; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10844; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10845;
10846; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
10847; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10848; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10849; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
10850; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10851; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10852; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
10853; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
10854; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10855; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10856; GFX940-TGSPLIT-NEXT:    s_endpgm
10857;
10858; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
10859; GFX11-WGP:       ; %bb.0: ; %entry
10860; GFX11-WGP-NEXT:    s_clause 0x1
10861; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
10862; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
10863; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10864; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
10865; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
10866; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
10867; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10868; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10869; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10870; GFX11-WGP-NEXT:    s_endpgm
10871;
10872; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
10873; GFX11-CU:       ; %bb.0: ; %entry
10874; GFX11-CU-NEXT:    s_clause 0x1
10875; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
10876; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
10877; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10878; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
10879; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
10880; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
10881; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10882; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10883; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10884; GFX11-CU-NEXT:    s_endpgm
10885    i32* %out, i32 %in, i32 %old) {
10886entry:
  ; Address of the i32 four elements past %out.
10887  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare the memory value against %old and, on a match, write %in.
10888  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
  ; Field 0 of the { i32, i1 } result is the value read from memory.
10889  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the result forces the returning form of the atomic in the output.
10890  store i32 %val0, i32* %out, align 4
10891  ret void
10892}
10893
; Returning flat cmpxchg at syncscope("singlethread-one-as") with seq_cst
; success ordering and monotonic failure ordering.
; The kernel compare-and-swaps the i32 located four elements (16 bytes) past
; %out, comparing against %old and writing %in, then stores the value the
; atomic read from memory back to %out.
; Each expected sequence below has a returning flat_atomic_cmpswap (glc, or
; sc0 under the gfx940 prefixes), then an s_waitcnt, then the flat store of
; the returned value.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
10894define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
10895; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
10896; GFX7:       ; %bb.0: ; %entry
10897; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10898; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10899; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10900; GFX7-NEXT:    s_add_u32 s4, s0, 16
10901; GFX7-NEXT:    s_addc_u32 s5, s1, 0
10902; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10903; GFX7-NEXT:    v_mov_b32_e32 v2, s2
10904; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10905; GFX7-NEXT:    v_mov_b32_e32 v3, s3
10906; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10907; GFX7-NEXT:    v_mov_b32_e32 v0, s0
10908; GFX7-NEXT:    v_mov_b32_e32 v1, s1
10909; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10910; GFX7-NEXT:    flat_store_dword v[0:1], v2
10911; GFX7-NEXT:    s_endpgm
10912;
10913; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
10914; GFX10-WGP:       ; %bb.0: ; %entry
10915; GFX10-WGP-NEXT:    s_clause 0x1
10916; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10917; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10918; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10919; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
10920; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
10921; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
10922; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
10923; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
10924; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
10925; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10926; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
10927; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
10928; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10929; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10930; GFX10-WGP-NEXT:    s_endpgm
10931;
10932; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
10933; GFX10-CU:       ; %bb.0: ; %entry
10934; GFX10-CU-NEXT:    s_clause 0x1
10935; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10936; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10937; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10938; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
10939; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
10940; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10941; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
10942; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10943; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
10944; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10945; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
10946; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
10947; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10948; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10949; GFX10-CU-NEXT:    s_endpgm
10950;
10951; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
10952; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10953; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10954; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
10955; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10956; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
10957; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
10958; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
10959; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
10960; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
10961; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10962; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10963; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10964; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10965; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10966; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10967; SKIP-CACHE-INV-NEXT:    s_endpgm
10968;
10969; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
10970; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10971; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10972; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10973; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10974; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10975; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
10976; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10977; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10978; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10979; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10980;
10981; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
10982; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10983; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10984; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10985; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10986; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10987; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
10988; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10989; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10990; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10991; GFX90A-TGSPLIT-NEXT:    s_endpgm
10992;
10993; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
10994; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10995; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10996; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
10997; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10998; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10999; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
11000; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
11001; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11002; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11003; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11004;
11005; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
11006; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11007; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11008; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
11009; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11010; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11011; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
11012; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
11013; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11014; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11015; GFX940-TGSPLIT-NEXT:    s_endpgm
11016;
11017; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
11018; GFX11-WGP:       ; %bb.0: ; %entry
11019; GFX11-WGP-NEXT:    s_clause 0x1
11020; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
11021; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
11022; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11023; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
11024; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
11025; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11026; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11027; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11028; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11029; GFX11-WGP-NEXT:    s_endpgm
11030;
11031; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
11032; GFX11-CU:       ; %bb.0: ; %entry
11033; GFX11-CU-NEXT:    s_clause 0x1
11034; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
11035; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
11036; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11037; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
11038; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
11039; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11040; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11041; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
11042; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11043; GFX11-CU-NEXT:    s_endpgm
11044    i32* %out, i32 %in, i32 %old) {
11045entry:
  ; Address of the i32 at index 4 from %out (a 16-byte offset).
11046  %gep = getelementptr i32, i32* %out, i32 4
  ; Operand order is pointer, expected value (%old), replacement (%in).
11047  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
  ; Field 0 of the { i32, i1 } result is the value read from memory.
11048  %val0 = extractvalue { i32, i1 } %val, 0
11049  store i32 %val0, i32* %out, align 4
11050  ret void
11051}
11052
; Returning flat cmpxchg at syncscope("singlethread-one-as") with monotonic
; success ordering and acquire failure ordering.
; The kernel compare-and-swaps the i32 located four elements (16 bytes) past
; %out, comparing against %old and writing %in, then stores the value the
; atomic read from memory back to %out.
; Each expected sequence below has a returning flat_atomic_cmpswap (glc, or
; sc0 under the gfx940 prefixes), then an s_waitcnt, then the flat store of
; the returned value.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
11053define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
11054; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
11055; GFX7:       ; %bb.0: ; %entry
11056; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11057; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
11058; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11059; GFX7-NEXT:    s_add_u32 s4, s0, 16
11060; GFX7-NEXT:    s_addc_u32 s5, s1, 0
11061; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11062; GFX7-NEXT:    v_mov_b32_e32 v2, s2
11063; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11064; GFX7-NEXT:    v_mov_b32_e32 v3, s3
11065; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11066; GFX7-NEXT:    v_mov_b32_e32 v0, s0
11067; GFX7-NEXT:    v_mov_b32_e32 v1, s1
11068; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11069; GFX7-NEXT:    flat_store_dword v[0:1], v2
11070; GFX7-NEXT:    s_endpgm
11071;
11072; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
11073; GFX10-WGP:       ; %bb.0: ; %entry
11074; GFX10-WGP-NEXT:    s_clause 0x1
11075; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11076; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11077; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11078; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
11079; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
11080; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11081; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
11082; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11083; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
11084; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11085; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
11086; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
11087; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11088; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
11089; GFX10-WGP-NEXT:    s_endpgm
11090;
11091; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
11092; GFX10-CU:       ; %bb.0: ; %entry
11093; GFX10-CU-NEXT:    s_clause 0x1
11094; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11095; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11096; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11097; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
11098; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
11099; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11100; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
11101; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11102; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
11103; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11104; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
11105; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
11106; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11107; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
11108; GFX10-CU-NEXT:    s_endpgm
11109;
11110; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
11111; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11112; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11113; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
11114; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11115; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
11116; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
11117; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11118; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
11119; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
11120; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11121; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11122; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11123; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11124; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11125; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
11126; SKIP-CACHE-INV-NEXT:    s_endpgm
11127;
11128; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
11129; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11130; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11131; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11132; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11133; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
11134; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
11135; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11136; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11137; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11138; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11139;
11140; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
11141; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11142; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11143; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11144; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11145; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
11146; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
11147; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11148; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11149; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11150; GFX90A-TGSPLIT-NEXT:    s_endpgm
11151;
11152; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
11153; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11154; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11155; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
11156; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11157; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11158; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
11159; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
11160; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11161; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11162; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11163;
11164; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
11165; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11166; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11167; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
11168; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11169; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11170; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
11171; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
11172; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11173; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11174; GFX940-TGSPLIT-NEXT:    s_endpgm
11175;
11176; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
11177; GFX11-WGP:       ; %bb.0: ; %entry
11178; GFX11-WGP-NEXT:    s_clause 0x1
11179; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
11180; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
11181; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11182; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
11183; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
11184; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11185; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11186; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11187; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11188; GFX11-WGP-NEXT:    s_endpgm
11189;
11190; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
11191; GFX11-CU:       ; %bb.0: ; %entry
11192; GFX11-CU-NEXT:    s_clause 0x1
11193; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
11194; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
11195; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11196; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
11197; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
11198; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11199; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11200; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
11201; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11202; GFX11-CU-NEXT:    s_endpgm
11203    i32* %out, i32 %in, i32 %old) {
11204entry:
  ; Address of the i32 at index 4 from %out (a 16-byte offset).
11205  %gep = getelementptr i32, i32* %out, i32 4
  ; Operand order is pointer, expected value (%old), replacement (%in).
11206  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory.
11207  %val0 = extractvalue { i32, i1 } %val, 0
11208  store i32 %val0, i32* %out, align 4
11209  ret void
11210}
11211
; Returning flat cmpxchg at syncscope("singlethread-one-as") with acquire
; success ordering and acquire failure ordering.
; The kernel compare-and-swaps the i32 located four elements (16 bytes) past
; %out, comparing against %old and writing %in, then stores the value the
; atomic read from memory back to %out.
; Each expected sequence below has a returning flat_atomic_cmpswap (glc, or
; sc0 under the gfx940 prefixes), then an s_waitcnt, then the flat store of
; the returned value.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
11212define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
11213; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
11214; GFX7:       ; %bb.0: ; %entry
11215; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11216; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
11217; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11218; GFX7-NEXT:    s_add_u32 s4, s0, 16
11219; GFX7-NEXT:    s_addc_u32 s5, s1, 0
11220; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11221; GFX7-NEXT:    v_mov_b32_e32 v2, s2
11222; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11223; GFX7-NEXT:    v_mov_b32_e32 v3, s3
11224; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11225; GFX7-NEXT:    v_mov_b32_e32 v0, s0
11226; GFX7-NEXT:    v_mov_b32_e32 v1, s1
11227; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11228; GFX7-NEXT:    flat_store_dword v[0:1], v2
11229; GFX7-NEXT:    s_endpgm
11230;
11231; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
11232; GFX10-WGP:       ; %bb.0: ; %entry
11233; GFX10-WGP-NEXT:    s_clause 0x1
11234; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11235; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11236; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11237; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
11238; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
11239; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11240; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
11241; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11242; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
11243; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11244; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
11245; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
11246; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11247; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
11248; GFX10-WGP-NEXT:    s_endpgm
11249;
11250; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
11251; GFX10-CU:       ; %bb.0: ; %entry
11252; GFX10-CU-NEXT:    s_clause 0x1
11253; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11254; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11255; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11256; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
11257; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
11258; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11259; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
11260; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11261; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
11262; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11263; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
11264; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
11265; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11266; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
11267; GFX10-CU-NEXT:    s_endpgm
11268;
11269; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
11270; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11271; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11272; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
11273; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11274; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
11275; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
11276; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11277; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
11278; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
11279; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11280; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11281; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11282; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11283; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11284; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
11285; SKIP-CACHE-INV-NEXT:    s_endpgm
11286;
11287; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
11288; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11289; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11290; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11291; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11292; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
11293; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
11294; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11295; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11296; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11297; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11298;
11299; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
11300; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11301; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11302; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11303; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11304; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
11305; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
11306; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11307; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11308; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11309; GFX90A-TGSPLIT-NEXT:    s_endpgm
11310;
11311; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
11312; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11313; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11314; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
11315; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11316; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11317; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
11318; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
11319; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11320; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11321; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11322;
11323; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
11324; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11325; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11326; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
11327; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11328; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11329; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
11330; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
11331; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11332; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11333; GFX940-TGSPLIT-NEXT:    s_endpgm
11334;
11335; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
11336; GFX11-WGP:       ; %bb.0: ; %entry
11337; GFX11-WGP-NEXT:    s_clause 0x1
11338; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
11339; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
11340; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11341; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
11342; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
11343; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11344; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11345; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11346; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11347; GFX11-WGP-NEXT:    s_endpgm
11348;
11349; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
11350; GFX11-CU:       ; %bb.0: ; %entry
11351; GFX11-CU-NEXT:    s_clause 0x1
11352; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
11353; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
11354; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11355; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
11356; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
11357; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11358; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11359; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
11360; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11361; GFX11-CU-NEXT:    s_endpgm
11362    i32* %out, i32 %in, i32 %old) {
11363entry:
  ; Address of the i32 at index 4 from %out (a 16-byte offset).
11364  %gep = getelementptr i32, i32* %out, i32 4
  ; Operand order is pointer, expected value (%old), replacement (%in).
11365  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory.
11366  %val0 = extractvalue { i32, i1 } %val, 0
11367  store i32 %val0, i32* %out, align 4
11368  ret void
11369}
11370
; Returning flat cmpxchg at syncscope("singlethread-one-as") with release
; success ordering and acquire failure ordering.
; The kernel compare-and-swaps the i32 located four elements (16 bytes) past
; %out, comparing against %old and writing %in, then stores the value the
; atomic read from memory back to %out.
; Each expected sequence below has a returning flat_atomic_cmpswap (glc, or
; sc0 under the gfx940 prefixes), then an s_waitcnt, then the flat store of
; the returned value.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
11371define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
11372; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
11373; GFX7:       ; %bb.0: ; %entry
11374; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11375; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
11376; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11377; GFX7-NEXT:    s_add_u32 s4, s0, 16
11378; GFX7-NEXT:    s_addc_u32 s5, s1, 0
11379; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11380; GFX7-NEXT:    v_mov_b32_e32 v2, s2
11381; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11382; GFX7-NEXT:    v_mov_b32_e32 v3, s3
11383; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11384; GFX7-NEXT:    v_mov_b32_e32 v0, s0
11385; GFX7-NEXT:    v_mov_b32_e32 v1, s1
11386; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11387; GFX7-NEXT:    flat_store_dword v[0:1], v2
11388; GFX7-NEXT:    s_endpgm
11389;
11390; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
11391; GFX10-WGP:       ; %bb.0: ; %entry
11392; GFX10-WGP-NEXT:    s_clause 0x1
11393; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11394; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11395; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11396; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
11397; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
11398; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11399; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
11400; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11401; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
11402; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11403; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
11404; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
11405; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11406; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
11407; GFX10-WGP-NEXT:    s_endpgm
11408;
11409; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
11410; GFX10-CU:       ; %bb.0: ; %entry
11411; GFX10-CU-NEXT:    s_clause 0x1
11412; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11413; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11414; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11415; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
11416; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
11417; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11418; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
11419; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11420; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
11421; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11422; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
11423; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
11424; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11425; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
11426; GFX10-CU-NEXT:    s_endpgm
11427;
11428; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
11429; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11430; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11431; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
11432; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11433; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
11434; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
11435; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11436; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
11437; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
11438; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11439; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11440; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11441; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11442; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11443; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
11444; SKIP-CACHE-INV-NEXT:    s_endpgm
11445;
11446; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
11447; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11448; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11449; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11450; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11451; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
11452; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
11453; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11454; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11455; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11456; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11457;
11458; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
11459; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11460; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11461; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11462; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11463; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
11464; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
11465; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11466; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11467; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11468; GFX90A-TGSPLIT-NEXT:    s_endpgm
11469;
11470; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
11471; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11472; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11473; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
11474; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11475; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11476; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
11477; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
11478; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11479; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11480; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11481;
11482; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
11483; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11484; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11485; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
11486; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11487; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11488; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
11489; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
11490; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11491; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11492; GFX940-TGSPLIT-NEXT:    s_endpgm
11493;
11494; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
11495; GFX11-WGP:       ; %bb.0: ; %entry
11496; GFX11-WGP-NEXT:    s_clause 0x1
11497; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
11498; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
11499; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11500; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
11501; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
11502; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11503; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11504; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11505; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11506; GFX11-WGP-NEXT:    s_endpgm
11507;
11508; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
11509; GFX11-CU:       ; %bb.0: ; %entry
11510; GFX11-CU-NEXT:    s_clause 0x1
11511; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
11512; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
11513; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11514; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
11515; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
11516; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11517; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11518; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
11519; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11520; GFX11-CU-NEXT:    s_endpgm
11521    i32* %out, i32 %in, i32 %old) {
11522entry:
  ; Address of the i32 at index 4 from %out (a 16-byte offset).
11523  %gep = getelementptr i32, i32* %out, i32 4
  ; Operand order is pointer, expected value (%old), replacement (%in).
11524  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory.
11525  %val0 = extractvalue { i32, i1 } %val, 0
11526  store i32 %val0, i32* %out, align 4
11527  ret void
11528}
11529
; Kernel under test: a volatile i32 cmpxchg through a pointer in the default
; address space, with syncscope("singlethread-one-as"), success ordering
; acq_rel and failure ordering acquire. The value produced by the cmpxchg is
; written back through %out, so the returning form of the atomic is exercised.
;
; The per-target comment groups between the define line and the parameter list
; are the autogenerated llc expectations mentioned in the note at the top of
; this file. Regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
11530define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
11531; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
11532; GFX7:       ; %bb.0: ; %entry
11533; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11534; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
11535; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11536; GFX7-NEXT:    s_add_u32 s4, s0, 16
11537; GFX7-NEXT:    s_addc_u32 s5, s1, 0
11538; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11539; GFX7-NEXT:    v_mov_b32_e32 v2, s2
11540; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11541; GFX7-NEXT:    v_mov_b32_e32 v3, s3
11542; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11543; GFX7-NEXT:    v_mov_b32_e32 v0, s0
11544; GFX7-NEXT:    v_mov_b32_e32 v1, s1
11545; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11546; GFX7-NEXT:    flat_store_dword v[0:1], v2
11547; GFX7-NEXT:    s_endpgm
11548;
11549; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
11550; GFX10-WGP:       ; %bb.0: ; %entry
11551; GFX10-WGP-NEXT:    s_clause 0x1
11552; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11553; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11554; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11555; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
11556; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
11557; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11558; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
11559; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11560; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
11561; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11562; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
11563; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
11564; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11565; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
11566; GFX10-WGP-NEXT:    s_endpgm
11567;
11568; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
11569; GFX10-CU:       ; %bb.0: ; %entry
11570; GFX10-CU-NEXT:    s_clause 0x1
11571; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11572; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11573; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11574; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
11575; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
11576; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11577; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
11578; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11579; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
11580; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11581; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
11582; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
11583; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11584; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
11585; GFX10-CU-NEXT:    s_endpgm
11586;
11587; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
11588; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11589; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11590; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
11591; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11592; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
11593; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
11594; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11595; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
11596; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
11597; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11598; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11599; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11600; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11601; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11602; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
11603; SKIP-CACHE-INV-NEXT:    s_endpgm
11604;
11605; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
11606; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11607; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11608; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11609; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11610; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
11611; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
11612; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11613; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11614; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11615; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11616;
11617; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
11618; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11619; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11620; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11621; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11622; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
11623; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
11624; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11625; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11626; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11627; GFX90A-TGSPLIT-NEXT:    s_endpgm
11628;
11629; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
11630; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11631; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11632; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
11633; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11634; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11635; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
11636; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
11637; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11638; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11639; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11640;
11641; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
11642; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11643; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11644; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
11645; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11646; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11647; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
11648; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
11649; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11650; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11651; GFX940-TGSPLIT-NEXT:    s_endpgm
11652;
11653; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
11654; GFX11-WGP:       ; %bb.0: ; %entry
11655; GFX11-WGP-NEXT:    s_clause 0x1
11656; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
11657; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
11658; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11659; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
11660; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
11661; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11662; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11663; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11664; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11665; GFX11-WGP-NEXT:    s_endpgm
11666;
11667; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
11668; GFX11-CU:       ; %bb.0: ; %entry
11669; GFX11-CU-NEXT:    s_clause 0x1
11670; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
11671; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
11672; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11673; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
11674; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
11675; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11676; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11677; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
11678; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11679; GFX11-CU-NEXT:    s_endpgm
11680    i32* %out, i32 %in, i32 %old) {
11681entry:
  ; Address of element 4 of %out, i.e. 16 bytes past the base; the expectations
  ; above show this either as an explicit add of 16 or as "offset:16".
11682  %gep = getelementptr i32, i32* %out, i32 4
  ; cmpxchg takes (pointer, expected, replacement): %old is the comparand and
  ; %in is the value written when the comparison succeeds.
11683  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory.
11684  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write that value to the base pointer so the atomic's result is live.
11685  store i32 %val0, i32* %out, align 4
11686  ret void
11687}
11688
; Kernel under test: a volatile i32 cmpxchg through a pointer in the default
; address space, with syncscope("singlethread-one-as"), success ordering
; seq_cst and failure ordering acquire. The value produced by the cmpxchg is
; written back through %out, so the returning form of the atomic is exercised.
;
; The per-target comment groups between the define line and the parameter list
; are the autogenerated llc expectations mentioned in the note at the top of
; this file. Regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
11689define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
11690; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
11691; GFX7:       ; %bb.0: ; %entry
11692; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11693; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
11694; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11695; GFX7-NEXT:    s_add_u32 s4, s0, 16
11696; GFX7-NEXT:    s_addc_u32 s5, s1, 0
11697; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11698; GFX7-NEXT:    v_mov_b32_e32 v2, s2
11699; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11700; GFX7-NEXT:    v_mov_b32_e32 v3, s3
11701; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11702; GFX7-NEXT:    v_mov_b32_e32 v0, s0
11703; GFX7-NEXT:    v_mov_b32_e32 v1, s1
11704; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11705; GFX7-NEXT:    flat_store_dword v[0:1], v2
11706; GFX7-NEXT:    s_endpgm
11707;
11708; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
11709; GFX10-WGP:       ; %bb.0: ; %entry
11710; GFX10-WGP-NEXT:    s_clause 0x1
11711; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11712; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11713; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11714; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
11715; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
11716; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11717; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
11718; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11719; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
11720; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11721; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
11722; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
11723; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11724; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
11725; GFX10-WGP-NEXT:    s_endpgm
11726;
11727; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
11728; GFX10-CU:       ; %bb.0: ; %entry
11729; GFX10-CU-NEXT:    s_clause 0x1
11730; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11731; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11732; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11733; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
11734; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
11735; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11736; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
11737; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11738; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
11739; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11740; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
11741; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
11742; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11743; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
11744; GFX10-CU-NEXT:    s_endpgm
11745;
11746; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
11747; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11748; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11749; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
11750; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11751; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
11752; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
11753; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11754; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
11755; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
11756; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11757; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11758; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11759; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11760; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11761; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
11762; SKIP-CACHE-INV-NEXT:    s_endpgm
11763;
11764; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
11765; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11766; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11767; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11768; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11769; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
11770; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
11771; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11772; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11773; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11774; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11775;
11776; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
11777; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11778; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11779; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11780; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11781; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
11782; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
11783; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11784; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11785; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11786; GFX90A-TGSPLIT-NEXT:    s_endpgm
11787;
11788; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
11789; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11790; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11791; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
11792; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11793; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11794; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
11795; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
11796; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11797; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11798; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11799;
11800; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
11801; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11802; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11803; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
11804; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11805; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11806; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
11807; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
11808; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11809; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11810; GFX940-TGSPLIT-NEXT:    s_endpgm
11811;
11812; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
11813; GFX11-WGP:       ; %bb.0: ; %entry
11814; GFX11-WGP-NEXT:    s_clause 0x1
11815; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
11816; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
11817; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11818; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
11819; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
11820; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11821; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11822; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11823; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11824; GFX11-WGP-NEXT:    s_endpgm
11825;
11826; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
11827; GFX11-CU:       ; %bb.0: ; %entry
11828; GFX11-CU-NEXT:    s_clause 0x1
11829; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
11830; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
11831; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11832; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
11833; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
11834; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11835; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11836; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
11837; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11838; GFX11-CU-NEXT:    s_endpgm
11839    i32* %out, i32 %in, i32 %old) {
11840entry:
  ; Address of element 4 of %out, i.e. 16 bytes past the base; the expectations
  ; above show this either as an explicit add of 16 or as "offset:16".
11841  %gep = getelementptr i32, i32* %out, i32 4
  ; cmpxchg takes (pointer, expected, replacement): %old is the comparand and
  ; %in is the value written when the comparison succeeds.
11842  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory.
11843  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write that value to the base pointer so the atomic's result is live.
11844  store i32 %val0, i32* %out, align 4
11845  ret void
11846}
11847
; Kernel under test: a volatile i32 cmpxchg through a pointer in the default
; address space, with syncscope("singlethread-one-as"), success ordering
; monotonic and failure ordering seq_cst. The value produced by the cmpxchg is
; written back through %out, so the returning form of the atomic is exercised.
;
; The per-target comment groups between the define line and the parameter list
; are the autogenerated llc expectations mentioned in the note at the top of
; this file. Regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
11848define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
11849; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
11850; GFX7:       ; %bb.0: ; %entry
11851; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11852; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
11853; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11854; GFX7-NEXT:    s_add_u32 s4, s0, 16
11855; GFX7-NEXT:    s_addc_u32 s5, s1, 0
11856; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11857; GFX7-NEXT:    v_mov_b32_e32 v2, s2
11858; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11859; GFX7-NEXT:    v_mov_b32_e32 v3, s3
11860; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11861; GFX7-NEXT:    v_mov_b32_e32 v0, s0
11862; GFX7-NEXT:    v_mov_b32_e32 v1, s1
11863; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11864; GFX7-NEXT:    flat_store_dword v[0:1], v2
11865; GFX7-NEXT:    s_endpgm
11866;
11867; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
11868; GFX10-WGP:       ; %bb.0: ; %entry
11869; GFX10-WGP-NEXT:    s_clause 0x1
11870; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11871; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11872; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11873; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
11874; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
11875; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11876; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
11877; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11878; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
11879; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11880; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
11881; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
11882; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11883; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
11884; GFX10-WGP-NEXT:    s_endpgm
11885;
11886; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
11887; GFX10-CU:       ; %bb.0: ; %entry
11888; GFX10-CU-NEXT:    s_clause 0x1
11889; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11890; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11891; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11892; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
11893; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
11894; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11895; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
11896; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11897; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
11898; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11899; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
11900; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
11901; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11902; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
11903; GFX10-CU-NEXT:    s_endpgm
11904;
11905; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
11906; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11907; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11908; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
11909; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11910; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
11911; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
11912; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11913; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
11914; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
11915; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11916; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11917; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11918; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11919; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11920; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
11921; SKIP-CACHE-INV-NEXT:    s_endpgm
11922;
11923; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
11924; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11925; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11926; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11927; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11928; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
11929; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
11930; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11931; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11932; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11933; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11934;
11935; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
11936; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11937; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11938; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11939; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11940; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
11941; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
11942; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11943; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11944; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11945; GFX90A-TGSPLIT-NEXT:    s_endpgm
11946;
11947; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
11948; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11949; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11950; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
11951; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11952; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11953; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
11954; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
11955; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11956; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11957; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11958;
11959; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
11960; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11961; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
11962; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
11963; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11964; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11965; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
11966; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
11967; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11968; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11969; GFX940-TGSPLIT-NEXT:    s_endpgm
11970;
11971; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
11972; GFX11-WGP:       ; %bb.0: ; %entry
11973; GFX11-WGP-NEXT:    s_clause 0x1
11974; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
11975; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
11976; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11977; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
11978; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
11979; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11980; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11981; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11982; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11983; GFX11-WGP-NEXT:    s_endpgm
11984;
11985; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
11986; GFX11-CU:       ; %bb.0: ; %entry
11987; GFX11-CU-NEXT:    s_clause 0x1
11988; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
11989; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
11990; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11991; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
11992; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
11993; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11994; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11995; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
11996; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11997; GFX11-CU-NEXT:    s_endpgm
11998    i32* %out, i32 %in, i32 %old) {
11999entry:
  ; Address of element 4 of %out, i.e. 16 bytes past the base; the expectations
  ; above show this either as an explicit add of 16 or as "offset:16".
12000  %gep = getelementptr i32, i32* %out, i32 4
  ; cmpxchg takes (pointer, expected, replacement): %old is the comparand and
  ; %in is the value written when the comparison succeeds.
12001  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
  ; Field 0 of the { i32, i1 } result is the value read from memory.
12002  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write that value to the base pointer so the atomic's result is live.
12003  store i32 %val0, i32* %out, align 4
12004  ret void
12005}
12006
; Kernel under test: a volatile i32 cmpxchg through a pointer in the default
; address space, with syncscope("singlethread-one-as"), success ordering
; acquire and failure ordering seq_cst. The value produced by the cmpxchg is
; written back through %out, so the returning form of the atomic is exercised.
;
; The per-target comment groups between the define line and the parameter list
; are the autogenerated llc expectations mentioned in the note at the top of
; this file. Regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
12007define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
12008; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
12009; GFX7:       ; %bb.0: ; %entry
12010; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12011; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
12012; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12013; GFX7-NEXT:    s_add_u32 s4, s0, 16
12014; GFX7-NEXT:    s_addc_u32 s5, s1, 0
12015; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12016; GFX7-NEXT:    v_mov_b32_e32 v2, s2
12017; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12018; GFX7-NEXT:    v_mov_b32_e32 v3, s3
12019; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12020; GFX7-NEXT:    v_mov_b32_e32 v0, s0
12021; GFX7-NEXT:    v_mov_b32_e32 v1, s1
12022; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12023; GFX7-NEXT:    flat_store_dword v[0:1], v2
12024; GFX7-NEXT:    s_endpgm
12025;
12026; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
12027; GFX10-WGP:       ; %bb.0: ; %entry
12028; GFX10-WGP-NEXT:    s_clause 0x1
12029; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12030; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12031; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12032; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
12033; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
12034; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12035; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
12036; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12037; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
12038; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12039; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
12040; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
12041; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12042; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
12043; GFX10-WGP-NEXT:    s_endpgm
12044;
12045; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
12046; GFX10-CU:       ; %bb.0: ; %entry
12047; GFX10-CU-NEXT:    s_clause 0x1
12048; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12049; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12050; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12051; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
12052; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
12053; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12054; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
12055; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12056; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
12057; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12058; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
12059; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
12060; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12061; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
12062; GFX10-CU-NEXT:    s_endpgm
12063;
12064; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
12065; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12066; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
12067; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
12068; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12069; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
12070; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
12071; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
12072; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
12073; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
12074; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
12075; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12076; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12077; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
12078; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12079; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
12080; SKIP-CACHE-INV-NEXT:    s_endpgm
12081;
12082; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
12083; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12084; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12085; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12086; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12087; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
12088; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
12089; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
12090; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12091; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12092; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12093;
12094; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
12095; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12096; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12097; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12098; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12099; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
12100; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
12101; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
12102; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12103; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12104; GFX90A-TGSPLIT-NEXT:    s_endpgm
12105;
12106; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
12107; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12108; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
12109; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
12110; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12111; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12112; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
12113; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
12114; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12115; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12116; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12117;
12118; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
12119; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12120; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
12121; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
12122; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12123; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12124; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
12125; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
12126; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12127; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12128; GFX940-TGSPLIT-NEXT:    s_endpgm
12129;
12130; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
12131; GFX11-WGP:       ; %bb.0: ; %entry
12132; GFX11-WGP-NEXT:    s_clause 0x1
12133; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
12134; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
12135; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12136; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
12137; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
12138; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
12139; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12140; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
12141; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
12142; GFX11-WGP-NEXT:    s_endpgm
12143;
12144; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
12145; GFX11-CU:       ; %bb.0: ; %entry
12146; GFX11-CU-NEXT:    s_clause 0x1
12147; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
12148; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
12149; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12150; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
12151; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
12152; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
12153; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12154; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
12155; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
12156; GFX11-CU-NEXT:    s_endpgm
12157    i32* %out, i32 %in, i32 %old) {
12158entry:
  ; Address of element 4 of %out, i.e. 16 bytes past the base; the expectations
  ; above show this either as an explicit add of 16 or as "offset:16".
12159  %gep = getelementptr i32, i32* %out, i32 4
  ; cmpxchg takes (pointer, expected, replacement): %old is the comparand and
  ; %in is the value written when the comparison succeeds.
12160  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
  ; Field 0 of the { i32, i1 } result is the value read from memory.
12161  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write that value to the base pointer so the atomic's result is live.
12162  store i32 %val0, i32* %out, align 4
12163  ret void
12164}
12165
; Returning cmpxchg through a flat pointer at syncscope("singlethread-one-as"),
; success ordering release, failure ordering seq_cst.
; The kernel compares the i32 at element 4 of %out (byte offset 16 in the
; expected output) with %old, swaps in %in on a match, and stores the value
; read by the cmpxchg back to %out.
; For every run line the expected output below is a single returning
; flat_atomic_cmpswap (glc, or sc0 under the GFX940 prefixes) followed by an
; s_waitcnt before the dependent store; no cache invalidate or write-back
; instruction appears in any of the check sequences.
12166define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
12167; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
12168; GFX7:       ; %bb.0: ; %entry
12169; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12170; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
12171; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12172; GFX7-NEXT:    s_add_u32 s4, s0, 16
12173; GFX7-NEXT:    s_addc_u32 s5, s1, 0
12174; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12175; GFX7-NEXT:    v_mov_b32_e32 v2, s2
12176; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12177; GFX7-NEXT:    v_mov_b32_e32 v3, s3
12178; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12179; GFX7-NEXT:    v_mov_b32_e32 v0, s0
12180; GFX7-NEXT:    v_mov_b32_e32 v1, s1
12181; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12182; GFX7-NEXT:    flat_store_dword v[0:1], v2
12183; GFX7-NEXT:    s_endpgm
12184;
12185; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
12186; GFX10-WGP:       ; %bb.0: ; %entry
12187; GFX10-WGP-NEXT:    s_clause 0x1
12188; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12189; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12190; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12191; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
12192; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
12193; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12194; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
12195; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12196; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
12197; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12198; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
12199; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
12200; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12201; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
12202; GFX10-WGP-NEXT:    s_endpgm
12203;
12204; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
12205; GFX10-CU:       ; %bb.0: ; %entry
12206; GFX10-CU-NEXT:    s_clause 0x1
12207; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12208; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12209; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12210; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
12211; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
12212; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12213; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
12214; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12215; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
12216; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12217; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
12218; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
12219; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12220; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
12221; GFX10-CU-NEXT:    s_endpgm
12222;
12223; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
12224; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12225; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
12226; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
12227; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12228; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
12229; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
12230; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
12231; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
12232; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
12233; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
12234; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12235; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12236; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
12237; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12238; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
12239; SKIP-CACHE-INV-NEXT:    s_endpgm
12240;
12241; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
12242; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12243; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12244; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12245; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12246; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
12247; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
12248; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
12249; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12250; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12251; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12252;
12253; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
12254; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12255; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12256; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12257; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12258; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
12259; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
12260; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
12261; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12262; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12263; GFX90A-TGSPLIT-NEXT:    s_endpgm
12264;
12265; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
12266; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12267; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
12268; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
12269; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12270; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12271; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
12272; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
12273; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12274; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12275; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12276;
12277; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
12278; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12279; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
12280; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
12281; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12282; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12283; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
12284; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
12285; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12286; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12287; GFX940-TGSPLIT-NEXT:    s_endpgm
12288;
12289; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
12290; GFX11-WGP:       ; %bb.0: ; %entry
12291; GFX11-WGP-NEXT:    s_clause 0x1
12292; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
12293; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
12294; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12295; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
12296; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
12297; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
12298; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12299; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
12300; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
12301; GFX11-WGP-NEXT:    s_endpgm
12302;
12303; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
12304; GFX11-CU:       ; %bb.0: ; %entry
12305; GFX11-CU-NEXT:    s_clause 0x1
12306; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
12307; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
12308; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12309; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
12310; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
12311; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
12312; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12313; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
12314; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
12315; GFX11-CU-NEXT:    s_endpgm
12316    i32* %out, i32 %in, i32 %old) {
12317entry:
  ; Address of element 4 of %out, counted in i32 elements.
12318  %gep = getelementptr i32, i32* %out, i32 4
  ; Operation under test: expected value %old, replacement %in.
12319  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
  ; Keep only field 0 (the i32) of the { i32, i1 } result; the i1 field is unused.
12320  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the returning form of the atomic live.
12321  store i32 %val0, i32* %out, align 4
12322  ret void
12323}
12324
; Returning cmpxchg through a flat pointer at syncscope("singlethread-one-as"),
; success ordering acq_rel, failure ordering seq_cst.
; The kernel compares the i32 at element 4 of %out (byte offset 16 in the
; expected output) with %old, swaps in %in on a match, and stores the value
; read by the cmpxchg back to %out.
; For every run line the expected output below is a single returning
; flat_atomic_cmpswap (glc, or sc0 under the GFX940 prefixes) followed by an
; s_waitcnt before the dependent store; no cache invalidate or write-back
; instruction appears in any of the check sequences.
12325define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
12326; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
12327; GFX7:       ; %bb.0: ; %entry
12328; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12329; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
12330; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12331; GFX7-NEXT:    s_add_u32 s4, s0, 16
12332; GFX7-NEXT:    s_addc_u32 s5, s1, 0
12333; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12334; GFX7-NEXT:    v_mov_b32_e32 v2, s2
12335; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12336; GFX7-NEXT:    v_mov_b32_e32 v3, s3
12337; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12338; GFX7-NEXT:    v_mov_b32_e32 v0, s0
12339; GFX7-NEXT:    v_mov_b32_e32 v1, s1
12340; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12341; GFX7-NEXT:    flat_store_dword v[0:1], v2
12342; GFX7-NEXT:    s_endpgm
12343;
12344; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
12345; GFX10-WGP:       ; %bb.0: ; %entry
12346; GFX10-WGP-NEXT:    s_clause 0x1
12347; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12348; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12349; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12350; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
12351; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
12352; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12353; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
12354; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12355; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
12356; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12357; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
12358; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
12359; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12360; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
12361; GFX10-WGP-NEXT:    s_endpgm
12362;
12363; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
12364; GFX10-CU:       ; %bb.0: ; %entry
12365; GFX10-CU-NEXT:    s_clause 0x1
12366; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12367; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12368; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12369; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
12370; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
12371; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12372; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
12373; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12374; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
12375; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12376; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
12377; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
12378; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12379; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
12380; GFX10-CU-NEXT:    s_endpgm
12381;
12382; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
12383; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12384; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
12385; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
12386; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12387; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
12388; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
12389; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
12390; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
12391; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
12392; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
12393; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12394; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12395; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
12396; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12397; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
12398; SKIP-CACHE-INV-NEXT:    s_endpgm
12399;
12400; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
12401; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12402; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12403; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12404; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12405; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
12406; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
12407; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
12408; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12409; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12410; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12411;
12412; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
12413; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12414; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12415; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12416; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12417; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
12418; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
12419; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
12420; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12421; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12422; GFX90A-TGSPLIT-NEXT:    s_endpgm
12423;
12424; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
12425; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12426; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
12427; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
12428; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12429; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12430; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
12431; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
12432; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12433; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12434; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12435;
12436; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
12437; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12438; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
12439; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
12440; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12441; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12442; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
12443; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
12444; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12445; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12446; GFX940-TGSPLIT-NEXT:    s_endpgm
12447;
12448; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
12449; GFX11-WGP:       ; %bb.0: ; %entry
12450; GFX11-WGP-NEXT:    s_clause 0x1
12451; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
12452; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
12453; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12454; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
12455; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
12456; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
12457; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12458; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
12459; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
12460; GFX11-WGP-NEXT:    s_endpgm
12461;
12462; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
12463; GFX11-CU:       ; %bb.0: ; %entry
12464; GFX11-CU-NEXT:    s_clause 0x1
12465; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
12466; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
12467; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12468; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
12469; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
12470; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
12471; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12472; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
12473; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
12474; GFX11-CU-NEXT:    s_endpgm
12475    i32* %out, i32 %in, i32 %old) {
12476entry:
  ; Address of element 4 of %out, counted in i32 elements.
12477  %gep = getelementptr i32, i32* %out, i32 4
  ; Operation under test: expected value %old, replacement %in.
12478  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
  ; Keep only field 0 (the i32) of the { i32, i1 } result; the i1 field is unused.
12479  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the returning form of the atomic live.
12480  store i32 %val0, i32* %out, align 4
12481  ret void
12482}
12483
; Returning cmpxchg through a flat pointer at syncscope("singlethread-one-as"),
; success ordering seq_cst, failure ordering seq_cst.
; The kernel compares the i32 at element 4 of %out (byte offset 16 in the
; expected output) with %old, swaps in %in on a match, and stores the value
; read by the cmpxchg back to %out.
; For every run line the expected output below is a single returning
; flat_atomic_cmpswap (glc, or sc0 under the GFX940 prefixes) followed by an
; s_waitcnt before the dependent store; no cache invalidate or write-back
; instruction appears in any of the check sequences.
12484define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
12485; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
12486; GFX7:       ; %bb.0: ; %entry
12487; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12488; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
12489; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12490; GFX7-NEXT:    s_add_u32 s4, s0, 16
12491; GFX7-NEXT:    s_addc_u32 s5, s1, 0
12492; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12493; GFX7-NEXT:    v_mov_b32_e32 v2, s2
12494; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12495; GFX7-NEXT:    v_mov_b32_e32 v3, s3
12496; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12497; GFX7-NEXT:    v_mov_b32_e32 v0, s0
12498; GFX7-NEXT:    v_mov_b32_e32 v1, s1
12499; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12500; GFX7-NEXT:    flat_store_dword v[0:1], v2
12501; GFX7-NEXT:    s_endpgm
12502;
12503; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
12504; GFX10-WGP:       ; %bb.0: ; %entry
12505; GFX10-WGP-NEXT:    s_clause 0x1
12506; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12507; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12508; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12509; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
12510; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
12511; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12512; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
12513; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12514; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
12515; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12516; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
12517; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
12518; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12519; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
12520; GFX10-WGP-NEXT:    s_endpgm
12521;
12522; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
12523; GFX10-CU:       ; %bb.0: ; %entry
12524; GFX10-CU-NEXT:    s_clause 0x1
12525; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12526; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12527; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12528; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
12529; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
12530; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12531; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
12532; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12533; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
12534; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12535; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
12536; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
12537; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12538; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
12539; GFX10-CU-NEXT:    s_endpgm
12540;
12541; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
12542; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12543; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
12544; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
12545; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12546; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
12547; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
12548; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
12549; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
12550; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
12551; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
12552; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12553; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12554; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
12555; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12556; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
12557; SKIP-CACHE-INV-NEXT:    s_endpgm
12558;
12559; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
12560; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12561; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12562; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12563; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12564; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
12565; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
12566; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
12567; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12568; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12569; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12570;
12571; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
12572; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12573; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12574; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12575; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12576; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
12577; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
12578; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
12579; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12580; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12581; GFX90A-TGSPLIT-NEXT:    s_endpgm
12582;
12583; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
12584; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12585; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
12586; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
12587; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12588; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12589; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
12590; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
12591; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12592; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12593; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12594;
12595; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
12596; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12597; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
12598; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
12599; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12600; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12601; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
12602; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
12603; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12604; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12605; GFX940-TGSPLIT-NEXT:    s_endpgm
12606;
12607; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
12608; GFX11-WGP:       ; %bb.0: ; %entry
12609; GFX11-WGP-NEXT:    s_clause 0x1
12610; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
12611; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
12612; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12613; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
12614; GFX11-WGP-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
12615; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
12616; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12617; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
12618; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
12619; GFX11-WGP-NEXT:    s_endpgm
12620;
12621; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
12622; GFX11-CU:       ; %bb.0: ; %entry
12623; GFX11-CU-NEXT:    s_clause 0x1
12624; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
12625; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
12626; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12627; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
12628; GFX11-CU-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
12629; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
12630; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12631; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
12632; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
12633; GFX11-CU-NEXT:    s_endpgm
12634    i32* %out, i32 %in, i32 %old) {
12635entry:
  ; Address of element 4 of %out, counted in i32 elements.
12636  %gep = getelementptr i32, i32* %out, i32 4
  ; Operation under test: expected value %old, replacement %in.
12637  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
  ; Keep only field 0 (the i32) of the { i32, i1 } result; the i1 field is unused.
12638  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the returning form of the atomic live.
12639  store i32 %val0, i32* %out, align 4
12640  ret void
12641}
12642
12643