1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s 3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s 4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s 5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s 6; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s 7; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s 8; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s 9; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s 10; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s 11; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s 12 13define amdgpu_kernel void @flat_singlethread_unordered_load( 14; GFX7-LABEL: flat_singlethread_unordered_load: 15; GFX7: ; %bb.0: ; %entry 16; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 17; GFX7-NEXT: s_waitcnt lgkmcnt(0) 18; GFX7-NEXT: v_mov_b32_e32 v0, s0 19; GFX7-NEXT: v_mov_b32_e32 v1, s1 20; GFX7-NEXT: flat_load_dword v2, v[0:1] 21; GFX7-NEXT: v_mov_b32_e32 v0, s2 22; GFX7-NEXT: v_mov_b32_e32 v1, s3 23; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 24; GFX7-NEXT: flat_store_dword v[0:1], v2 25; GFX7-NEXT: s_endpgm 26; 27; GFX10-WGP-LABEL: flat_singlethread_unordered_load: 
28; GFX10-WGP: ; %bb.0: ; %entry 29; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 30; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 31; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 32; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 33; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 34; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 35; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 36; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 37; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 38; GFX10-WGP-NEXT: s_endpgm 39; 40; GFX10-CU-LABEL: flat_singlethread_unordered_load: 41; GFX10-CU: ; %bb.0: ; %entry 42; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 43; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 44; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 45; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 46; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 47; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 48; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 49; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 50; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 51; GFX10-CU-NEXT: s_endpgm 52; 53; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_load: 54; SKIP-CACHE-INV: ; %bb.0: ; %entry 55; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 56; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 57; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 58; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 59; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 60; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 61; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 62; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 63; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 64; SKIP-CACHE-INV-NEXT: s_endpgm 65; 66; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load: 67; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 68; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 69; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 70; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 71; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 72; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 73; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 74; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 75; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 76; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 77; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 78; 79; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load: 80; GFX90A-TGSPLIT: ; %bb.0: ; %entry 81; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 82; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 83; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 84; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 85; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 86; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 87; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 88; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 89; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 90; GFX90A-TGSPLIT-NEXT: s_endpgm 91; 92; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load: 93; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 94; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 95; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 96; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 97; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 98; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 99; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 100; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 101; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 102; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 103; GFX940-NOTTGSPLIT-NEXT: s_endpgm 104; 105; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_load: 106; GFX940-TGSPLIT: ; %bb.0: ; %entry 107; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 108; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 109; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 110; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 111; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 112; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 113; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 114; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 115; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 116; GFX940-TGSPLIT-NEXT: s_endpgm 117; 118; 
GFX11-WGP-LABEL: flat_singlethread_unordered_load: 119; GFX11-WGP: ; %bb.0: ; %entry 120; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 121; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 122; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 123; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 124; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 125; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 126; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 127; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 128; GFX11-WGP-NEXT: s_endpgm 129; 130; GFX11-CU-LABEL: flat_singlethread_unordered_load: 131; GFX11-CU: ; %bb.0: ; %entry 132; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 133; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 134; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 135; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 136; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 137; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 138; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 139; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 140; GFX11-CU-NEXT: s_endpgm 141 i32* %in, i32* %out) { 142entry: 143 %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4 144 store i32 %val, i32* %out 145 ret void 146} 147 148define amdgpu_kernel void @flat_singlethread_monotonic_load( 149; GFX7-LABEL: flat_singlethread_monotonic_load: 150; GFX7: ; %bb.0: ; %entry 151; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 152; GFX7-NEXT: s_waitcnt lgkmcnt(0) 153; GFX7-NEXT: v_mov_b32_e32 v0, s0 154; GFX7-NEXT: v_mov_b32_e32 v1, s1 155; GFX7-NEXT: flat_load_dword v2, v[0:1] 156; GFX7-NEXT: v_mov_b32_e32 v0, s2 157; GFX7-NEXT: v_mov_b32_e32 v1, s3 158; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 159; GFX7-NEXT: flat_store_dword v[0:1], v2 160; GFX7-NEXT: s_endpgm 161; 162; GFX10-WGP-LABEL: flat_singlethread_monotonic_load: 163; GFX10-WGP: ; %bb.0: ; %entry 164; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 165; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 
166; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 167; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 168; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 169; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 170; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 171; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 172; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 173; GFX10-WGP-NEXT: s_endpgm 174; 175; GFX10-CU-LABEL: flat_singlethread_monotonic_load: 176; GFX10-CU: ; %bb.0: ; %entry 177; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 178; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 179; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 180; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 181; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 182; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 183; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 184; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 185; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 186; GFX10-CU-NEXT: s_endpgm 187; 188; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_load: 189; SKIP-CACHE-INV: ; %bb.0: ; %entry 190; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 191; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 192; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 193; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 194; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 195; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 196; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 197; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 198; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 199; SKIP-CACHE-INV-NEXT: s_endpgm 200; 201; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load: 202; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 203; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 204; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 205; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 206; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 207; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 208; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 209; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 210; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 211; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 212; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 213; 214; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load: 215; GFX90A-TGSPLIT: ; %bb.0: ; %entry 216; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 217; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 218; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 219; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 220; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 221; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 222; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 223; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 224; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 225; GFX90A-TGSPLIT-NEXT: s_endpgm 226; 227; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load: 228; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 229; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 230; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 231; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 232; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 233; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 234; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 235; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 236; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 237; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 238; GFX940-NOTTGSPLIT-NEXT: s_endpgm 239; 240; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_load: 241; GFX940-TGSPLIT: ; %bb.0: ; %entry 242; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 243; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 244; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 245; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 246; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 247; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 248; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 249; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 250; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 251; GFX940-TGSPLIT-NEXT: s_endpgm 252; 253; GFX11-WGP-LABEL: flat_singlethread_monotonic_load: 254; 
GFX11-WGP: ; %bb.0: ; %entry 255; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 256; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 257; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 258; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 259; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 260; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 261; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 262; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 263; GFX11-WGP-NEXT: s_endpgm 264; 265; GFX11-CU-LABEL: flat_singlethread_monotonic_load: 266; GFX11-CU: ; %bb.0: ; %entry 267; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 268; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 269; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 270; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 271; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 272; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 273; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 274; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 275; GFX11-CU-NEXT: s_endpgm 276 i32* %in, i32* %out) { 277entry: 278 %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4 279 store i32 %val, i32* %out 280 ret void 281} 282 283define amdgpu_kernel void @flat_singlethread_acquire_load( 284; GFX7-LABEL: flat_singlethread_acquire_load: 285; GFX7: ; %bb.0: ; %entry 286; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 287; GFX7-NEXT: s_waitcnt lgkmcnt(0) 288; GFX7-NEXT: v_mov_b32_e32 v0, s0 289; GFX7-NEXT: v_mov_b32_e32 v1, s1 290; GFX7-NEXT: flat_load_dword v2, v[0:1] 291; GFX7-NEXT: v_mov_b32_e32 v0, s2 292; GFX7-NEXT: v_mov_b32_e32 v1, s3 293; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 294; GFX7-NEXT: flat_store_dword v[0:1], v2 295; GFX7-NEXT: s_endpgm 296; 297; GFX10-WGP-LABEL: flat_singlethread_acquire_load: 298; GFX10-WGP: ; %bb.0: ; %entry 299; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 300; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 301; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 302; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s1 303; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 304; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 305; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 306; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 307; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 308; GFX10-WGP-NEXT: s_endpgm 309; 310; GFX10-CU-LABEL: flat_singlethread_acquire_load: 311; GFX10-CU: ; %bb.0: ; %entry 312; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 313; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 314; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 315; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 316; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 317; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 318; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 319; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 320; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 321; GFX10-CU-NEXT: s_endpgm 322; 323; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_load: 324; SKIP-CACHE-INV: ; %bb.0: ; %entry 325; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 326; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 327; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 328; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 329; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 330; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 331; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 332; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 333; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 334; SKIP-CACHE-INV-NEXT: s_endpgm 335; 336; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load: 337; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 338; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 339; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 340; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 341; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 342; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 343; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 344; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 345; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 346; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 
347; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 348; 349; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load: 350; GFX90A-TGSPLIT: ; %bb.0: ; %entry 351; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 352; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 353; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 354; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 355; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 356; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 357; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 358; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 359; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 360; GFX90A-TGSPLIT-NEXT: s_endpgm 361; 362; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load: 363; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 364; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 365; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 366; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 367; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 368; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 369; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 370; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 371; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 372; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 373; GFX940-NOTTGSPLIT-NEXT: s_endpgm 374; 375; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_load: 376; GFX940-TGSPLIT: ; %bb.0: ; %entry 377; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 378; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 379; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 380; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 381; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 382; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 383; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 384; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 385; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 386; GFX940-TGSPLIT-NEXT: s_endpgm 387; 388; GFX11-WGP-LABEL: flat_singlethread_acquire_load: 389; GFX11-WGP: ; %bb.0: ; %entry 390; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 391; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 392; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 393; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 394; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 395; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 396; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 397; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 398; GFX11-WGP-NEXT: s_endpgm 399; 400; GFX11-CU-LABEL: flat_singlethread_acquire_load: 401; GFX11-CU: ; %bb.0: ; %entry 402; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 403; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 404; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 405; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 406; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 407; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 408; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 409; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 410; GFX11-CU-NEXT: s_endpgm 411 i32* %in, i32* %out) { 412entry: 413 %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4 414 store i32 %val, i32* %out 415 ret void 416} 417 418define amdgpu_kernel void @flat_singlethread_seq_cst_load( 419; GFX7-LABEL: flat_singlethread_seq_cst_load: 420; GFX7: ; %bb.0: ; %entry 421; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 422; GFX7-NEXT: s_waitcnt lgkmcnt(0) 423; GFX7-NEXT: v_mov_b32_e32 v0, s0 424; GFX7-NEXT: v_mov_b32_e32 v1, s1 425; GFX7-NEXT: flat_load_dword v2, v[0:1] 426; GFX7-NEXT: v_mov_b32_e32 v0, s2 427; GFX7-NEXT: v_mov_b32_e32 v1, s3 428; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 429; GFX7-NEXT: flat_store_dword v[0:1], v2 430; GFX7-NEXT: s_endpgm 431; 432; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load: 433; GFX10-WGP: ; %bb.0: ; %entry 434; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 435; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 436; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 437; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 438; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 439; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s2 440; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 441; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 442; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 443; GFX10-WGP-NEXT: s_endpgm 444; 445; GFX10-CU-LABEL: flat_singlethread_seq_cst_load: 446; GFX10-CU: ; %bb.0: ; %entry 447; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 448; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 449; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 450; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 451; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 452; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 453; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 454; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 455; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 456; GFX10-CU-NEXT: s_endpgm 457; 458; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_load: 459; SKIP-CACHE-INV: ; %bb.0: ; %entry 460; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 461; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 462; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 463; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 464; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 465; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 466; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 467; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 468; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 469; SKIP-CACHE-INV-NEXT: s_endpgm 470; 471; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load: 472; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 473; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 474; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 475; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 476; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 477; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 478; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 479; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 480; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 481; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 482; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 483; 484; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_seq_cst_load: 485; GFX90A-TGSPLIT: ; %bb.0: ; %entry 486; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 487; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 488; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 489; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 490; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 491; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 492; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 493; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 494; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 495; GFX90A-TGSPLIT-NEXT: s_endpgm 496; 497; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load: 498; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 499; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 500; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 501; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 502; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 503; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 504; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 505; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 506; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 507; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 508; GFX940-NOTTGSPLIT-NEXT: s_endpgm 509; 510; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_load: 511; GFX940-TGSPLIT: ; %bb.0: ; %entry 512; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 513; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 514; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 515; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 516; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 517; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 518; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 519; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 520; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 521; GFX940-TGSPLIT-NEXT: s_endpgm 522; 523; GFX11-WGP-LABEL: flat_singlethread_seq_cst_load: 524; GFX11-WGP: ; %bb.0: ; %entry 525; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 526; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 527; GFX11-WGP-NEXT: v_dual_mov_b32 
v0, s0 :: v_dual_mov_b32 v1, s1 528; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 529; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 530; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 531; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 532; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 533; GFX11-WGP-NEXT: s_endpgm 534; 535; GFX11-CU-LABEL: flat_singlethread_seq_cst_load: 536; GFX11-CU: ; %bb.0: ; %entry 537; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 538; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 539; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 540; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 541; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 542; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 543; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 544; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 545; GFX11-CU-NEXT: s_endpgm 546 i32* %in, i32* %out) { 547entry: 548 %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4 549 store i32 %val, i32* %out 550 ret void 551} 552 553define amdgpu_kernel void @flat_singlethread_unordered_store( 554; GFX7-LABEL: flat_singlethread_unordered_store: 555; GFX7: ; %bb.0: ; %entry 556; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 557; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 558; GFX7-NEXT: s_waitcnt lgkmcnt(0) 559; GFX7-NEXT: v_mov_b32_e32 v0, s0 560; GFX7-NEXT: v_mov_b32_e32 v1, s1 561; GFX7-NEXT: v_mov_b32_e32 v2, s2 562; GFX7-NEXT: flat_store_dword v[0:1], v2 563; GFX7-NEXT: s_endpgm 564; 565; GFX10-WGP-LABEL: flat_singlethread_unordered_store: 566; GFX10-WGP: ; %bb.0: ; %entry 567; GFX10-WGP-NEXT: s_clause 0x1 568; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 569; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 570; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 571; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 572; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 573; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 574; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 575; GFX10-WGP-NEXT: s_endpgm 576; 577; 
GFX10-CU-LABEL: flat_singlethread_unordered_store: 578; GFX10-CU: ; %bb.0: ; %entry 579; GFX10-CU-NEXT: s_clause 0x1 580; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 581; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 582; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 583; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 584; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 585; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 586; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 587; GFX10-CU-NEXT: s_endpgm 588; 589; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_store: 590; SKIP-CACHE-INV: ; %bb.0: ; %entry 591; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 592; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0 593; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 594; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 595; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 596; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 597; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 598; SKIP-CACHE-INV-NEXT: s_endpgm 599; 600; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store: 601; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 602; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 603; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 604; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 605; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 606; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 607; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 608; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 609; 610; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store: 611; GFX90A-TGSPLIT: ; %bb.0: ; %entry 612; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 613; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 614; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 615; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 616; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 617; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 618; GFX90A-TGSPLIT-NEXT: s_endpgm 619; 620; GFX940-NOTTGSPLIT-LABEL: 
flat_singlethread_unordered_store: 621; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 622; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 623; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 624; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 625; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 626; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 627; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 628; GFX940-NOTTGSPLIT-NEXT: s_endpgm 629; 630; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_store: 631; GFX940-TGSPLIT: ; %bb.0: ; %entry 632; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 633; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 634; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 635; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 636; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 637; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 638; GFX940-TGSPLIT-NEXT: s_endpgm 639; 640; GFX11-WGP-LABEL: flat_singlethread_unordered_store: 641; GFX11-WGP: ; %bb.0: ; %entry 642; GFX11-WGP-NEXT: s_clause 0x1 643; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 644; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 645; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 646; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 647; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 648; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 649; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 650; GFX11-WGP-NEXT: s_endpgm 651; 652; GFX11-CU-LABEL: flat_singlethread_unordered_store: 653; GFX11-CU: ; %bb.0: ; %entry 654; GFX11-CU-NEXT: s_clause 0x1 655; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 656; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 657; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 658; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 659; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 660; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 661; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 662; GFX11-CU-NEXT: s_endpgm 663 i32 %in, i32* %out) { 664entry: 665 store atomic i32 %in, 
i32* %out syncscope("singlethread") unordered, align 4 666 ret void 667} 668 669define amdgpu_kernel void @flat_singlethread_monotonic_store( 670; GFX7-LABEL: flat_singlethread_monotonic_store: 671; GFX7: ; %bb.0: ; %entry 672; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 673; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 674; GFX7-NEXT: s_waitcnt lgkmcnt(0) 675; GFX7-NEXT: v_mov_b32_e32 v0, s0 676; GFX7-NEXT: v_mov_b32_e32 v1, s1 677; GFX7-NEXT: v_mov_b32_e32 v2, s2 678; GFX7-NEXT: flat_store_dword v[0:1], v2 679; GFX7-NEXT: s_endpgm 680; 681; GFX10-WGP-LABEL: flat_singlethread_monotonic_store: 682; GFX10-WGP: ; %bb.0: ; %entry 683; GFX10-WGP-NEXT: s_clause 0x1 684; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 685; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 686; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 687; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 688; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 689; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 690; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 691; GFX10-WGP-NEXT: s_endpgm 692; 693; GFX10-CU-LABEL: flat_singlethread_monotonic_store: 694; GFX10-CU: ; %bb.0: ; %entry 695; GFX10-CU-NEXT: s_clause 0x1 696; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 697; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 698; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 699; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 700; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 701; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 702; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 703; GFX10-CU-NEXT: s_endpgm 704; 705; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_store: 706; SKIP-CACHE-INV: ; %bb.0: ; %entry 707; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 708; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0 709; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 710; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 711; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 712; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 713; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 714; SKIP-CACHE-INV-NEXT: s_endpgm 715; 716; 
GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store: 717; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 718; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 719; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 720; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 721; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 722; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 723; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 724; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 725; 726; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store: 727; GFX90A-TGSPLIT: ; %bb.0: ; %entry 728; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 729; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 730; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 731; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 732; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 733; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 734; GFX90A-TGSPLIT-NEXT: s_endpgm 735; 736; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store: 737; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 738; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 739; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 740; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 741; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 742; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 743; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 744; GFX940-NOTTGSPLIT-NEXT: s_endpgm 745; 746; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_store: 747; GFX940-TGSPLIT: ; %bb.0: ; %entry 748; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 749; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 750; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 751; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 752; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 753; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 754; GFX940-TGSPLIT-NEXT: s_endpgm 755; 756; GFX11-WGP-LABEL: flat_singlethread_monotonic_store: 757; 
GFX11-WGP: ; %bb.0: ; %entry 758; GFX11-WGP-NEXT: s_clause 0x1 759; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 760; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 761; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 762; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 763; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 764; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 765; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 766; GFX11-WGP-NEXT: s_endpgm 767; 768; GFX11-CU-LABEL: flat_singlethread_monotonic_store: 769; GFX11-CU: ; %bb.0: ; %entry 770; GFX11-CU-NEXT: s_clause 0x1 771; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 772; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 773; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 774; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 775; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 776; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 777; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 778; GFX11-CU-NEXT: s_endpgm 779 i32 %in, i32* %out) { 780entry: 781 store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4 782 ret void 783} 784 785define amdgpu_kernel void @flat_singlethread_release_store( 786; GFX7-LABEL: flat_singlethread_release_store: 787; GFX7: ; %bb.0: ; %entry 788; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 789; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 790; GFX7-NEXT: s_waitcnt lgkmcnt(0) 791; GFX7-NEXT: v_mov_b32_e32 v0, s0 792; GFX7-NEXT: v_mov_b32_e32 v1, s1 793; GFX7-NEXT: v_mov_b32_e32 v2, s2 794; GFX7-NEXT: flat_store_dword v[0:1], v2 795; GFX7-NEXT: s_endpgm 796; 797; GFX10-WGP-LABEL: flat_singlethread_release_store: 798; GFX10-WGP: ; %bb.0: ; %entry 799; GFX10-WGP-NEXT: s_clause 0x1 800; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 801; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 802; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 803; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 804; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 805; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 806; GFX10-WGP-NEXT: flat_store_dword v[0:1], 
v2 807; GFX10-WGP-NEXT: s_endpgm 808; 809; GFX10-CU-LABEL: flat_singlethread_release_store: 810; GFX10-CU: ; %bb.0: ; %entry 811; GFX10-CU-NEXT: s_clause 0x1 812; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 813; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 814; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 815; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 816; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 817; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 818; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 819; GFX10-CU-NEXT: s_endpgm 820; 821; SKIP-CACHE-INV-LABEL: flat_singlethread_release_store: 822; SKIP-CACHE-INV: ; %bb.0: ; %entry 823; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 824; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0 825; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 826; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 827; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 828; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 829; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 830; SKIP-CACHE-INV-NEXT: s_endpgm 831; 832; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store: 833; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 834; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 835; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 836; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 837; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 838; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 839; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 840; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 841; 842; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store: 843; GFX90A-TGSPLIT: ; %bb.0: ; %entry 844; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 845; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 846; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 847; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 848; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 849; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 850; GFX90A-TGSPLIT-NEXT: s_endpgm 851; 852; 
GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_store: 853; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 854; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 855; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 856; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 857; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 858; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 859; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 860; GFX940-NOTTGSPLIT-NEXT: s_endpgm 861; 862; GFX940-TGSPLIT-LABEL: flat_singlethread_release_store: 863; GFX940-TGSPLIT: ; %bb.0: ; %entry 864; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 865; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 866; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 867; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 868; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 869; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 870; GFX940-TGSPLIT-NEXT: s_endpgm 871; 872; GFX11-WGP-LABEL: flat_singlethread_release_store: 873; GFX11-WGP: ; %bb.0: ; %entry 874; GFX11-WGP-NEXT: s_clause 0x1 875; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 876; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 877; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 878; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 879; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 880; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 881; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 882; GFX11-WGP-NEXT: s_endpgm 883; 884; GFX11-CU-LABEL: flat_singlethread_release_store: 885; GFX11-CU: ; %bb.0: ; %entry 886; GFX11-CU-NEXT: s_clause 0x1 887; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 888; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 889; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 890; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 891; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 892; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 893; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 894; GFX11-CU-NEXT: s_endpgm 895 i32 %in, i32* %out) { 896entry: 897 store 
atomic i32 %in, i32* %out syncscope("singlethread") release, align 4 898 ret void 899} 900 901define amdgpu_kernel void @flat_singlethread_seq_cst_store( 902; GFX7-LABEL: flat_singlethread_seq_cst_store: 903; GFX7: ; %bb.0: ; %entry 904; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 905; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 906; GFX7-NEXT: s_waitcnt lgkmcnt(0) 907; GFX7-NEXT: v_mov_b32_e32 v0, s0 908; GFX7-NEXT: v_mov_b32_e32 v1, s1 909; GFX7-NEXT: v_mov_b32_e32 v2, s2 910; GFX7-NEXT: flat_store_dword v[0:1], v2 911; GFX7-NEXT: s_endpgm 912; 913; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store: 914; GFX10-WGP: ; %bb.0: ; %entry 915; GFX10-WGP-NEXT: s_clause 0x1 916; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 917; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 918; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 919; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 920; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 921; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 922; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 923; GFX10-WGP-NEXT: s_endpgm 924; 925; GFX10-CU-LABEL: flat_singlethread_seq_cst_store: 926; GFX10-CU: ; %bb.0: ; %entry 927; GFX10-CU-NEXT: s_clause 0x1 928; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 929; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 930; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 931; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 932; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 933; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 934; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 935; GFX10-CU-NEXT: s_endpgm 936; 937; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_store: 938; SKIP-CACHE-INV: ; %bb.0: ; %entry 939; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 940; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0 941; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 942; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 943; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 944; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 945; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 946; SKIP-CACHE-INV-NEXT: s_endpgm 947; 
948; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store: 949; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 950; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 951; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 952; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 953; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 954; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 955; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 956; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 957; 958; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store: 959; GFX90A-TGSPLIT: ; %bb.0: ; %entry 960; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 961; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 962; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 963; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 964; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 965; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 966; GFX90A-TGSPLIT-NEXT: s_endpgm 967; 968; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store: 969; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 970; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 971; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 972; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 973; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 974; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 975; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 976; GFX940-NOTTGSPLIT-NEXT: s_endpgm 977; 978; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_store: 979; GFX940-TGSPLIT: ; %bb.0: ; %entry 980; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 981; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 982; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 983; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 984; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 985; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 986; GFX940-TGSPLIT-NEXT: s_endpgm 987; 988; GFX11-WGP-LABEL: flat_singlethread_seq_cst_store: 989; GFX11-WGP: ; 
%bb.0: ; %entry 990; GFX11-WGP-NEXT: s_clause 0x1 991; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 992; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 993; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 994; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 995; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 996; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 997; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 998; GFX11-WGP-NEXT: s_endpgm 999; 1000; GFX11-CU-LABEL: flat_singlethread_seq_cst_store: 1001; GFX11-CU: ; %bb.0: ; %entry 1002; GFX11-CU-NEXT: s_clause 0x1 1003; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 1004; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 1005; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1006; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1007; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1008; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 1009; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1010; GFX11-CU-NEXT: s_endpgm 1011 i32 %in, i32* %out) { 1012entry: 1013 store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4 1014 ret void 1015} 1016 1017define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( 1018; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw: 1019; GFX7: ; %bb.0: ; %entry 1020; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1021; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1022; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1023; GFX7-NEXT: v_mov_b32_e32 v0, s0 1024; GFX7-NEXT: v_mov_b32_e32 v1, s1 1025; GFX7-NEXT: v_mov_b32_e32 v2, s2 1026; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1027; GFX7-NEXT: s_endpgm 1028; 1029; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: 1030; GFX10-WGP: ; %bb.0: ; %entry 1031; GFX10-WGP-NEXT: s_clause 0x1 1032; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1033; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1034; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1035; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1036; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1037; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1038; 
GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1039; GFX10-WGP-NEXT: s_endpgm 1040; 1041; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw: 1042; GFX10-CU: ; %bb.0: ; %entry 1043; GFX10-CU-NEXT: s_clause 0x1 1044; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1045; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1046; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1047; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1048; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1049; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1050; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1051; GFX10-CU-NEXT: s_endpgm 1052; 1053; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_atomicrmw: 1054; SKIP-CACHE-INV: ; %bb.0: ; %entry 1055; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1056; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 1057; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1058; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1059; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1060; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1061; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1062; SKIP-CACHE-INV-NEXT: s_endpgm 1063; 1064; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: 1065; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1066; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1067; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1068; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1069; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1070; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1071; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1072; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1073; 1074; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: 1075; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1076; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1077; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1078; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1079; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1080; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 
s2 1081; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1082; GFX90A-TGSPLIT-NEXT: s_endpgm 1083; 1084; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: 1085; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1086; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1087; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 1088; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1089; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1090; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1091; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1092; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1093; 1094; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: 1095; GFX940-TGSPLIT: ; %bb.0: ; %entry 1096; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1097; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 1098; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1099; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1100; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1101; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1102; GFX940-TGSPLIT-NEXT: s_endpgm 1103; 1104; GFX11-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: 1105; GFX11-WGP: ; %bb.0: ; %entry 1106; GFX11-WGP-NEXT: s_clause 0x1 1107; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1108; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 1109; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1110; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1111; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1112; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1113; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1114; GFX11-WGP-NEXT: s_endpgm 1115; 1116; GFX11-CU-LABEL: flat_singlethread_monotonic_atomicrmw: 1117; GFX11-CU: ; %bb.0: ; %entry 1118; GFX11-CU-NEXT: s_clause 0x1 1119; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1120; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 1121; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1122; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1123; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 
1124; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1125; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1126; GFX11-CU-NEXT: s_endpgm 1127 i32* %out, i32 %in) { 1128entry: 1129 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") monotonic 1130 ret void 1131} 1132 1133define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( 1134; GFX7-LABEL: flat_singlethread_acquire_atomicrmw: 1135; GFX7: ; %bb.0: ; %entry 1136; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1137; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1138; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1139; GFX7-NEXT: v_mov_b32_e32 v0, s0 1140; GFX7-NEXT: v_mov_b32_e32 v1, s1 1141; GFX7-NEXT: v_mov_b32_e32 v2, s2 1142; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1143; GFX7-NEXT: s_endpgm 1144; 1145; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw: 1146; GFX10-WGP: ; %bb.0: ; %entry 1147; GFX10-WGP-NEXT: s_clause 0x1 1148; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1149; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1150; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1151; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1152; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1153; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1154; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1155; GFX10-WGP-NEXT: s_endpgm 1156; 1157; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw: 1158; GFX10-CU: ; %bb.0: ; %entry 1159; GFX10-CU-NEXT: s_clause 0x1 1160; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1161; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1162; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1163; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1164; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1165; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1166; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1167; GFX10-CU-NEXT: s_endpgm 1168; 1169; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_atomicrmw: 1170; SKIP-CACHE-INV: ; %bb.0: ; %entry 1171; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1172; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 1173; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1174; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1175; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1176; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1177; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1178; SKIP-CACHE-INV-NEXT: s_endpgm 1179; 1180; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: 1181; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1182; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1183; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1184; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1185; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1186; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1187; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1188; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1189; 1190; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: 1191; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1192; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1193; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1194; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1195; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1196; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1197; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1198; GFX90A-TGSPLIT-NEXT: s_endpgm 1199; 1200; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: 1201; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1202; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1203; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 1204; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1205; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1206; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1207; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1208; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1209; 1210; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: 1211; GFX940-TGSPLIT: ; %bb.0: ; %entry 1212; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1213; GFX940-TGSPLIT-NEXT: s_load_dword 
s4, s[0:1], 0x8 1214; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1215; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1216; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1217; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1218; GFX940-TGSPLIT-NEXT: s_endpgm 1219; 1220; GFX11-WGP-LABEL: flat_singlethread_acquire_atomicrmw: 1221; GFX11-WGP: ; %bb.0: ; %entry 1222; GFX11-WGP-NEXT: s_clause 0x1 1223; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1224; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 1225; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1226; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1227; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1228; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1229; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1230; GFX11-WGP-NEXT: s_endpgm 1231; 1232; GFX11-CU-LABEL: flat_singlethread_acquire_atomicrmw: 1233; GFX11-CU: ; %bb.0: ; %entry 1234; GFX11-CU-NEXT: s_clause 0x1 1235; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1236; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 1237; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1238; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1239; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1240; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1241; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1242; GFX11-CU-NEXT: s_endpgm 1243 i32* %out, i32 %in) { 1244entry: 1245 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire 1246 ret void 1247} 1248 1249define amdgpu_kernel void @flat_singlethread_release_atomicrmw( 1250; GFX7-LABEL: flat_singlethread_release_atomicrmw: 1251; GFX7: ; %bb.0: ; %entry 1252; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1253; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1254; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1255; GFX7-NEXT: v_mov_b32_e32 v0, s0 1256; GFX7-NEXT: v_mov_b32_e32 v1, s1 1257; GFX7-NEXT: v_mov_b32_e32 v2, s2 1258; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1259; GFX7-NEXT: s_endpgm 1260; 1261; GFX10-WGP-LABEL: 
flat_singlethread_release_atomicrmw: 1262; GFX10-WGP: ; %bb.0: ; %entry 1263; GFX10-WGP-NEXT: s_clause 0x1 1264; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1265; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1266; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1267; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1268; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1269; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1270; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1271; GFX10-WGP-NEXT: s_endpgm 1272; 1273; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw: 1274; GFX10-CU: ; %bb.0: ; %entry 1275; GFX10-CU-NEXT: s_clause 0x1 1276; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1277; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1278; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1279; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1280; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1281; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1282; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1283; GFX10-CU-NEXT: s_endpgm 1284; 1285; SKIP-CACHE-INV-LABEL: flat_singlethread_release_atomicrmw: 1286; SKIP-CACHE-INV: ; %bb.0: ; %entry 1287; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1288; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 1289; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1290; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1291; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1292; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1293; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1294; SKIP-CACHE-INV-NEXT: s_endpgm 1295; 1296; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw: 1297; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1298; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1299; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1300; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1301; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1302; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1303; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1304; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1305; 1306; 
GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw: 1307; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1308; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1309; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1310; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1311; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1312; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1313; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1314; GFX90A-TGSPLIT-NEXT: s_endpgm 1315; 1316; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw: 1317; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1318; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1319; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 1320; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1321; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1322; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1323; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1324; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1325; 1326; GFX940-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw: 1327; GFX940-TGSPLIT: ; %bb.0: ; %entry 1328; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1329; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 1330; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1331; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1332; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1333; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1334; GFX940-TGSPLIT-NEXT: s_endpgm 1335; 1336; GFX11-WGP-LABEL: flat_singlethread_release_atomicrmw: 1337; GFX11-WGP: ; %bb.0: ; %entry 1338; GFX11-WGP-NEXT: s_clause 0x1 1339; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1340; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 1341; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1342; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1343; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1344; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1345; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1346; GFX11-WGP-NEXT: s_endpgm 1347; 1348; 
GFX11-CU-LABEL: flat_singlethread_release_atomicrmw: 1349; GFX11-CU: ; %bb.0: ; %entry 1350; GFX11-CU-NEXT: s_clause 0x1 1351; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1352; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 1353; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1354; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1355; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1356; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1357; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1358; GFX11-CU-NEXT: s_endpgm 1359 i32* %out, i32 %in) { 1360entry: 1361 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") release 1362 ret void 1363} 1364 1365define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( 1366; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw: 1367; GFX7: ; %bb.0: ; %entry 1368; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1369; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1370; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1371; GFX7-NEXT: v_mov_b32_e32 v0, s0 1372; GFX7-NEXT: v_mov_b32_e32 v1, s1 1373; GFX7-NEXT: v_mov_b32_e32 v2, s2 1374; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1375; GFX7-NEXT: s_endpgm 1376; 1377; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: 1378; GFX10-WGP: ; %bb.0: ; %entry 1379; GFX10-WGP-NEXT: s_clause 0x1 1380; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1381; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1382; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1383; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1384; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1385; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1386; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1387; GFX10-WGP-NEXT: s_endpgm 1388; 1389; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: 1390; GFX10-CU: ; %bb.0: ; %entry 1391; GFX10-CU-NEXT: s_clause 0x1 1392; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1393; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1394; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1395; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1396; GFX10-CU-NEXT: v_mov_b32_e32 v1, 
s1 1397; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1398; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1399; GFX10-CU-NEXT: s_endpgm 1400; 1401; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_atomicrmw: 1402; SKIP-CACHE-INV: ; %bb.0: ; %entry 1403; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1404; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 1405; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1406; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1407; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1408; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1409; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1410; SKIP-CACHE-INV-NEXT: s_endpgm 1411; 1412; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: 1413; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1414; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1415; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1416; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1417; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1418; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1419; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1420; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1421; 1422; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: 1423; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1424; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1425; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1426; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1427; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1428; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1429; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1430; GFX90A-TGSPLIT-NEXT: s_endpgm 1431; 1432; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: 1433; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1434; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1435; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 1436; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1437; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 
1438; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1439; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1440; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1441; 1442; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: 1443; GFX940-TGSPLIT: ; %bb.0: ; %entry 1444; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1445; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 1446; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1447; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1448; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1449; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1450; GFX940-TGSPLIT-NEXT: s_endpgm 1451; 1452; GFX11-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: 1453; GFX11-WGP: ; %bb.0: ; %entry 1454; GFX11-WGP-NEXT: s_clause 0x1 1455; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1456; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 1457; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1458; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1459; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1460; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1461; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1462; GFX11-WGP-NEXT: s_endpgm 1463; 1464; GFX11-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: 1465; GFX11-CU: ; %bb.0: ; %entry 1466; GFX11-CU-NEXT: s_clause 0x1 1467; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1468; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 1469; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1470; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1471; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1472; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1473; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1474; GFX11-CU-NEXT: s_endpgm 1475 i32* %out, i32 %in) { 1476entry: 1477 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel 1478 ret void 1479} 1480 1481define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( 1482; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw: 1483; GFX7: ; %bb.0: ; %entry 1484; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1485; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1486; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1487; GFX7-NEXT: v_mov_b32_e32 v0, s0 1488; GFX7-NEXT: v_mov_b32_e32 v1, s1 1489; GFX7-NEXT: v_mov_b32_e32 v2, s2 1490; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1491; GFX7-NEXT: s_endpgm 1492; 1493; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: 1494; GFX10-WGP: ; %bb.0: ; %entry 1495; GFX10-WGP-NEXT: s_clause 0x1 1496; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1497; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1498; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1499; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1500; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1501; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1502; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1503; GFX10-WGP-NEXT: s_endpgm 1504; 1505; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: 1506; GFX10-CU: ; %bb.0: ; %entry 1507; GFX10-CU-NEXT: s_clause 0x1 1508; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1509; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1510; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1511; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1512; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1513; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1514; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1515; GFX10-CU-NEXT: s_endpgm 1516; 1517; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_atomicrmw: 1518; SKIP-CACHE-INV: ; %bb.0: ; %entry 1519; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1520; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 1521; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1522; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1523; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1524; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1525; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1526; SKIP-CACHE-INV-NEXT: s_endpgm 1527; 1528; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: 1529; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1530; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1531; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1532; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1533; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1534; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1535; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1536; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1537; 1538; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: 1539; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1540; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1541; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1542; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1543; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1544; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1545; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1546; GFX90A-TGSPLIT-NEXT: s_endpgm 1547; 1548; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: 1549; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1550; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1551; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 1552; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1553; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1554; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1555; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1556; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1557; 1558; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: 1559; GFX940-TGSPLIT: ; %bb.0: ; %entry 1560; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1561; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 1562; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1563; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1564; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1565; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1566; GFX940-TGSPLIT-NEXT: s_endpgm 1567; 1568; GFX11-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: 1569; GFX11-WGP: ; %bb.0: ; %entry 1570; GFX11-WGP-NEXT: s_clause 0x1 1571; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1572; 
GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 1573; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1574; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1575; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1576; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1577; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1578; GFX11-WGP-NEXT: s_endpgm 1579; 1580; GFX11-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: 1581; GFX11-CU: ; %bb.0: ; %entry 1582; GFX11-CU-NEXT: s_clause 0x1 1583; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1584; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 1585; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1586; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1587; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1588; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1589; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1590; GFX11-CU-NEXT: s_endpgm 1591 i32* %out, i32 %in) { 1592entry: 1593 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst 1594 ret void 1595} 1596 1597define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( 1598; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1599; GFX7: ; %bb.0: ; %entry 1600; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1601; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1602; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1603; GFX7-NEXT: v_mov_b32_e32 v0, s0 1604; GFX7-NEXT: v_mov_b32_e32 v1, s1 1605; GFX7-NEXT: v_mov_b32_e32 v2, s2 1606; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1607; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1608; GFX7-NEXT: flat_store_dword v[0:1], v2 1609; GFX7-NEXT: s_endpgm 1610; 1611; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1612; GFX10-WGP: ; %bb.0: ; %entry 1613; GFX10-WGP-NEXT: s_clause 0x1 1614; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1615; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1616; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1617; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1618; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1619; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1620; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1621; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1622; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1623; GFX10-WGP-NEXT: s_endpgm 1624; 1625; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1626; GFX10-CU: ; %bb.0: ; %entry 1627; GFX10-CU-NEXT: s_clause 0x1 1628; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1629; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1630; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1631; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1632; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1633; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1634; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1635; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1636; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1637; GFX10-CU-NEXT: s_endpgm 1638; 1639; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1640; SKIP-CACHE-INV: ; %bb.0: ; %entry 1641; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1642; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 1643; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1644; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1645; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1646; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1647; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1648; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1649; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1650; SKIP-CACHE-INV-NEXT: s_endpgm 1651; 1652; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1653; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1654; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1655; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1656; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1657; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1658; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1659; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1660; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 1661; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1662; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1663; 1664; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1665; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1666; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1667; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1668; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1669; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1670; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1671; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1672; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1673; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1674; GFX90A-TGSPLIT-NEXT: s_endpgm 1675; 1676; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1677; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1678; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1679; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 1680; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1681; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1682; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1683; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 1684; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1685; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1686; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1687; 1688; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1689; GFX940-TGSPLIT: ; %bb.0: ; %entry 1690; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1691; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 1692; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1693; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1694; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1695; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 1696; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1697; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1698; GFX940-TGSPLIT-NEXT: s_endpgm 1699; 1700; GFX11-WGP-LABEL: 
flat_singlethread_acquire_ret_atomicrmw: 1701; GFX11-WGP: ; %bb.0: ; %entry 1702; GFX11-WGP-NEXT: s_clause 0x1 1703; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1704; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 1705; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1706; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1707; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1708; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 1709; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1710; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 1711; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1712; GFX11-WGP-NEXT: s_endpgm 1713; 1714; GFX11-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1715; GFX11-CU: ; %bb.0: ; %entry 1716; GFX11-CU-NEXT: s_clause 0x1 1717; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1718; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 1719; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1720; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1721; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1722; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 1723; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1724; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 1725; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1726; GFX11-CU-NEXT: s_endpgm 1727 i32* %out, i32 %in) { 1728entry: 1729 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire 1730 store i32 %val, i32* %out, align 4 1731 ret void 1732} 1733 1734define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( 1735; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1736; GFX7: ; %bb.0: ; %entry 1737; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1738; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1739; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1740; GFX7-NEXT: v_mov_b32_e32 v0, s0 1741; GFX7-NEXT: v_mov_b32_e32 v1, s1 1742; GFX7-NEXT: v_mov_b32_e32 v2, s2 1743; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1744; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1745; GFX7-NEXT: flat_store_dword 
v[0:1], v2 1746; GFX7-NEXT: s_endpgm 1747; 1748; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1749; GFX10-WGP: ; %bb.0: ; %entry 1750; GFX10-WGP-NEXT: s_clause 0x1 1751; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1752; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1753; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1754; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1755; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1756; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1757; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1758; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1759; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1760; GFX10-WGP-NEXT: s_endpgm 1761; 1762; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1763; GFX10-CU: ; %bb.0: ; %entry 1764; GFX10-CU-NEXT: s_clause 0x1 1765; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1766; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1767; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1768; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1769; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1770; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1771; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1772; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1773; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1774; GFX10-CU-NEXT: s_endpgm 1775; 1776; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1777; SKIP-CACHE-INV: ; %bb.0: ; %entry 1778; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1779; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 1780; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1781; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1782; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1783; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1784; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1785; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1786; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1787; SKIP-CACHE-INV-NEXT: s_endpgm 1788; 1789; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1790; GFX90A-NOTTGSPLIT: ; %bb.0: ; 
%entry 1791; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1792; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1793; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1794; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1795; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1796; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1797; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1798; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1799; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1800; 1801; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1802; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1803; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1804; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1805; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1806; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1807; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1808; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1809; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1810; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1811; GFX90A-TGSPLIT-NEXT: s_endpgm 1812; 1813; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1814; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1815; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1816; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 1817; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1818; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1819; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1820; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 1821; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1822; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1823; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1824; 1825; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1826; GFX940-TGSPLIT: ; %bb.0: ; %entry 1827; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1828; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 
0x8 1829; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1830; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1831; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1832; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 1833; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1834; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1835; GFX940-TGSPLIT-NEXT: s_endpgm 1836; 1837; GFX11-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1838; GFX11-WGP: ; %bb.0: ; %entry 1839; GFX11-WGP-NEXT: s_clause 0x1 1840; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1841; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 1842; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1843; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1844; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1845; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 1846; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1847; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 1848; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1849; GFX11-WGP-NEXT: s_endpgm 1850; 1851; GFX11-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1852; GFX11-CU: ; %bb.0: ; %entry 1853; GFX11-CU-NEXT: s_clause 0x1 1854; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1855; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 1856; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1857; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1858; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1859; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 1860; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1861; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 1862; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1863; GFX11-CU-NEXT: s_endpgm 1864 i32* %out, i32 %in) { 1865entry: 1866 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel 1867 store i32 %val, i32* %out, align 4 1868 ret void 1869} 1870 1871define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( 1872; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1873; GFX7: ; %bb.0: ; %entry 1874; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1875; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1876; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1877; GFX7-NEXT: v_mov_b32_e32 v0, s0 1878; GFX7-NEXT: v_mov_b32_e32 v1, s1 1879; GFX7-NEXT: v_mov_b32_e32 v2, s2 1880; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1881; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1882; GFX7-NEXT: flat_store_dword v[0:1], v2 1883; GFX7-NEXT: s_endpgm 1884; 1885; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1886; GFX10-WGP: ; %bb.0: ; %entry 1887; GFX10-WGP-NEXT: s_clause 0x1 1888; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1889; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1890; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1891; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1892; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1893; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1894; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1895; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1896; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1897; GFX10-WGP-NEXT: s_endpgm 1898; 1899; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1900; GFX10-CU: ; %bb.0: ; %entry 1901; GFX10-CU-NEXT: s_clause 0x1 1902; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1903; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1904; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1905; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1906; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1907; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1908; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1909; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1910; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1911; GFX10-CU-NEXT: s_endpgm 1912; 1913; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1914; SKIP-CACHE-INV: ; %bb.0: ; %entry 1915; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1916; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 1917; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1918; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1919; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 
s3 1920; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1921; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1922; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1923; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1924; SKIP-CACHE-INV-NEXT: s_endpgm 1925; 1926; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1927; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1928; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1929; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1930; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1931; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1932; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1933; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1934; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1935; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1936; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1937; 1938; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1939; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1940; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1941; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1942; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1943; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1944; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1945; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1946; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1947; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1948; GFX90A-TGSPLIT-NEXT: s_endpgm 1949; 1950; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1951; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1952; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1953; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 1954; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1955; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1956; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1957; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 1958; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1959; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1960; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1961; 1962; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1963; GFX940-TGSPLIT: ; %bb.0: ; %entry 1964; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1965; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 1966; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1967; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1968; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1969; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 1970; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1971; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1972; GFX940-TGSPLIT-NEXT: s_endpgm 1973; 1974; GFX11-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1975; GFX11-WGP: ; %bb.0: ; %entry 1976; GFX11-WGP-NEXT: s_clause 0x1 1977; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1978; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 1979; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1980; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1981; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1982; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 1983; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1984; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 1985; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1986; GFX11-WGP-NEXT: s_endpgm 1987; 1988; GFX11-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1989; GFX11-CU: ; %bb.0: ; %entry 1990; GFX11-CU-NEXT: s_clause 0x1 1991; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 1992; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 1993; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1994; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1995; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1996; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 1997; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1998; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 1999; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2000; 
GFX11-CU-NEXT: s_endpgm 2001 i32* %out, i32 %in) { 2002entry: 2003 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst 2004 store i32 %val, i32* %out, align 4 2005 ret void 2006} 2007 2008define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( 2009; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 2010; GFX7: ; %bb.0: ; %entry 2011; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2012; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2013; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2014; GFX7-NEXT: s_add_u32 s0, s0, 16 2015; GFX7-NEXT: s_addc_u32 s1, s1, 0 2016; GFX7-NEXT: v_mov_b32_e32 v0, s0 2017; GFX7-NEXT: v_mov_b32_e32 v2, s2 2018; GFX7-NEXT: v_mov_b32_e32 v1, s1 2019; GFX7-NEXT: v_mov_b32_e32 v3, s3 2020; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2021; GFX7-NEXT: s_endpgm 2022; 2023; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 2024; GFX10-WGP: ; %bb.0: ; %entry 2025; GFX10-WGP-NEXT: s_clause 0x1 2026; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2027; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2028; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2029; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2030; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2031; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2032; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2033; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2034; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2035; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2036; GFX10-WGP-NEXT: s_endpgm 2037; 2038; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 2039; GFX10-CU: ; %bb.0: ; %entry 2040; GFX10-CU-NEXT: s_clause 0x1 2041; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2042; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2043; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2044; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2045; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2046; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2047; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2048; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2049; 
GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2050; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2051; GFX10-CU-NEXT: s_endpgm 2052; 2053; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 2054; SKIP-CACHE-INV: ; %bb.0: ; %entry 2055; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2056; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 2057; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2058; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2059; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2060; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2061; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2062; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2063; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2064; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2065; SKIP-CACHE-INV-NEXT: s_endpgm 2066; 2067; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 2068; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2069; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2070; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2071; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2072; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2073; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2074; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2075; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2076; 2077; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 2078; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2079; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2080; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2081; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2082; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2083; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2084; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2085; GFX90A-TGSPLIT-NEXT: s_endpgm 2086; 2087; GFX940-NOTTGSPLIT-LABEL: 
flat_singlethread_monotonic_monotonic_cmpxchg: 2088; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 2089; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2090; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2091; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2092; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2093; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2094; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2095; GFX940-NOTTGSPLIT-NEXT: s_endpgm 2096; 2097; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 2098; GFX940-TGSPLIT: ; %bb.0: ; %entry 2099; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2100; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2101; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2102; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2103; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2104; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2105; GFX940-TGSPLIT-NEXT: s_endpgm 2106; 2107; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 2108; GFX11-WGP: ; %bb.0: ; %entry 2109; GFX11-WGP-NEXT: s_clause 0x1 2110; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 2111; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 2112; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 2113; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 2114; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 2115; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2116; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2117; GFX11-WGP-NEXT: s_endpgm 2118; 2119; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 2120; GFX11-CU: ; %bb.0: ; %entry 2121; GFX11-CU-NEXT: s_clause 0x1 2122; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 2123; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 2124; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2125; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 2126; GFX11-CU-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 2127; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2128; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2129; GFX11-CU-NEXT: s_endpgm 2130 i32* %out, i32 %in, i32 %old) { 2131entry: 2132 %gep = getelementptr i32, i32* %out, i32 4 2133 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic 2134 ret void 2135} 2136 2137define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( 2138; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 2139; GFX7: ; %bb.0: ; %entry 2140; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2141; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2142; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2143; GFX7-NEXT: s_add_u32 s0, s0, 16 2144; GFX7-NEXT: s_addc_u32 s1, s1, 0 2145; GFX7-NEXT: v_mov_b32_e32 v0, s0 2146; GFX7-NEXT: v_mov_b32_e32 v2, s2 2147; GFX7-NEXT: v_mov_b32_e32 v1, s1 2148; GFX7-NEXT: v_mov_b32_e32 v3, s3 2149; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2150; GFX7-NEXT: s_endpgm 2151; 2152; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 2153; GFX10-WGP: ; %bb.0: ; %entry 2154; GFX10-WGP-NEXT: s_clause 0x1 2155; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2156; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2157; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2158; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2159; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2160; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2161; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2162; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2163; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2164; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2165; GFX10-WGP-NEXT: s_endpgm 2166; 2167; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 2168; GFX10-CU: ; %bb.0: ; %entry 2169; GFX10-CU-NEXT: s_clause 0x1 2170; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2171; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2172; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2173; 
GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2174; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2175; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2176; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2177; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2178; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2179; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2180; GFX10-CU-NEXT: s_endpgm 2181; 2182; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 2183; SKIP-CACHE-INV: ; %bb.0: ; %entry 2184; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2185; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 2186; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2187; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2188; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2189; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2190; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2191; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2192; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2193; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2194; SKIP-CACHE-INV-NEXT: s_endpgm 2195; 2196; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 2197; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2198; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2199; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2200; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2201; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2202; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2203; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2204; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2205; 2206; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 2207; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2208; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2209; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2210; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2211; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2212; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2213; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2214; GFX90A-TGSPLIT-NEXT: s_endpgm 2215; 2216; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 2217; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 2218; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2219; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2220; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2221; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2222; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2223; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2224; GFX940-NOTTGSPLIT-NEXT: s_endpgm 2225; 2226; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 2227; GFX940-TGSPLIT: ; %bb.0: ; %entry 2228; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2229; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2230; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2231; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2232; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2233; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2234; GFX940-TGSPLIT-NEXT: s_endpgm 2235; 2236; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 2237; GFX11-WGP: ; %bb.0: ; %entry 2238; GFX11-WGP-NEXT: s_clause 0x1 2239; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 2240; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 2241; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 2242; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 2243; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 2244; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2245; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2246; GFX11-WGP-NEXT: s_endpgm 2247; 2248; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 2249; GFX11-CU: ; %bb.0: ; %entry 2250; GFX11-CU-NEXT: s_clause 0x1 2251; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 2252; GFX11-CU-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x8 2253; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2254; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 2255; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 2256; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2257; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2258; GFX11-CU-NEXT: s_endpgm 2259 i32* %out, i32 %in, i32 %old) { 2260entry: 2261 %gep = getelementptr i32, i32* %out, i32 4 2262 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic 2263 ret void 2264} 2265 ; Test intent - volatile i32 cmpxchg on element 4 of %out at syncscope("singlethread") with success ordering release and failure ordering monotonic. In every expected sequence below the only wait is the s_waitcnt lgkmcnt(0) after the scalar loads, and the flat_atomic_cmpswap is followed directly by program end (plus s_sendmsg on the gfx11 runs). 2266define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( 2267; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2268; GFX7: ; %bb.0: ; %entry 2269; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2270; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2271; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2272; GFX7-NEXT: s_add_u32 s0, s0, 16 2273; GFX7-NEXT: s_addc_u32 s1, s1, 0 2274; GFX7-NEXT: v_mov_b32_e32 v0, s0 2275; GFX7-NEXT: v_mov_b32_e32 v2, s2 2276; GFX7-NEXT: v_mov_b32_e32 v1, s1 2277; GFX7-NEXT: v_mov_b32_e32 v3, s3 2278; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2279; GFX7-NEXT: s_endpgm 2280; 2281; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2282; GFX10-WGP: ; %bb.0: ; %entry 2283; GFX10-WGP-NEXT: s_clause 0x1 2284; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2285; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2286; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2287; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2288; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2289; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2290; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2291; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2292; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2293; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2294; GFX10-WGP-NEXT: s_endpgm 2295; 2296; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2297; GFX10-CU: ; %bb.0: ; %entry 2298; GFX10-CU-NEXT: s_clause 0x1 
2299; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2300; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2301; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2302; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2303; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2304; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2305; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2306; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2307; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2308; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2309; GFX10-CU-NEXT: s_endpgm 2310; 2311; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2312; SKIP-CACHE-INV: ; %bb.0: ; %entry 2313; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2314; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 2315; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2316; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2317; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2318; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2319; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2320; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2321; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2322; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2323; SKIP-CACHE-INV-NEXT: s_endpgm 2324; 2325; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2326; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2327; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2328; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2329; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2330; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2331; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2332; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2333; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2334; 2335; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2336; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2337; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2338; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2339; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2340; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2341; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2342; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2343; GFX90A-TGSPLIT-NEXT: s_endpgm 2344; 2345; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2346; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 2347; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2348; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2349; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2350; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2351; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2352; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2353; GFX940-NOTTGSPLIT-NEXT: s_endpgm 2354; 2355; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2356; GFX940-TGSPLIT: ; %bb.0: ; %entry 2357; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2358; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2359; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2360; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2361; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2362; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2363; GFX940-TGSPLIT-NEXT: s_endpgm 2364; 2365; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2366; GFX11-WGP: ; %bb.0: ; %entry 2367; GFX11-WGP-NEXT: s_clause 0x1 2368; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 2369; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 2370; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 2371; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 2372; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 2373; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2374; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2375; GFX11-WGP-NEXT: s_endpgm 2376; 2377; GFX11-CU-LABEL: 
flat_singlethread_release_monotonic_cmpxchg: 2378; GFX11-CU: ; %bb.0: ; %entry 2379; GFX11-CU-NEXT: s_clause 0x1 2380; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 2381; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 2382; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2383; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 2384; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 2385; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2386; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2387; GFX11-CU-NEXT: s_endpgm 2388 i32* %out, i32 %in, i32 %old) { 2389entry: 2390 %gep = getelementptr i32, i32* %out, i32 4 2391 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic 2392 ret void 2393} 2394 ; Test intent - volatile i32 cmpxchg on element 4 of %out at syncscope("singlethread") with success ordering acq_rel and failure ordering monotonic. In every expected sequence below the only wait is the s_waitcnt lgkmcnt(0) after the scalar loads, and the flat_atomic_cmpswap is followed directly by program end (plus s_sendmsg on the gfx11 runs). 2395define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( 2396; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 2397; GFX7: ; %bb.0: ; %entry 2398; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2399; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2400; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2401; GFX7-NEXT: s_add_u32 s0, s0, 16 2402; GFX7-NEXT: s_addc_u32 s1, s1, 0 2403; GFX7-NEXT: v_mov_b32_e32 v0, s0 2404; GFX7-NEXT: v_mov_b32_e32 v2, s2 2405; GFX7-NEXT: v_mov_b32_e32 v1, s1 2406; GFX7-NEXT: v_mov_b32_e32 v3, s3 2407; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2408; GFX7-NEXT: s_endpgm 2409; 2410; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 2411; GFX10-WGP: ; %bb.0: ; %entry 2412; GFX10-WGP-NEXT: s_clause 0x1 2413; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2414; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2415; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2416; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2417; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2418; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2419; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2420; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2421; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2422; GFX10-WGP-NEXT: flat_atomic_cmpswap 
v[0:1], v[2:3] 2423; GFX10-WGP-NEXT: s_endpgm 2424; 2425; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 2426; GFX10-CU: ; %bb.0: ; %entry 2427; GFX10-CU-NEXT: s_clause 0x1 2428; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2429; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2430; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2431; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2432; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2433; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2434; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2435; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2436; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2437; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2438; GFX10-CU-NEXT: s_endpgm 2439; 2440; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 2441; SKIP-CACHE-INV: ; %bb.0: ; %entry 2442; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2443; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 2444; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2445; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2446; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2447; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2448; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2449; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2450; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2451; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2452; SKIP-CACHE-INV-NEXT: s_endpgm 2453; 2454; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 2455; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2456; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2457; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2458; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2459; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2460; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2461; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2462; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2463; 2464; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_acq_rel_monotonic_cmpxchg: 2465; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2466; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2467; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2468; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2469; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2470; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2471; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2472; GFX90A-TGSPLIT-NEXT: s_endpgm 2473; 2474; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 2475; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 2476; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2477; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2478; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2479; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2480; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2481; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2482; GFX940-NOTTGSPLIT-NEXT: s_endpgm 2483; 2484; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 2485; GFX940-TGSPLIT: ; %bb.0: ; %entry 2486; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2487; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2488; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2489; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2490; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2491; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2492; GFX940-TGSPLIT-NEXT: s_endpgm 2493; 2494; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 2495; GFX11-WGP: ; %bb.0: ; %entry 2496; GFX11-WGP-NEXT: s_clause 0x1 2497; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 2498; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 2499; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 2500; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 2501; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 
2502; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2503; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2504; GFX11-WGP-NEXT: s_endpgm 2505; 2506; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 2507; GFX11-CU: ; %bb.0: ; %entry 2508; GFX11-CU-NEXT: s_clause 0x1 2509; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 2510; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 2511; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2512; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 2513; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 2514; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2515; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2516; GFX11-CU-NEXT: s_endpgm 2517 i32* %out, i32 %in, i32 %old) { 2518entry: 2519 %gep = getelementptr i32, i32* %out, i32 4 2520 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic 2521 ret void 2522} 2523 ; Test intent - volatile i32 cmpxchg on element 4 of %out at syncscope("singlethread") with success ordering seq_cst and failure ordering monotonic. In every expected sequence below the only wait is the s_waitcnt lgkmcnt(0) after the scalar loads, and the flat_atomic_cmpswap is followed directly by program end (plus s_sendmsg on the gfx11 runs). 2524define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( 2525; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 2526; GFX7: ; %bb.0: ; %entry 2527; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2528; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2529; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2530; GFX7-NEXT: s_add_u32 s0, s0, 16 2531; GFX7-NEXT: s_addc_u32 s1, s1, 0 2532; GFX7-NEXT: v_mov_b32_e32 v0, s0 2533; GFX7-NEXT: v_mov_b32_e32 v2, s2 2534; GFX7-NEXT: v_mov_b32_e32 v1, s1 2535; GFX7-NEXT: v_mov_b32_e32 v3, s3 2536; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2537; GFX7-NEXT: s_endpgm 2538; 2539; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 2540; GFX10-WGP: ; %bb.0: ; %entry 2541; GFX10-WGP-NEXT: s_clause 0x1 2542; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2543; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2544; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2545; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2546; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2547; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s0 2548; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2549; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2550; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2551; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2552; GFX10-WGP-NEXT: s_endpgm 2553; 2554; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 2555; GFX10-CU: ; %bb.0: ; %entry 2556; GFX10-CU-NEXT: s_clause 0x1 2557; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2558; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2559; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2560; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2561; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2562; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2563; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2564; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2565; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2566; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2567; GFX10-CU-NEXT: s_endpgm 2568; 2569; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 2570; SKIP-CACHE-INV: ; %bb.0: ; %entry 2571; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2572; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 2573; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2574; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2575; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2576; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2577; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2578; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2579; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2580; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2581; SKIP-CACHE-INV-NEXT: s_endpgm 2582; 2583; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 2584; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2585; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2586; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2587; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2588; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2589; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] 
op_sel:[0,1] 2590; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2591; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2592; 2593; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 2594; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2595; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2596; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2597; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2598; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2599; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2600; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2601; GFX90A-TGSPLIT-NEXT: s_endpgm 2602; 2603; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 2604; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 2605; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2606; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2607; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2608; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2609; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2610; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2611; GFX940-NOTTGSPLIT-NEXT: s_endpgm 2612; 2613; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 2614; GFX940-TGSPLIT: ; %bb.0: ; %entry 2615; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2616; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2617; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2618; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2619; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2620; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2621; GFX940-TGSPLIT-NEXT: s_endpgm 2622; 2623; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 2624; GFX11-WGP: ; %bb.0: ; %entry 2625; GFX11-WGP-NEXT: s_clause 0x1 2626; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 2627; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 2628; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) 2629; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 2630; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 2631; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2632; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2633; GFX11-WGP-NEXT: s_endpgm 2634; 2635; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 2636; GFX11-CU: ; %bb.0: ; %entry 2637; GFX11-CU-NEXT: s_clause 0x1 2638; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 2639; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 2640; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2641; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 2642; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 2643; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2644; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2645; GFX11-CU-NEXT: s_endpgm 2646 i32* %out, i32 %in, i32 %old) { 2647entry: 2648 %gep = getelementptr i32, i32* %out, i32 4 2649 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic 2650 ret void 2651} 2652 ; Test intent - volatile i32 cmpxchg on element 4 of %out at syncscope("singlethread") with success ordering monotonic and failure ordering acquire. In every expected sequence below the only wait is the s_waitcnt lgkmcnt(0) after the scalar loads, and the flat_atomic_cmpswap is followed directly by program end (plus s_sendmsg on the gfx11 runs). 2653define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( 2654; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 2655; GFX7: ; %bb.0: ; %entry 2656; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2657; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2658; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2659; GFX7-NEXT: s_add_u32 s0, s0, 16 2660; GFX7-NEXT: s_addc_u32 s1, s1, 0 2661; GFX7-NEXT: v_mov_b32_e32 v0, s0 2662; GFX7-NEXT: v_mov_b32_e32 v2, s2 2663; GFX7-NEXT: v_mov_b32_e32 v1, s1 2664; GFX7-NEXT: v_mov_b32_e32 v3, s3 2665; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2666; GFX7-NEXT: s_endpgm 2667; 2668; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 2669; GFX10-WGP: ; %bb.0: ; %entry 2670; GFX10-WGP-NEXT: s_clause 0x1 2671; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2672; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 
2673; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2674; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2675; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2676; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2677; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2678; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2679; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2680; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2681; GFX10-WGP-NEXT: s_endpgm 2682; 2683; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 2684; GFX10-CU: ; %bb.0: ; %entry 2685; GFX10-CU-NEXT: s_clause 0x1 2686; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2687; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2688; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2689; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2690; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2691; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2692; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2693; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2694; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2695; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2696; GFX10-CU-NEXT: s_endpgm 2697; 2698; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 2699; SKIP-CACHE-INV: ; %bb.0: ; %entry 2700; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2701; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 2702; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2703; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2704; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2705; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2706; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2707; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2708; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2709; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2710; SKIP-CACHE-INV-NEXT: s_endpgm 2711; 2712; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 2713; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2714; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2715; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2716; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) 2717; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2718; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2719; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2720; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2721; 2722; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 2723; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2724; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2725; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2726; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2727; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2728; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2729; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2730; GFX90A-TGSPLIT-NEXT: s_endpgm 2731; 2732; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 2733; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 2734; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2735; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2736; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2737; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2738; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2739; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2740; GFX940-NOTTGSPLIT-NEXT: s_endpgm 2741; 2742; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 2743; GFX940-TGSPLIT: ; %bb.0: ; %entry 2744; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2745; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2746; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2747; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2748; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2749; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2750; GFX940-TGSPLIT-NEXT: s_endpgm 2751; 2752; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 2753; GFX11-WGP: ; %bb.0: ; %entry 2754; 
GFX11-WGP-NEXT: s_clause 0x1 2755; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 2756; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 2757; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 2758; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 2759; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 2760; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2761; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2762; GFX11-WGP-NEXT: s_endpgm 2763; 2764; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 2765; GFX11-CU: ; %bb.0: ; %entry 2766; GFX11-CU-NEXT: s_clause 0x1 2767; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 2768; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 2769; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2770; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 2771; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 2772; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2773; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2774; GFX11-CU-NEXT: s_endpgm 2775 i32* %out, i32 %in, i32 %old) { 2776entry: 2777 %gep = getelementptr i32, i32* %out, i32 4 2778 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire 2779 ret void 2780} 2781 ; Test intent - volatile i32 cmpxchg on element 4 of %out at syncscope("singlethread") with success ordering acquire and failure ordering acquire. In every expected sequence below the only wait is the s_waitcnt lgkmcnt(0) after the scalar loads, and the flat_atomic_cmpswap is followed directly by program end (plus s_sendmsg on the gfx11 runs). 2782define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( 2783; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 2784; GFX7: ; %bb.0: ; %entry 2785; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2786; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2787; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2788; GFX7-NEXT: s_add_u32 s0, s0, 16 2789; GFX7-NEXT: s_addc_u32 s1, s1, 0 2790; GFX7-NEXT: v_mov_b32_e32 v0, s0 2791; GFX7-NEXT: v_mov_b32_e32 v2, s2 2792; GFX7-NEXT: v_mov_b32_e32 v1, s1 2793; GFX7-NEXT: v_mov_b32_e32 v3, s3 2794; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2795; GFX7-NEXT: s_endpgm 2796; 2797; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 2798; GFX10-WGP: ; %bb.0: ; 
%entry 2799; GFX10-WGP-NEXT: s_clause 0x1 2800; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2801; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2802; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2803; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2804; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2805; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2806; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2807; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2808; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2809; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2810; GFX10-WGP-NEXT: s_endpgm 2811; 2812; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 2813; GFX10-CU: ; %bb.0: ; %entry 2814; GFX10-CU-NEXT: s_clause 0x1 2815; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2816; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2817; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2818; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2819; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2820; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2821; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2822; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2823; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2824; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2825; GFX10-CU-NEXT: s_endpgm 2826; 2827; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 2828; SKIP-CACHE-INV: ; %bb.0: ; %entry 2829; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2830; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 2831; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2832; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2833; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2834; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2835; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2836; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2837; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2838; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2839; SKIP-CACHE-INV-NEXT: s_endpgm 2840; 2841; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 2842; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2843; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2844; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2845; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2846; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2847; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2848; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2849; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2850; 2851; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 2852; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2853; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2854; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2855; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2856; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2857; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2858; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2859; GFX90A-TGSPLIT-NEXT: s_endpgm 2860; 2861; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 2862; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 2863; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2864; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2865; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2866; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2867; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2868; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2869; GFX940-NOTTGSPLIT-NEXT: s_endpgm 2870; 2871; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 2872; GFX940-TGSPLIT: ; %bb.0: ; %entry 2873; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2874; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2875; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2876; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2877; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2878; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
offset:16 2879; GFX940-TGSPLIT-NEXT: s_endpgm 2880; 2881; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 2882; GFX11-WGP: ; %bb.0: ; %entry 2883; GFX11-WGP-NEXT: s_clause 0x1 2884; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 2885; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 2886; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 2887; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 2888; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 2889; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2890; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2891; GFX11-WGP-NEXT: s_endpgm 2892; 2893; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 2894; GFX11-CU: ; %bb.0: ; %entry 2895; GFX11-CU-NEXT: s_clause 0x1 2896; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 2897; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 2898; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2899; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 2900; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 2901; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2902; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2903; GFX11-CU-NEXT: s_endpgm 2904 i32* %out, i32 %in, i32 %old) { 2905entry: 2906 %gep = getelementptr i32, i32* %out, i32 4 2907 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire 2908 ret void 2909} 2910 ; Test intent - volatile i32 cmpxchg on element 4 of %out at syncscope("singlethread") with success ordering release and failure ordering acquire. In every expected sequence below the only wait is the s_waitcnt lgkmcnt(0) after the scalar loads, and the flat_atomic_cmpswap is followed directly by program end (plus s_sendmsg on the gfx11 runs). 2911define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( 2912; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg: 2913; GFX7: ; %bb.0: ; %entry 2914; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2915; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2916; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2917; GFX7-NEXT: s_add_u32 s0, s0, 16 2918; GFX7-NEXT: s_addc_u32 s1, s1, 0 2919; GFX7-NEXT: v_mov_b32_e32 v0, s0 2920; GFX7-NEXT: v_mov_b32_e32 v2, s2 2921; GFX7-NEXT: v_mov_b32_e32 v1, s1 2922; GFX7-NEXT: v_mov_b32_e32 v3, s3 2923; GFX7-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] 2924; GFX7-NEXT: s_endpgm 2925; 2926; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: 2927; GFX10-WGP: ; %bb.0: ; %entry 2928; GFX10-WGP-NEXT: s_clause 0x1 2929; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2930; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2931; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2932; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2933; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2934; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2935; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2936; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2937; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2938; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2939; GFX10-WGP-NEXT: s_endpgm 2940; 2941; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: 2942; GFX10-CU: ; %bb.0: ; %entry 2943; GFX10-CU-NEXT: s_clause 0x1 2944; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2945; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2946; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2947; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2948; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2949; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2950; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2951; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2952; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2953; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2954; GFX10-CU-NEXT: s_endpgm 2955; 2956; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_cmpxchg: 2957; SKIP-CACHE-INV: ; %bb.0: ; %entry 2958; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2959; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 2960; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2961; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2962; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2963; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2964; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2965; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2966; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2967; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2968; 
SKIP-CACHE-INV-NEXT: s_endpgm 2969; 2970; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: 2971; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2972; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2973; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2974; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2975; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2976; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2977; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2978; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2979; 2980; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: 2981; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2982; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2983; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2984; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2985; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2986; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2987; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2988; GFX90A-TGSPLIT-NEXT: s_endpgm 2989; 2990; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: 2991; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 2992; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 2993; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 2994; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2995; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2996; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 2997; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2998; GFX940-NOTTGSPLIT-NEXT: s_endpgm 2999; 3000; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: 3001; GFX940-TGSPLIT: ; %bb.0: ; %entry 3002; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3003; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 3004; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3005; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 3006; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 3007; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3008; GFX940-TGSPLIT-NEXT: s_endpgm 3009; 3010; GFX11-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: 3011; GFX11-WGP: ; %bb.0: ; %entry 3012; GFX11-WGP-NEXT: s_clause 0x1 3013; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 3014; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 3015; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3016; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 3017; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 3018; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3019; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3020; GFX11-WGP-NEXT: s_endpgm 3021; 3022; GFX11-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: 3023; GFX11-CU: ; %bb.0: ; %entry 3024; GFX11-CU-NEXT: s_clause 0x1 3025; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 3026; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 3027; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3028; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 3029; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 3030; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3031; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3032; GFX11-CU-NEXT: s_endpgm 3033 i32* %out, i32 %in, i32 %old) { 3034entry: 3035 %gep = getelementptr i32, i32* %out, i32 4 3036 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire 3037 ret void 3038} 3039 3040define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( 3041; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 3042; GFX7: ; %bb.0: ; %entry 3043; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3044; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3045; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3046; GFX7-NEXT: s_add_u32 s0, s0, 16 3047; GFX7-NEXT: s_addc_u32 s1, s1, 0 3048; 
GFX7-NEXT: v_mov_b32_e32 v0, s0 3049; GFX7-NEXT: v_mov_b32_e32 v2, s2 3050; GFX7-NEXT: v_mov_b32_e32 v1, s1 3051; GFX7-NEXT: v_mov_b32_e32 v3, s3 3052; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3053; GFX7-NEXT: s_endpgm 3054; 3055; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 3056; GFX10-WGP: ; %bb.0: ; %entry 3057; GFX10-WGP-NEXT: s_clause 0x1 3058; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3059; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3060; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3061; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 3062; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 3063; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3064; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3065; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3066; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3067; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3068; GFX10-WGP-NEXT: s_endpgm 3069; 3070; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 3071; GFX10-CU: ; %bb.0: ; %entry 3072; GFX10-CU-NEXT: s_clause 0x1 3073; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3074; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3075; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3076; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 3077; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 3078; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3079; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3080; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3081; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3082; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3083; GFX10-CU-NEXT: s_endpgm 3084; 3085; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 3086; SKIP-CACHE-INV: ; %bb.0: ; %entry 3087; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3088; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 3089; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3090; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 3091; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 3092; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3093; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3094; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3095; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3096; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3097; SKIP-CACHE-INV-NEXT: s_endpgm 3098; 3099; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 3100; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3101; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3102; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3103; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3104; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3105; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3106; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3107; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3108; 3109; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 3110; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3111; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3112; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3113; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3114; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3115; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3116; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3117; GFX90A-TGSPLIT-NEXT: s_endpgm 3118; 3119; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 3120; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 3121; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3122; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 3123; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3124; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 3125; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 3126; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3127; GFX940-NOTTGSPLIT-NEXT: s_endpgm 3128; 3129; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 3130; GFX940-TGSPLIT: ; %bb.0: ; %entry 3131; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 3132; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 3133; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3134; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 3135; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 3136; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3137; GFX940-TGSPLIT-NEXT: s_endpgm 3138; 3139; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 3140; GFX11-WGP: ; %bb.0: ; %entry 3141; GFX11-WGP-NEXT: s_clause 0x1 3142; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 3143; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 3144; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3145; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 3146; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 3147; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3148; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3149; GFX11-WGP-NEXT: s_endpgm 3150; 3151; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 3152; GFX11-CU: ; %bb.0: ; %entry 3153; GFX11-CU-NEXT: s_clause 0x1 3154; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 3155; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 3156; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3157; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 3158; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 3159; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3160; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3161; GFX11-CU-NEXT: s_endpgm 3162 i32* %out, i32 %in, i32 %old) { 3163entry: 3164 %gep = getelementptr i32, i32* %out, i32 4 3165 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire 3166 ret void 3167} 3168 3169define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( 3170; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 3171; GFX7: ; %bb.0: ; %entry 3172; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3173; GFX7-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x2 3174; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3175; GFX7-NEXT: s_add_u32 s0, s0, 16 3176; GFX7-NEXT: s_addc_u32 s1, s1, 0 3177; GFX7-NEXT: v_mov_b32_e32 v0, s0 3178; GFX7-NEXT: v_mov_b32_e32 v2, s2 3179; GFX7-NEXT: v_mov_b32_e32 v1, s1 3180; GFX7-NEXT: v_mov_b32_e32 v3, s3 3181; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3182; GFX7-NEXT: s_endpgm 3183; 3184; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 3185; GFX10-WGP: ; %bb.0: ; %entry 3186; GFX10-WGP-NEXT: s_clause 0x1 3187; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3188; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3189; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3190; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 3191; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 3192; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3193; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3194; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3195; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3196; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3197; GFX10-WGP-NEXT: s_endpgm 3198; 3199; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 3200; GFX10-CU: ; %bb.0: ; %entry 3201; GFX10-CU-NEXT: s_clause 0x1 3202; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3203; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3204; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3205; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 3206; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 3207; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3208; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3209; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3210; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3211; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3212; GFX10-CU-NEXT: s_endpgm 3213; 3214; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 3215; SKIP-CACHE-INV: ; %bb.0: ; %entry 3216; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3217; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 3218; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3219; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 3220; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 3221; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3222; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3223; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3224; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3225; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3226; SKIP-CACHE-INV-NEXT: s_endpgm 3227; 3228; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 3229; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3230; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3231; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3232; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3233; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3234; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3235; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3236; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3237; 3238; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 3239; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3240; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3241; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3242; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3243; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3244; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3245; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3246; GFX90A-TGSPLIT-NEXT: s_endpgm 3247; 3248; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 3249; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 3250; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3251; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 3252; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3253; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 3254; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 3255; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3256; GFX940-NOTTGSPLIT-NEXT: s_endpgm 3257; 
3258; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 3259; GFX940-TGSPLIT: ; %bb.0: ; %entry 3260; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3261; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 3262; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3263; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 3264; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 3265; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3266; GFX940-TGSPLIT-NEXT: s_endpgm 3267; 3268; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 3269; GFX11-WGP: ; %bb.0: ; %entry 3270; GFX11-WGP-NEXT: s_clause 0x1 3271; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 3272; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 3273; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3274; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 3275; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 3276; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3277; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3278; GFX11-WGP-NEXT: s_endpgm 3279; 3280; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 3281; GFX11-CU: ; %bb.0: ; %entry 3282; GFX11-CU-NEXT: s_clause 0x1 3283; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 3284; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 3285; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3286; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 3287; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 3288; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3289; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3290; GFX11-CU-NEXT: s_endpgm 3291 i32* %out, i32 %in, i32 %old) { 3292entry: 3293 %gep = getelementptr i32, i32* %out, i32 4 3294 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire 3295 ret void 3296} 3297 3298define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( 3299; GFX7-LABEL: 
flat_singlethread_monotonic_seq_cst_cmpxchg: 3300; GFX7: ; %bb.0: ; %entry 3301; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3302; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3303; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3304; GFX7-NEXT: s_add_u32 s0, s0, 16 3305; GFX7-NEXT: s_addc_u32 s1, s1, 0 3306; GFX7-NEXT: v_mov_b32_e32 v0, s0 3307; GFX7-NEXT: v_mov_b32_e32 v2, s2 3308; GFX7-NEXT: v_mov_b32_e32 v1, s1 3309; GFX7-NEXT: v_mov_b32_e32 v3, s3 3310; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3311; GFX7-NEXT: s_endpgm 3312; 3313; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 3314; GFX10-WGP: ; %bb.0: ; %entry 3315; GFX10-WGP-NEXT: s_clause 0x1 3316; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3317; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3318; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3319; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 3320; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 3321; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3322; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3323; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3324; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3325; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3326; GFX10-WGP-NEXT: s_endpgm 3327; 3328; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 3329; GFX10-CU: ; %bb.0: ; %entry 3330; GFX10-CU-NEXT: s_clause 0x1 3331; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3332; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3333; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3334; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 3335; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 3336; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3337; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3338; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3339; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3340; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3341; GFX10-CU-NEXT: s_endpgm 3342; 3343; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 3344; SKIP-CACHE-INV: ; %bb.0: ; %entry 3345; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3346; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 3347; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3348; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 3349; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 3350; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3351; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3352; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3353; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3354; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3355; SKIP-CACHE-INV-NEXT: s_endpgm 3356; 3357; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 3358; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3359; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3360; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3361; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3362; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3363; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3364; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3365; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3366; 3367; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 3368; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3369; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3370; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3371; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3372; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3373; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3374; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3375; GFX90A-TGSPLIT-NEXT: s_endpgm 3376; 3377; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 3378; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 3379; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3380; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 3381; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3382; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 3383; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 3384; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3385; GFX940-NOTTGSPLIT-NEXT: s_endpgm 3386; 3387; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 3388; GFX940-TGSPLIT: ; %bb.0: ; %entry 3389; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3390; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 3391; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3392; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 3393; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 3394; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3395; GFX940-TGSPLIT-NEXT: s_endpgm 3396; 3397; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 3398; GFX11-WGP: ; %bb.0: ; %entry 3399; GFX11-WGP-NEXT: s_clause 0x1 3400; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 3401; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 3402; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3403; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 3404; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 3405; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3406; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3407; GFX11-WGP-NEXT: s_endpgm 3408; 3409; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 3410; GFX11-CU: ; %bb.0: ; %entry 3411; GFX11-CU-NEXT: s_clause 0x1 3412; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 3413; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 3414; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3415; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 3416; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 3417; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3418; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3419; GFX11-CU-NEXT: s_endpgm 3420 i32* %out, i32 %in, i32 %old) { 3421entry: 3422 %gep = getelementptr i32, i32* %out, i32 4 3423 %val = cmpxchg volatile i32* %gep, i32 %old, 
i32 %in syncscope("singlethread") monotonic seq_cst 3424 ret void 3425} 3426 3427define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( 3428; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 3429; GFX7: ; %bb.0: ; %entry 3430; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3431; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3432; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3433; GFX7-NEXT: s_add_u32 s0, s0, 16 3434; GFX7-NEXT: s_addc_u32 s1, s1, 0 3435; GFX7-NEXT: v_mov_b32_e32 v0, s0 3436; GFX7-NEXT: v_mov_b32_e32 v2, s2 3437; GFX7-NEXT: v_mov_b32_e32 v1, s1 3438; GFX7-NEXT: v_mov_b32_e32 v3, s3 3439; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3440; GFX7-NEXT: s_endpgm 3441; 3442; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 3443; GFX10-WGP: ; %bb.0: ; %entry 3444; GFX10-WGP-NEXT: s_clause 0x1 3445; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3446; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3447; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3448; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 3449; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 3450; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3451; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3452; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3453; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3454; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3455; GFX10-WGP-NEXT: s_endpgm 3456; 3457; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 3458; GFX10-CU: ; %bb.0: ; %entry 3459; GFX10-CU-NEXT: s_clause 0x1 3460; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3461; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3462; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3463; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 3464; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 3465; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3466; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3467; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3468; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3469; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3470; GFX10-CU-NEXT: s_endpgm 3471; 3472; SKIP-CACHE-INV-LABEL: 
flat_singlethread_acquire_seq_cst_cmpxchg: 3473; SKIP-CACHE-INV: ; %bb.0: ; %entry 3474; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3475; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 3476; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3477; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 3478; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 3479; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3480; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3481; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3482; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3483; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3484; SKIP-CACHE-INV-NEXT: s_endpgm 3485; 3486; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 3487; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3488; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3489; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3490; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3491; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3492; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3493; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3494; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3495; 3496; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 3497; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3498; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3499; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3500; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3501; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3502; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3503; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3504; GFX90A-TGSPLIT-NEXT: s_endpgm 3505; 3506; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 3507; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 3508; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3509; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x8 3510; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3511; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 3512; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 3513; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3514; GFX940-NOTTGSPLIT-NEXT: s_endpgm 3515; 3516; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 3517; GFX940-TGSPLIT: ; %bb.0: ; %entry 3518; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3519; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 3520; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3521; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 3522; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 3523; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3524; GFX940-TGSPLIT-NEXT: s_endpgm 3525; 3526; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 3527; GFX11-WGP: ; %bb.0: ; %entry 3528; GFX11-WGP-NEXT: s_clause 0x1 3529; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 3530; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 3531; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3532; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 3533; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 3534; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3535; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3536; GFX11-WGP-NEXT: s_endpgm 3537; 3538; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 3539; GFX11-CU: ; %bb.0: ; %entry 3540; GFX11-CU-NEXT: s_clause 0x1 3541; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 3542; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 3543; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3544; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 3545; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 3546; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3547; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3548; GFX11-CU-NEXT: s_endpgm 3549 i32* %out, 
i32 %in, i32 %old) { 3550entry: 3551 %gep = getelementptr i32, i32* %out, i32 4 3552 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst 3553 ret void 3554} 3555 3556define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( 3557; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 3558; GFX7: ; %bb.0: ; %entry 3559; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3560; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3561; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3562; GFX7-NEXT: s_add_u32 s0, s0, 16 3563; GFX7-NEXT: s_addc_u32 s1, s1, 0 3564; GFX7-NEXT: v_mov_b32_e32 v0, s0 3565; GFX7-NEXT: v_mov_b32_e32 v2, s2 3566; GFX7-NEXT: v_mov_b32_e32 v1, s1 3567; GFX7-NEXT: v_mov_b32_e32 v3, s3 3568; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3569; GFX7-NEXT: s_endpgm 3570; 3571; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 3572; GFX10-WGP: ; %bb.0: ; %entry 3573; GFX10-WGP-NEXT: s_clause 0x1 3574; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3575; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3576; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3577; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 3578; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 3579; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3580; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3581; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3582; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3583; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3584; GFX10-WGP-NEXT: s_endpgm 3585; 3586; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 3587; GFX10-CU: ; %bb.0: ; %entry 3588; GFX10-CU-NEXT: s_clause 0x1 3589; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3590; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3591; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3592; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 3593; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 3594; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3595; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3596; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3597; GFX10-CU-NEXT: 
v_mov_b32_e32 v3, s3 3598; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3599; GFX10-CU-NEXT: s_endpgm 3600; 3601; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 3602; SKIP-CACHE-INV: ; %bb.0: ; %entry 3603; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3604; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 3605; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3606; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 3607; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 3608; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3609; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3610; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3611; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3612; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3613; SKIP-CACHE-INV-NEXT: s_endpgm 3614; 3615; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 3616; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3617; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3618; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3619; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3620; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3621; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3622; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3623; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3624; 3625; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 3626; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3627; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3628; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3629; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3630; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3631; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3632; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3633; GFX90A-TGSPLIT-NEXT: s_endpgm 3634; 3635; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 3636; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 3637; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3638; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 3639; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3640; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 3641; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 3642; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3643; GFX940-NOTTGSPLIT-NEXT: s_endpgm 3644; 3645; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 3646; GFX940-TGSPLIT: ; %bb.0: ; %entry 3647; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 3648; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 3649; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3650; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 3651; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 3652; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3653; GFX940-TGSPLIT-NEXT: s_endpgm 3654; 3655; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 3656; GFX11-WGP: ; %bb.0: ; %entry 3657; GFX11-WGP-NEXT: s_clause 0x1 3658; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 3659; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 3660; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3661; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 3662; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 3663; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3664; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3665; GFX11-WGP-NEXT: s_endpgm 3666; 3667; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 3668; GFX11-CU: ; %bb.0: ; %entry 3669; GFX11-CU-NEXT: s_clause 0x1 3670; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 3671; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 3672; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3673; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 3674; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 3675; GFX11-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
  ret void
}

; Volatile cmpxchg at syncscope("singlethread") with acq_rel success ordering
; and seq_cst failure ordering, on element 4 of %out (byte offset 16); the
; result is unused. For every run configuration the checks expect the
; non-returning form of flat_atomic_cmpswap (no glc/sc0 modifier) with no
; s_waitcnt and no cache invalidate/write-back instruction after it. The only
; s_waitcnt in the sequence is the lgkmcnt(0) that follows the s_load
; instructions.
define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
  ret void
}

; Volatile cmpxchg at syncscope("singlethread") with seq_cst ordering for both
; success and failure, on element 4 of %out (byte offset 16); the result is
; unused. For every run configuration the checks expect the non-returning form
; of flat_atomic_cmpswap (no glc/sc0 modifier) with no s_waitcnt and no cache
; invalidate/write-back instruction after it.
define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
  ret void
}

; Volatile cmpxchg at syncscope("singlethread") with monotonic ordering for
; both success and failure, on element 4 of %out (byte offset 16). The loaded
; value (field 0 of the result pair) is stored to %out, so the checks expect
; the returning form of flat_atomic_cmpswap: the glc modifier, or sc0 in the
; gfx940 runs. The dependent flat store is preceded by
; "s_waitcnt vmcnt(0) lgkmcnt(0)", or by "s_waitcnt vmcnt(0)" alone in the
; +tgsplit runs. No cache invalidate/write-back instruction is expected.
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}

; Volatile cmpxchg at syncscope("singlethread") with acquire success ordering
; and monotonic failure ordering, on element 4 of %out (byte offset 16). The
; loaded value (field 0 of the result pair) is stored to %out, so the checks
; expect the returning form of flat_atomic_cmpswap: the glc modifier, or sc0
; in the gfx940 runs. The dependent flat store is preceded by
; "s_waitcnt vmcnt(0) lgkmcnt(0)", or by "s_waitcnt vmcnt(0)" alone in the
; +tgsplit runs. No cache invalidate/write-back instruction is expected.
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}

; Volatile cmpxchg at syncscope("singlethread") with release success ordering
; and monotonic failure ordering, on element 4 of %out (byte offset 16). The
; loaded value (field 0 of the result pair) is stored to %out, so the checks
; expect the returning form of flat_atomic_cmpswap: the glc modifier, or sc0
; in the gfx940 runs. The dependent flat store is preceded by
; "s_waitcnt vmcnt(0) lgkmcnt(0)", or by "s_waitcnt vmcnt(0)" alone in the
; +tgsplit runs. No cache invalidate/write-back instruction is expected.
define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}

define amdgpu_kernel void
@flat_singlethread_acq_rel_monotonic_ret_cmpxchg( 4421; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 4422; GFX7: ; %bb.0: ; %entry 4423; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4424; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4425; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4426; GFX7-NEXT: s_add_u32 s4, s0, 16 4427; GFX7-NEXT: s_addc_u32 s5, s1, 0 4428; GFX7-NEXT: v_mov_b32_e32 v0, s4 4429; GFX7-NEXT: v_mov_b32_e32 v2, s2 4430; GFX7-NEXT: v_mov_b32_e32 v1, s5 4431; GFX7-NEXT: v_mov_b32_e32 v3, s3 4432; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4433; GFX7-NEXT: v_mov_b32_e32 v0, s0 4434; GFX7-NEXT: v_mov_b32_e32 v1, s1 4435; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4436; GFX7-NEXT: flat_store_dword v[0:1], v2 4437; GFX7-NEXT: s_endpgm 4438; 4439; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 4440; GFX10-WGP: ; %bb.0: ; %entry 4441; GFX10-WGP-NEXT: s_clause 0x1 4442; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4443; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4444; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4445; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4446; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4447; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4448; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4449; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4450; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4451; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4452; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4453; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4454; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4455; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4456; GFX10-WGP-NEXT: s_endpgm 4457; 4458; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 4459; GFX10-CU: ; %bb.0: ; %entry 4460; GFX10-CU-NEXT: s_clause 0x1 4461; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4462; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4463; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4464; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4465; GFX10-CU-NEXT: s_addc_u32 s5, s1, 
0 4466; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4467; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4468; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4469; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4470; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4471; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4472; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4473; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4474; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4475; GFX10-CU-NEXT: s_endpgm 4476; 4477; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 4478; SKIP-CACHE-INV: ; %bb.0: ; %entry 4479; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 4480; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 4481; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4482; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4483; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4484; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4485; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4486; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4487; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4488; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4489; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4490; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4491; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4492; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4493; SKIP-CACHE-INV-NEXT: s_endpgm 4494; 4495; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 4496; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4497; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4498; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4499; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4500; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4501; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4502; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4503; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4504; GFX90A-NOTTGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 4505; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4506; 4507; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 4508; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4509; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4510; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4511; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4512; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4513; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4514; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4515; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4516; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4517; GFX90A-TGSPLIT-NEXT: s_endpgm 4518; 4519; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 4520; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 4521; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 4522; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 4523; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4524; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 4525; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 4526; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 4527; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4528; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4529; GFX940-NOTTGSPLIT-NEXT: s_endpgm 4530; 4531; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 4532; GFX940-TGSPLIT: ; %bb.0: ; %entry 4533; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 4534; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 4535; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4536; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 4537; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 4538; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 4539; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4540; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4541; 
GFX940-TGSPLIT-NEXT: s_endpgm 4542; 4543; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 4544; GFX11-WGP: ; %bb.0: ; %entry 4545; GFX11-WGP-NEXT: s_clause 0x1 4546; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 4547; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 4548; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 4549; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 4550; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 4551; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 4552; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4553; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 4554; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4555; GFX11-WGP-NEXT: s_endpgm 4556; 4557; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 4558; GFX11-CU: ; %bb.0: ; %entry 4559; GFX11-CU-NEXT: s_clause 0x1 4560; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 4561; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 4562; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4563; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 4564; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 4565; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 4566; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4567; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 4568; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4569; GFX11-CU-NEXT: s_endpgm 4570 i32* %out, i32 %in, i32 %old) { 4571entry: 4572 %gep = getelementptr i32, i32* %out, i32 4 4573 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic 4574 %val0 = extractvalue { i32, i1 } %val, 0 4575 store i32 %val0, i32* %out, align 4 4576 ret void 4577} 4578 4579define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( 4580; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 4581; GFX7: ; %bb.0: ; %entry 4582; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4583; GFX7-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x2 4584; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4585; GFX7-NEXT: s_add_u32 s4, s0, 16 4586; GFX7-NEXT: s_addc_u32 s5, s1, 0 4587; GFX7-NEXT: v_mov_b32_e32 v0, s4 4588; GFX7-NEXT: v_mov_b32_e32 v2, s2 4589; GFX7-NEXT: v_mov_b32_e32 v1, s5 4590; GFX7-NEXT: v_mov_b32_e32 v3, s3 4591; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4592; GFX7-NEXT: v_mov_b32_e32 v0, s0 4593; GFX7-NEXT: v_mov_b32_e32 v1, s1 4594; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4595; GFX7-NEXT: flat_store_dword v[0:1], v2 4596; GFX7-NEXT: s_endpgm 4597; 4598; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 4599; GFX10-WGP: ; %bb.0: ; %entry 4600; GFX10-WGP-NEXT: s_clause 0x1 4601; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4602; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4603; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4604; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4605; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4606; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4607; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4608; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4609; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4610; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4611; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4612; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4613; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4614; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4615; GFX10-WGP-NEXT: s_endpgm 4616; 4617; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 4618; GFX10-CU: ; %bb.0: ; %entry 4619; GFX10-CU-NEXT: s_clause 0x1 4620; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4621; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4622; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4623; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4624; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4625; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4626; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4627; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4628; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4629; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
glc 4630; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4631; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4632; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4633; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4634; GFX10-CU-NEXT: s_endpgm 4635; 4636; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 4637; SKIP-CACHE-INV: ; %bb.0: ; %entry 4638; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 4639; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 4640; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4641; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4642; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4643; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4644; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4645; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4646; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4647; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4648; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4649; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4650; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4651; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4652; SKIP-CACHE-INV-NEXT: s_endpgm 4653; 4654; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 4655; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4656; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4657; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4658; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4659; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4660; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4661; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4662; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4663; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4664; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4665; 4666; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 4667; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4668; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 4669; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4670; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4671; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4672; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4673; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4674; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4675; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4676; GFX90A-TGSPLIT-NEXT: s_endpgm 4677; 4678; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 4679; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 4680; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 4681; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 4682; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4683; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 4684; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 4685; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 4686; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4687; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4688; GFX940-NOTTGSPLIT-NEXT: s_endpgm 4689; 4690; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 4691; GFX940-TGSPLIT: ; %bb.0: ; %entry 4692; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 4693; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 4694; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4695; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 4696; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 4697; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 4698; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4699; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4700; GFX940-TGSPLIT-NEXT: s_endpgm 4701; 4702; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 4703; GFX11-WGP: ; %bb.0: ; %entry 4704; GFX11-WGP-NEXT: s_clause 0x1 4705; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 4706; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 4707; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 4708; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 4709; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 4710; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 4711; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4712; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 4713; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4714; GFX11-WGP-NEXT: s_endpgm 4715; 4716; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 4717; GFX11-CU: ; %bb.0: ; %entry 4718; GFX11-CU-NEXT: s_clause 0x1 4719; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 4720; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 4721; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4722; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 4723; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 4724; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 4725; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4726; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 4727; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4728; GFX11-CU-NEXT: s_endpgm 4729 i32* %out, i32 %in, i32 %old) { 4730entry: 4731 %gep = getelementptr i32, i32* %out, i32 4 4732 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic 4733 %val0 = extractvalue { i32, i1 } %val, 0 4734 store i32 %val0, i32* %out, align 4 4735 ret void 4736} 4737 4738define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( 4739; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 4740; GFX7: ; %bb.0: ; %entry 4741; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4742; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4743; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4744; GFX7-NEXT: s_add_u32 s4, s0, 16 4745; GFX7-NEXT: s_addc_u32 s5, s1, 0 4746; GFX7-NEXT: v_mov_b32_e32 v0, s4 4747; GFX7-NEXT: v_mov_b32_e32 v2, s2 4748; GFX7-NEXT: v_mov_b32_e32 
v1, s5 4749; GFX7-NEXT: v_mov_b32_e32 v3, s3 4750; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4751; GFX7-NEXT: v_mov_b32_e32 v0, s0 4752; GFX7-NEXT: v_mov_b32_e32 v1, s1 4753; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4754; GFX7-NEXT: flat_store_dword v[0:1], v2 4755; GFX7-NEXT: s_endpgm 4756; 4757; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 4758; GFX10-WGP: ; %bb.0: ; %entry 4759; GFX10-WGP-NEXT: s_clause 0x1 4760; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4761; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4762; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4763; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4764; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4765; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4766; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4767; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4768; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4769; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4770; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4771; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4772; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4773; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4774; GFX10-WGP-NEXT: s_endpgm 4775; 4776; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 4777; GFX10-CU: ; %bb.0: ; %entry 4778; GFX10-CU-NEXT: s_clause 0x1 4779; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4780; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4781; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4782; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4783; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4784; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4785; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4786; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4787; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4788; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4789; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4790; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4791; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4792; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4793; GFX10-CU-NEXT: s_endpgm 4794; 4795; 
SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 4796; SKIP-CACHE-INV: ; %bb.0: ; %entry 4797; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 4798; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 4799; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4800; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4801; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4802; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4803; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4804; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4805; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4806; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4807; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4808; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4809; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4810; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4811; SKIP-CACHE-INV-NEXT: s_endpgm 4812; 4813; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 4814; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4815; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4816; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4817; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4818; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4819; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4820; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4821; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4822; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4823; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4824; 4825; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 4826; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4827; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4828; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4829; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4830; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4831; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4832; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4833; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4834; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4835; GFX90A-TGSPLIT-NEXT: s_endpgm 4836; 4837; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 4838; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 4839; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 4840; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 4841; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4842; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 4843; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 4844; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 4845; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4846; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4847; GFX940-NOTTGSPLIT-NEXT: s_endpgm 4848; 4849; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 4850; GFX940-TGSPLIT: ; %bb.0: ; %entry 4851; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 4852; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 4853; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4854; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 4855; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 4856; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 4857; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4858; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4859; GFX940-TGSPLIT-NEXT: s_endpgm 4860; 4861; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 4862; GFX11-WGP: ; %bb.0: ; %entry 4863; GFX11-WGP-NEXT: s_clause 0x1 4864; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 4865; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 4866; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 4867; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 4868; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 4869; 
GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 4870; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4871; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 4872; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4873; GFX11-WGP-NEXT: s_endpgm 4874; 4875; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 4876; GFX11-CU: ; %bb.0: ; %entry 4877; GFX11-CU-NEXT: s_clause 0x1 4878; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 4879; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 4880; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4881; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 4882; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 4883; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 4884; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4885; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 4886; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4887; GFX11-CU-NEXT: s_endpgm 4888 i32* %out, i32 %in, i32 %old) { 4889entry: 4890 %gep = getelementptr i32, i32* %out, i32 4 4891 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire 4892 %val0 = extractvalue { i32, i1 } %val, 0 4893 store i32 %val0, i32* %out, align 4 4894 ret void 4895} 4896 4897define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( 4898; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 4899; GFX7: ; %bb.0: ; %entry 4900; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4901; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4902; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4903; GFX7-NEXT: s_add_u32 s4, s0, 16 4904; GFX7-NEXT: s_addc_u32 s5, s1, 0 4905; GFX7-NEXT: v_mov_b32_e32 v0, s4 4906; GFX7-NEXT: v_mov_b32_e32 v2, s2 4907; GFX7-NEXT: v_mov_b32_e32 v1, s5 4908; GFX7-NEXT: v_mov_b32_e32 v3, s3 4909; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4910; GFX7-NEXT: v_mov_b32_e32 v0, s0 4911; GFX7-NEXT: v_mov_b32_e32 v1, s1 4912; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4913; 
GFX7-NEXT: flat_store_dword v[0:1], v2 4914; GFX7-NEXT: s_endpgm 4915; 4916; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 4917; GFX10-WGP: ; %bb.0: ; %entry 4918; GFX10-WGP-NEXT: s_clause 0x1 4919; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4920; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4921; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4922; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4923; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4924; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4925; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4926; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4927; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4928; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4929; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4930; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4931; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4932; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4933; GFX10-WGP-NEXT: s_endpgm 4934; 4935; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 4936; GFX10-CU: ; %bb.0: ; %entry 4937; GFX10-CU-NEXT: s_clause 0x1 4938; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4939; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4940; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4941; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4942; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4943; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4944; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4945; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4946; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4947; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4948; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4949; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4950; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4951; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4952; GFX10-CU-NEXT: s_endpgm 4953; 4954; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 4955; SKIP-CACHE-INV: ; %bb.0: ; %entry 4956; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 4957; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 4958; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4959; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4960; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4961; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4962; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4963; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4964; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4965; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4966; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4967; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4968; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4969; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4970; SKIP-CACHE-INV-NEXT: s_endpgm 4971; 4972; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 4973; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4974; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4975; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4976; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4977; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4978; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4979; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4980; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4981; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4982; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4983; 4984; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 4985; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4986; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4987; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4988; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4989; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4990; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4991; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4992; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4993; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4994; 
GFX90A-TGSPLIT-NEXT: s_endpgm 4995; 4996; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 4997; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 4998; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 4999; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 5000; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5001; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 5002; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 5003; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5004; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5005; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5006; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5007; 5008; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 5009; GFX940-TGSPLIT: ; %bb.0: ; %entry 5010; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5011; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 5012; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5013; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 5014; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 5015; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5016; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5017; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5018; GFX940-TGSPLIT-NEXT: s_endpgm 5019; 5020; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 5021; GFX11-WGP: ; %bb.0: ; %entry 5022; GFX11-WGP-NEXT: s_clause 0x1 5023; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 5024; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 5025; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5026; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 5027; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 5028; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5029; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5030; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 5031; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5032; 
; Documentation for @flat_singlethread_release_acquire_ret_cmpxchg, whose
; `define` appears within the following line.
; The kernel runs a volatile i32 cmpxchg at syncscope("singlethread") with
; success ordering release and failure ordering acquire on %out + 4 elements
; (byte offset 16), then stores field 0 of the result pair to %out.
; The pinned output for every run configuration is a single flat cmpswap with
; the glc bit (sc0 on gfx940), a wait, and the store; no cache invalidate
; instruction appears. Under +tgsplit that wait is vmcnt(0) alone, otherwise
; it is vmcnt(0) lgkmcnt(0).
; These assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
GFX11-WGP-NEXT: s_endpgm 5033; 5034; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 5035; GFX11-CU: ; %bb.0: ; %entry 5036; GFX11-CU-NEXT: s_clause 0x1 5037; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 5038; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 5039; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5040; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 5041; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 5042; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5043; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5044; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 5045; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5046; GFX11-CU-NEXT: s_endpgm 5047 i32* %out, i32 %in, i32 %old) { 5048entry: 5049 %gep = getelementptr i32, i32* %out, i32 4 5050 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire 5051 %val0 = extractvalue { i32, i1 } %val, 0 5052 store i32 %val0, i32* %out, align 4 5053 ret void 5054} 5055 5056define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( 5057; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 5058; GFX7: ; %bb.0: ; %entry 5059; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5060; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5061; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5062; GFX7-NEXT: s_add_u32 s4, s0, 16 5063; GFX7-NEXT: s_addc_u32 s5, s1, 0 5064; GFX7-NEXT: v_mov_b32_e32 v0, s4 5065; GFX7-NEXT: v_mov_b32_e32 v2, s2 5066; GFX7-NEXT: v_mov_b32_e32 v1, s5 5067; GFX7-NEXT: v_mov_b32_e32 v3, s3 5068; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5069; GFX7-NEXT: v_mov_b32_e32 v0, s0 5070; GFX7-NEXT: v_mov_b32_e32 v1, s1 5071; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5072; GFX7-NEXT: flat_store_dword v[0:1], v2 5073; GFX7-NEXT: s_endpgm 5074; 5075; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 5076; GFX10-WGP: ; %bb.0: ; %entry 5077; GFX10-WGP-NEXT: s_clause 0x1 5078; GFX10-WGP-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 5079; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5080; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5081; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5082; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5083; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5084; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5085; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5086; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5087; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5088; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5089; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5090; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5091; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5092; GFX10-WGP-NEXT: s_endpgm 5093; 5094; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 5095; GFX10-CU: ; %bb.0: ; %entry 5096; GFX10-CU-NEXT: s_clause 0x1 5097; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5098; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5099; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5100; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 5101; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5102; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5103; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5104; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5105; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5106; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5107; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5108; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5109; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5110; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5111; GFX10-CU-NEXT: s_endpgm 5112; 5113; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 5114; SKIP-CACHE-INV: ; %bb.0: ; %entry 5115; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5116; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 5117; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5118; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 5119; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5120; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5121; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5122; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 5123; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5124; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5125; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5126; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5127; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5128; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5129; SKIP-CACHE-INV-NEXT: s_endpgm 5130; 5131; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 5132; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5133; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5134; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5135; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5136; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5137; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5138; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5139; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5140; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5141; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5142; 5143; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 5144; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5145; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5146; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5147; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5148; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5149; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5150; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5151; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5152; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5153; GFX90A-TGSPLIT-NEXT: s_endpgm 5154; 5155; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 5156; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5157; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5158; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x8 5159; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5160; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 5161; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 5162; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5163; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5164; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5165; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5166; 5167; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 5168; GFX940-TGSPLIT: ; %bb.0: ; %entry 5169; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5170; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 5171; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5172; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 5173; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 5174; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5175; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5176; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5177; GFX940-TGSPLIT-NEXT: s_endpgm 5178; 5179; GFX11-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 5180; GFX11-WGP: ; %bb.0: ; %entry 5181; GFX11-WGP-NEXT: s_clause 0x1 5182; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 5183; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 5184; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5185; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 5186; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 5187; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5188; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5189; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 5190; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5191; GFX11-WGP-NEXT: s_endpgm 5192; 5193; GFX11-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 5194; GFX11-CU: ; %bb.0: ; %entry 5195; GFX11-CU-NEXT: s_clause 0x1 5196; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 5197; GFX11-CU-NEXT: s_load_b64 
; Documentation for @flat_singlethread_acq_rel_acquire_ret_cmpxchg, whose
; `define` appears within the following line.
; The kernel runs a volatile i32 cmpxchg at syncscope("singlethread") with
; success ordering acq_rel and failure ordering acquire on %out + 4 elements
; (byte offset 16), then stores field 0 of the result pair to %out.
; The pinned output for every run configuration is a single flat cmpswap with
; the glc bit (sc0 on gfx940), a wait, and the store; no cache invalidate
; instruction appears. Under +tgsplit that wait is vmcnt(0) alone, otherwise
; it is vmcnt(0) lgkmcnt(0).
; These assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
s[0:1], s[0:1], 0x8 5198; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5199; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 5200; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 5201; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5202; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5203; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 5204; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5205; GFX11-CU-NEXT: s_endpgm 5206 i32* %out, i32 %in, i32 %old) { 5207entry: 5208 %gep = getelementptr i32, i32* %out, i32 4 5209 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire 5210 %val0 = extractvalue { i32, i1 } %val, 0 5211 store i32 %val0, i32* %out, align 4 5212 ret void 5213} 5214 5215define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( 5216; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 5217; GFX7: ; %bb.0: ; %entry 5218; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5219; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5220; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5221; GFX7-NEXT: s_add_u32 s4, s0, 16 5222; GFX7-NEXT: s_addc_u32 s5, s1, 0 5223; GFX7-NEXT: v_mov_b32_e32 v0, s4 5224; GFX7-NEXT: v_mov_b32_e32 v2, s2 5225; GFX7-NEXT: v_mov_b32_e32 v1, s5 5226; GFX7-NEXT: v_mov_b32_e32 v3, s3 5227; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5228; GFX7-NEXT: v_mov_b32_e32 v0, s0 5229; GFX7-NEXT: v_mov_b32_e32 v1, s1 5230; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5231; GFX7-NEXT: flat_store_dword v[0:1], v2 5232; GFX7-NEXT: s_endpgm 5233; 5234; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 5235; GFX10-WGP: ; %bb.0: ; %entry 5236; GFX10-WGP-NEXT: s_clause 0x1 5237; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5238; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5239; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5240; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5241; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5242; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 
5243; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5244; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5245; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5246; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5247; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5248; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5249; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5250; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5251; GFX10-WGP-NEXT: s_endpgm 5252; 5253; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 5254; GFX10-CU: ; %bb.0: ; %entry 5255; GFX10-CU-NEXT: s_clause 0x1 5256; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5257; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5258; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5259; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 5260; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5261; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5262; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5263; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5264; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5265; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5266; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5267; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5268; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5269; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5270; GFX10-CU-NEXT: s_endpgm 5271; 5272; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 5273; SKIP-CACHE-INV: ; %bb.0: ; %entry 5274; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5275; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 5276; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5277; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 5278; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5279; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5280; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5281; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 5282; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5283; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5284; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5285; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5286; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5287; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5288; SKIP-CACHE-INV-NEXT: s_endpgm 5289; 5290; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 5291; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5292; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5293; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5294; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5295; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5296; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5297; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5298; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5299; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5300; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5301; 5302; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 5303; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5304; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5305; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5306; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5307; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5308; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5309; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5310; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5311; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5312; GFX90A-TGSPLIT-NEXT: s_endpgm 5313; 5314; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 5315; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5316; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5317; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 5318; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5319; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 5320; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 5321; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] offset:16 sc0 5322; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5323; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5324; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5325; 5326; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 5327; GFX940-TGSPLIT: ; %bb.0: ; %entry 5328; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5329; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 5330; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5331; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 5332; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 5333; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5334; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5335; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5336; GFX940-TGSPLIT-NEXT: s_endpgm 5337; 5338; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 5339; GFX11-WGP: ; %bb.0: ; %entry 5340; GFX11-WGP-NEXT: s_clause 0x1 5341; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 5342; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 5343; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5344; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 5345; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 5346; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5347; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5348; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 5349; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5350; GFX11-WGP-NEXT: s_endpgm 5351; 5352; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 5353; GFX11-CU: ; %bb.0: ; %entry 5354; GFX11-CU-NEXT: s_clause 0x1 5355; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 5356; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 5357; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5358; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 5359; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 5360; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] 
; Documentation for @flat_singlethread_seq_cst_acquire_ret_cmpxchg, whose
; `define` appears within the following line.
; The kernel runs a volatile i32 cmpxchg at syncscope("singlethread") with
; success ordering seq_cst and failure ordering acquire on %out + 4 elements
; (byte offset 16), then stores field 0 of the result pair to %out.
; The pinned output for every run configuration is a single flat cmpswap with
; the glc bit (sc0 on gfx940), a wait, and the store; no cache invalidate
; instruction appears. Under +tgsplit that wait is vmcnt(0) alone, otherwise
; it is vmcnt(0) lgkmcnt(0).
; These assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
offset:16 glc 5361; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5362; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 5363; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5364; GFX11-CU-NEXT: s_endpgm 5365 i32* %out, i32 %in, i32 %old) { 5366entry: 5367 %gep = getelementptr i32, i32* %out, i32 4 5368 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire 5369 %val0 = extractvalue { i32, i1 } %val, 0 5370 store i32 %val0, i32* %out, align 4 5371 ret void 5372} 5373 5374define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( 5375; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 5376; GFX7: ; %bb.0: ; %entry 5377; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5378; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5379; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5380; GFX7-NEXT: s_add_u32 s4, s0, 16 5381; GFX7-NEXT: s_addc_u32 s5, s1, 0 5382; GFX7-NEXT: v_mov_b32_e32 v0, s4 5383; GFX7-NEXT: v_mov_b32_e32 v2, s2 5384; GFX7-NEXT: v_mov_b32_e32 v1, s5 5385; GFX7-NEXT: v_mov_b32_e32 v3, s3 5386; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5387; GFX7-NEXT: v_mov_b32_e32 v0, s0 5388; GFX7-NEXT: v_mov_b32_e32 v1, s1 5389; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5390; GFX7-NEXT: flat_store_dword v[0:1], v2 5391; GFX7-NEXT: s_endpgm 5392; 5393; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 5394; GFX10-WGP: ; %bb.0: ; %entry 5395; GFX10-WGP-NEXT: s_clause 0x1 5396; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5397; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5398; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5399; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5400; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5401; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5402; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5403; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5404; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5405; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5406; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5407; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s1 5408; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5409; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5410; GFX10-WGP-NEXT: s_endpgm 5411; 5412; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 5413; GFX10-CU: ; %bb.0: ; %entry 5414; GFX10-CU-NEXT: s_clause 0x1 5415; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5416; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5417; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5418; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 5419; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5420; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5421; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5422; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5423; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5424; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5425; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5426; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5427; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5428; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5429; GFX10-CU-NEXT: s_endpgm 5430; 5431; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 5432; SKIP-CACHE-INV: ; %bb.0: ; %entry 5433; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5434; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 5435; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5436; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 5437; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5438; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5439; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5440; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 5441; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5442; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5443; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5444; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5445; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5446; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5447; SKIP-CACHE-INV-NEXT: s_endpgm 5448; 5449; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 5450; GFX90A-NOTTGSPLIT: ; %bb.0: ; 
%entry 5451; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5452; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5453; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5454; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5455; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5456; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5457; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5458; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5459; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5460; 5461; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 5462; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5463; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5464; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5465; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5466; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5467; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5468; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5469; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5470; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5471; GFX90A-TGSPLIT-NEXT: s_endpgm 5472; 5473; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 5474; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5475; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5476; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 5477; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5478; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 5479; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 5480; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5481; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5482; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5483; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5484; 5485; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 
5486; GFX940-TGSPLIT: ; %bb.0: ; %entry 5487; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5488; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 5489; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5490; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 5491; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 5492; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5493; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5494; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5495; GFX940-TGSPLIT-NEXT: s_endpgm 5496; 5497; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 5498; GFX11-WGP: ; %bb.0: ; %entry 5499; GFX11-WGP-NEXT: s_clause 0x1 5500; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 5501; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 5502; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5503; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 5504; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 5505; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5506; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5507; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 5508; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5509; GFX11-WGP-NEXT: s_endpgm 5510; 5511; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 5512; GFX11-CU: ; %bb.0: ; %entry 5513; GFX11-CU-NEXT: s_clause 0x1 5514; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 5515; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 5516; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5517; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 5518; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 5519; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5520; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5521; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 5522; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5523; GFX11-CU-NEXT: s_endpgm 5524 i32* %out, i32 %in, i32 %old) { 5525entry: 5526 %gep = 
; Documentation for @flat_singlethread_monotonic_seq_cst_ret_cmpxchg, whose
; `define` appears within the following line.
; The kernel runs a volatile i32 cmpxchg at syncscope("singlethread") with
; success ordering monotonic and failure ordering seq_cst on %out + 4 elements
; (byte offset 16), then stores field 0 of the result pair to %out.
; The pinned output for every run configuration is a single flat cmpswap with
; the glc bit (sc0 on gfx940), a wait, and the store; no cache invalidate
; instruction appears. Under +tgsplit that wait is vmcnt(0) alone, otherwise
; it is vmcnt(0) lgkmcnt(0).
; These assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
getelementptr i32, i32* %out, i32 4 5527 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire 5528 %val0 = extractvalue { i32, i1 } %val, 0 5529 store i32 %val0, i32* %out, align 4 5530 ret void 5531} 5532 5533define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( 5534; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 5535; GFX7: ; %bb.0: ; %entry 5536; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5537; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5538; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5539; GFX7-NEXT: s_add_u32 s4, s0, 16 5540; GFX7-NEXT: s_addc_u32 s5, s1, 0 5541; GFX7-NEXT: v_mov_b32_e32 v0, s4 5542; GFX7-NEXT: v_mov_b32_e32 v2, s2 5543; GFX7-NEXT: v_mov_b32_e32 v1, s5 5544; GFX7-NEXT: v_mov_b32_e32 v3, s3 5545; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5546; GFX7-NEXT: v_mov_b32_e32 v0, s0 5547; GFX7-NEXT: v_mov_b32_e32 v1, s1 5548; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5549; GFX7-NEXT: flat_store_dword v[0:1], v2 5550; GFX7-NEXT: s_endpgm 5551; 5552; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 5553; GFX10-WGP: ; %bb.0: ; %entry 5554; GFX10-WGP-NEXT: s_clause 0x1 5555; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5556; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5557; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5558; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5559; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5560; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5561; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5562; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5563; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5564; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5565; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5566; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5567; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5568; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5569; GFX10-WGP-NEXT: s_endpgm 5570; 5571; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 5572; GFX10-CU: ; %bb.0: ; 
%entry 5573; GFX10-CU-NEXT: s_clause 0x1 5574; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5575; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5576; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5577; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 5578; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5579; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5580; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5581; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5582; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5583; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5584; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5585; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5586; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5587; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5588; GFX10-CU-NEXT: s_endpgm 5589; 5590; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 5591; SKIP-CACHE-INV: ; %bb.0: ; %entry 5592; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5593; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 5594; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5595; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 5596; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5597; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5598; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5599; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 5600; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5601; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5602; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5603; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5604; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5605; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5606; SKIP-CACHE-INV-NEXT: s_endpgm 5607; 5608; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 5609; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5610; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5611; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5612; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5613; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] 5614; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5615; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5616; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5617; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5618; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5619; 5620; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 5621; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5622; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5623; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5624; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5625; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5626; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5627; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5628; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5629; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5630; GFX90A-TGSPLIT-NEXT: s_endpgm 5631; 5632; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 5633; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5634; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5635; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 5636; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5637; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 5638; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 5639; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5640; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5641; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5642; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5643; 5644; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 5645; GFX940-TGSPLIT: ; %bb.0: ; %entry 5646; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5647; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 5648; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5649; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 
v[0:1], s[2:3] 5650; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 5651; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5652; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5653; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5654; GFX940-TGSPLIT-NEXT: s_endpgm 5655; 5656; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 5657; GFX11-WGP: ; %bb.0: ; %entry 5658; GFX11-WGP-NEXT: s_clause 0x1 5659; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 5660; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 5661; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5662; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 5663; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 5664; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5665; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5666; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 5667; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5668; GFX11-WGP-NEXT: s_endpgm 5669; 5670; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 5671; GFX11-CU: ; %bb.0: ; %entry 5672; GFX11-CU-NEXT: s_clause 0x1 5673; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 5674; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 5675; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5676; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 5677; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 5678; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5679; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5680; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 5681; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5682; GFX11-CU-NEXT: s_endpgm 5683 i32* %out, i32 %in, i32 %old) { 5684entry: 5685 %gep = getelementptr i32, i32* %out, i32 4 5686 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst 5687 %val0 = extractvalue { i32, i1 } %val, 0 5688 store i32 %val0, i32* %out, align 4 5689 ret void 5690} 5691 
; Documentation for @flat_singlethread_acquire_seq_cst_ret_cmpxchg, whose
; `define` appears at the start of the following line.
; The kernel runs a volatile i32 cmpxchg at syncscope("singlethread") with
; success ordering acquire and failure ordering seq_cst on %out + 4 elements
; (byte offset 16), then stores field 0 of the result pair to %out.
; The pinned output for every run configuration is a single flat cmpswap with
; the glc bit (sc0 on gfx940), a wait, and the store; no cache invalidate
; instruction appears. Under +tgsplit that wait is vmcnt(0) alone, otherwise
; it is vmcnt(0) lgkmcnt(0).
; These assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5692define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( 5693; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 5694; GFX7: ; %bb.0: ; %entry 5695; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5696; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5697; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5698; GFX7-NEXT: s_add_u32 s4, s0, 16 5699; GFX7-NEXT: s_addc_u32 s5, s1, 0 5700; GFX7-NEXT: v_mov_b32_e32 v0, s4 5701; GFX7-NEXT: v_mov_b32_e32 v2, s2 5702; GFX7-NEXT: v_mov_b32_e32 v1, s5 5703; GFX7-NEXT: v_mov_b32_e32 v3, s3 5704; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5705; GFX7-NEXT: v_mov_b32_e32 v0, s0 5706; GFX7-NEXT: v_mov_b32_e32 v1, s1 5707; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5708; GFX7-NEXT: flat_store_dword v[0:1], v2 5709; GFX7-NEXT: s_endpgm 5710; 5711; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 5712; GFX10-WGP: ; %bb.0: ; %entry 5713; GFX10-WGP-NEXT: s_clause 0x1 5714; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5715; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5716; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5717; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5718; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5719; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5720; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5721; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5722; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5723; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5724; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5725; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5726; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5727; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5728; GFX10-WGP-NEXT: s_endpgm 5729; 5730; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 5731; GFX10-CU: ; %bb.0: ; %entry 5732; GFX10-CU-NEXT: s_clause 0x1 5733; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5734; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5735; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5736; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 5737; 
GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5738; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5739; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5740; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5741; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5742; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5743; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5744; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5745; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5746; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5747; GFX10-CU-NEXT: s_endpgm 5748; 5749; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 5750; SKIP-CACHE-INV: ; %bb.0: ; %entry 5751; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5752; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 5753; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5754; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 5755; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5756; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5757; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5758; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 5759; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5760; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5761; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5762; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5763; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5764; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5765; SKIP-CACHE-INV-NEXT: s_endpgm 5766; 5767; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 5768; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5769; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5770; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5771; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5772; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5773; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5774; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5775; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5776; 
GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5777; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5778; 5779; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 5780; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5781; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5782; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5783; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5784; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5785; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5786; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5787; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5788; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5789; GFX90A-TGSPLIT-NEXT: s_endpgm 5790; 5791; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 5792; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5793; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5794; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 5795; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5796; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 5797; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 5798; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5799; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5800; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5801; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5802; 5803; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 5804; GFX940-TGSPLIT: ; %bb.0: ; %entry 5805; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5806; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 5807; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5808; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 5809; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 5810; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5811; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5812; GFX940-TGSPLIT-NEXT: flat_store_dword 
v[0:1], v2 5813; GFX940-TGSPLIT-NEXT: s_endpgm 5814; 5815; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 5816; GFX11-WGP: ; %bb.0: ; %entry 5817; GFX11-WGP-NEXT: s_clause 0x1 5818; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 5819; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 5820; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5821; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 5822; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 5823; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5824; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5825; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 5826; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5827; GFX11-WGP-NEXT: s_endpgm 5828; 5829; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 5830; GFX11-CU: ; %bb.0: ; %entry 5831; GFX11-CU-NEXT: s_clause 0x1 5832; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 5833; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 5834; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5835; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 5836; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 5837; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5838; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5839; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 5840; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5841; GFX11-CU-NEXT: s_endpgm 5842 i32* %out, i32 %in, i32 %old) { 5843entry: 5844 %gep = getelementptr i32, i32* %out, i32 4 5845 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst 5846 %val0 = extractvalue { i32, i1 } %val, 0 5847 store i32 %val0, i32* %out, align 4 5848 ret void 5849} 5850 5851define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( 5852; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 5853; GFX7: ; %bb.0: ; %entry 5854; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5855; GFX7-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x2 5856; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5857; GFX7-NEXT: s_add_u32 s4, s0, 16 5858; GFX7-NEXT: s_addc_u32 s5, s1, 0 5859; GFX7-NEXT: v_mov_b32_e32 v0, s4 5860; GFX7-NEXT: v_mov_b32_e32 v2, s2 5861; GFX7-NEXT: v_mov_b32_e32 v1, s5 5862; GFX7-NEXT: v_mov_b32_e32 v3, s3 5863; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5864; GFX7-NEXT: v_mov_b32_e32 v0, s0 5865; GFX7-NEXT: v_mov_b32_e32 v1, s1 5866; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5867; GFX7-NEXT: flat_store_dword v[0:1], v2 5868; GFX7-NEXT: s_endpgm 5869; 5870; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 5871; GFX10-WGP: ; %bb.0: ; %entry 5872; GFX10-WGP-NEXT: s_clause 0x1 5873; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5874; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5875; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5876; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5877; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5878; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5879; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5880; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5881; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5882; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5883; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5884; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5885; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5886; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5887; GFX10-WGP-NEXT: s_endpgm 5888; 5889; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 5890; GFX10-CU: ; %bb.0: ; %entry 5891; GFX10-CU-NEXT: s_clause 0x1 5892; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5893; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5894; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5895; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 5896; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5897; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5898; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5899; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5900; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5901; GFX10-CU-NEXT: flat_atomic_cmpswap v2, 
v[0:1], v[2:3] glc 5902; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5903; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5904; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5905; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5906; GFX10-CU-NEXT: s_endpgm 5907; 5908; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 5909; SKIP-CACHE-INV: ; %bb.0: ; %entry 5910; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5911; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 5912; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5913; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 5914; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5915; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5916; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5917; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 5918; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5919; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5920; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5921; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5922; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5923; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5924; SKIP-CACHE-INV-NEXT: s_endpgm 5925; 5926; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 5927; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5928; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5929; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5930; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5931; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5932; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5933; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5934; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5935; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5936; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5937; 5938; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 5939; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5940; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 5941; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5942; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5943; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5944; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5945; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5946; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5947; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5948; GFX90A-TGSPLIT-NEXT: s_endpgm 5949; 5950; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 5951; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5952; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5953; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 5954; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5955; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 5956; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 5957; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5958; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5959; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5960; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5961; 5962; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 5963; GFX940-TGSPLIT: ; %bb.0: ; %entry 5964; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 5965; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 5966; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5967; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 5968; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 5969; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5970; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5971; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5972; GFX940-TGSPLIT-NEXT: s_endpgm 5973; 5974; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 5975; GFX11-WGP: ; %bb.0: ; %entry 5976; GFX11-WGP-NEXT: s_clause 0x1 5977; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 5978; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 5979; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5980; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 5981; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 5982; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5983; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5984; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 5985; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5986; GFX11-WGP-NEXT: s_endpgm 5987; 5988; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 5989; GFX11-CU: ; %bb.0: ; %entry 5990; GFX11-CU-NEXT: s_clause 0x1 5991; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 5992; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 5993; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5994; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 5995; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 5996; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5997; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5998; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 5999; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6000; GFX11-CU-NEXT: s_endpgm 6001 i32* %out, i32 %in, i32 %old) { 6002entry: 6003 %gep = getelementptr i32, i32* %out, i32 4 6004 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst 6005 %val0 = extractvalue { i32, i1 } %val, 0 6006 store i32 %val0, i32* %out, align 4 6007 ret void 6008} 6009 6010define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( 6011; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 6012; GFX7: ; %bb.0: ; %entry 6013; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6014; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6015; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6016; GFX7-NEXT: s_add_u32 s4, s0, 16 6017; GFX7-NEXT: s_addc_u32 s5, s1, 0 6018; GFX7-NEXT: v_mov_b32_e32 v0, s4 6019; GFX7-NEXT: v_mov_b32_e32 v2, s2 6020; GFX7-NEXT: v_mov_b32_e32 v1, s5 
6021; GFX7-NEXT: v_mov_b32_e32 v3, s3 6022; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6023; GFX7-NEXT: v_mov_b32_e32 v0, s0 6024; GFX7-NEXT: v_mov_b32_e32 v1, s1 6025; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6026; GFX7-NEXT: flat_store_dword v[0:1], v2 6027; GFX7-NEXT: s_endpgm 6028; 6029; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 6030; GFX10-WGP: ; %bb.0: ; %entry 6031; GFX10-WGP-NEXT: s_clause 0x1 6032; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6033; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6034; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6035; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 6036; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 6037; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 6038; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6039; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 6040; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 6041; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6042; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6043; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6044; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6045; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6046; GFX10-WGP-NEXT: s_endpgm 6047; 6048; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 6049; GFX10-CU: ; %bb.0: ; %entry 6050; GFX10-CU-NEXT: s_clause 0x1 6051; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6052; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6053; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6054; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 6055; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 6056; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 6057; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6058; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 6059; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6060; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6061; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6062; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6063; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6064; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6065; GFX10-CU-NEXT: s_endpgm 6066; 6067; SKIP-CACHE-INV-LABEL: 
flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 6068; SKIP-CACHE-INV: ; %bb.0: ; %entry 6069; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 6070; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 6071; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6072; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 6073; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 6074; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 6075; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6076; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 6077; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6078; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6079; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6080; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6081; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6082; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6083; SKIP-CACHE-INV-NEXT: s_endpgm 6084; 6085; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 6086; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6087; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6088; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6089; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6090; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6091; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6092; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6093; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6094; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6095; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6096; 6097; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 6098; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6099; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6100; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6101; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6102; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6103; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] 
op_sel:[0,1] 6104; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6105; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6106; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6107; GFX90A-TGSPLIT-NEXT: s_endpgm 6108; 6109; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 6110; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 6111; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 6112; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 6113; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6114; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 6115; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 6116; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 6117; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6118; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6119; GFX940-NOTTGSPLIT-NEXT: s_endpgm 6120; 6121; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 6122; GFX940-TGSPLIT: ; %bb.0: ; %entry 6123; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 6124; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 6125; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6126; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 6127; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 6128; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 6129; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6130; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6131; GFX940-TGSPLIT-NEXT: s_endpgm 6132; 6133; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 6134; GFX11-WGP: ; %bb.0: ; %entry 6135; GFX11-WGP-NEXT: s_clause 0x1 6136; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 6137; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 6138; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 6139; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 6140; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 6141; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 
v2, v[0:1], v[2:3] offset:16 glc 6142; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6143; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 6144; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6145; GFX11-WGP-NEXT: s_endpgm 6146; 6147; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 6148; GFX11-CU: ; %bb.0: ; %entry 6149; GFX11-CU-NEXT: s_clause 0x1 6150; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 6151; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 6152; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 6153; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 6154; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 6155; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 6156; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6157; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 6158; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6159; GFX11-CU-NEXT: s_endpgm 6160 i32* %out, i32 %in, i32 %old) { 6161entry: 6162 %gep = getelementptr i32, i32* %out, i32 4 6163 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst 6164 %val0 = extractvalue { i32, i1 } %val, 0 6165 store i32 %val0, i32* %out, align 4 6166 ret void 6167} 6168 6169define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( 6170; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 6171; GFX7: ; %bb.0: ; %entry 6172; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6173; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6174; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6175; GFX7-NEXT: s_add_u32 s4, s0, 16 6176; GFX7-NEXT: s_addc_u32 s5, s1, 0 6177; GFX7-NEXT: v_mov_b32_e32 v0, s4 6178; GFX7-NEXT: v_mov_b32_e32 v2, s2 6179; GFX7-NEXT: v_mov_b32_e32 v1, s5 6180; GFX7-NEXT: v_mov_b32_e32 v3, s3 6181; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6182; GFX7-NEXT: v_mov_b32_e32 v0, s0 6183; GFX7-NEXT: v_mov_b32_e32 v1, s1 6184; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6185; GFX7-NEXT: flat_store_dword v[0:1], v2 6186; 
GFX7-NEXT: s_endpgm 6187; 6188; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 6189; GFX10-WGP: ; %bb.0: ; %entry 6190; GFX10-WGP-NEXT: s_clause 0x1 6191; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6192; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6193; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6194; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 6195; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 6196; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 6197; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6198; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 6199; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 6200; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6201; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6202; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6203; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6204; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6205; GFX10-WGP-NEXT: s_endpgm 6206; 6207; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 6208; GFX10-CU: ; %bb.0: ; %entry 6209; GFX10-CU-NEXT: s_clause 0x1 6210; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6211; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6212; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6213; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 6214; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 6215; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 6216; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6217; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 6218; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6219; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6220; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6221; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6222; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6223; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6224; GFX10-CU-NEXT: s_endpgm 6225; 6226; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 6227; SKIP-CACHE-INV: ; %bb.0: ; %entry 6228; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 6229; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 6230; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6231; 
SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 6232; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 6233; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 6234; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6235; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 6236; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6237; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6238; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6239; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6240; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6241; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6242; SKIP-CACHE-INV-NEXT: s_endpgm 6243; 6244; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 6245; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6246; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6247; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6248; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6249; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6250; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6251; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6252; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6253; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6254; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6255; 6256; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 6257; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6258; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6259; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6260; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6261; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6262; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6263; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6264; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6265; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6266; GFX90A-TGSPLIT-NEXT: s_endpgm 6267; 6268; 
GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 6269; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 6270; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 6271; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 6272; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6273; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 6274; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 6275; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 6276; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6277; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6278; GFX940-NOTTGSPLIT-NEXT: s_endpgm 6279; 6280; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 6281; GFX940-TGSPLIT: ; %bb.0: ; %entry 6282; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 6283; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 6284; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6285; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 6286; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 6287; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 6288; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6289; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6290; GFX940-TGSPLIT-NEXT: s_endpgm 6291; 6292; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 6293; GFX11-WGP: ; %bb.0: ; %entry 6294; GFX11-WGP-NEXT: s_clause 0x1 6295; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 6296; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 6297; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 6298; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 6299; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 6300; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 6301; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6302; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 6303; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6304; GFX11-WGP-NEXT: s_endpgm 6305; 6306; 
GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 6307; GFX11-CU: ; %bb.0: ; %entry 6308; GFX11-CU-NEXT: s_clause 0x1 6309; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 6310; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 6311; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 6312; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 6313; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 6314; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 6315; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6316; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 6317; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6318; GFX11-CU-NEXT: s_endpgm 6319 i32* %out, i32 %in, i32 %old) { 6320entry: 6321 %gep = getelementptr i32, i32* %out, i32 4 6322 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst 6323 %val0 = extractvalue { i32, i1 } %val, 0 6324 store i32 %val0, i32* %out, align 4 6325 ret void 6326} 6327 6328define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( 6329; GFX7-LABEL: flat_singlethread_one_as_unordered_load: 6330; GFX7: ; %bb.0: ; %entry 6331; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6332; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6333; GFX7-NEXT: v_mov_b32_e32 v0, s0 6334; GFX7-NEXT: v_mov_b32_e32 v1, s1 6335; GFX7-NEXT: flat_load_dword v2, v[0:1] 6336; GFX7-NEXT: v_mov_b32_e32 v0, s2 6337; GFX7-NEXT: v_mov_b32_e32 v1, s3 6338; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6339; GFX7-NEXT: flat_store_dword v[0:1], v2 6340; GFX7-NEXT: s_endpgm 6341; 6342; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load: 6343; GFX10-WGP: ; %bb.0: ; %entry 6344; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6345; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6346; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6347; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6348; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 6349; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 6350; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 6351; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 6352; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6353; GFX10-WGP-NEXT: s_endpgm 6354; 6355; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load: 6356; GFX10-CU: ; %bb.0: ; %entry 6357; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6358; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6359; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6360; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6361; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 6362; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 6363; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 6364; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6365; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6366; GFX10-CU-NEXT: s_endpgm 6367; 6368; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_load: 6369; SKIP-CACHE-INV: ; %bb.0: ; %entry 6370; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 6371; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6372; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 6373; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 6374; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 6375; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6376; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6377; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6378; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6379; SKIP-CACHE-INV-NEXT: s_endpgm 6380; 6381; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: 6382; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6383; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6384; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6385; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6386; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6387; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6388; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6389; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6390; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6391; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6392; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6393; 6394; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_one_as_unordered_load: 6395; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6396; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6397; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6398; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6399; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6400; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6401; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6402; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6403; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6404; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6405; GFX90A-TGSPLIT-NEXT: s_endpgm 6406; 6407; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: 6408; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 6409; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 6410; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6411; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6412; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6413; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6414; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6415; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6416; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6417; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6418; GFX940-NOTTGSPLIT-NEXT: s_endpgm 6419; 6420; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: 6421; GFX940-TGSPLIT: ; %bb.0: ; %entry 6422; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 6423; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6424; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6425; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6426; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6427; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6428; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6429; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6430; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6431; GFX940-TGSPLIT-NEXT: s_endpgm 6432; 6433; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_load: 6434; GFX11-WGP: ; %bb.0: ; %entry 6435; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 
6436; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 6437; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 6438; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 6439; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 6440; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6441; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 6442; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6443; GFX11-WGP-NEXT: s_endpgm 6444; 6445; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_load: 6446; GFX11-CU: ; %bb.0: ; %entry 6447; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 6448; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 6449; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 6450; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 6451; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 6452; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6453; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 6454; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6455; GFX11-CU-NEXT: s_endpgm 6456 i32* %in, i32* %out) { 6457entry: 6458 %val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4 6459 store i32 %val, i32* %out 6460 ret void 6461} 6462 6463define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( 6464; GFX7-LABEL: flat_singlethread_one_as_monotonic_load: 6465; GFX7: ; %bb.0: ; %entry 6466; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6467; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6468; GFX7-NEXT: v_mov_b32_e32 v0, s0 6469; GFX7-NEXT: v_mov_b32_e32 v1, s1 6470; GFX7-NEXT: flat_load_dword v2, v[0:1] 6471; GFX7-NEXT: v_mov_b32_e32 v0, s2 6472; GFX7-NEXT: v_mov_b32_e32 v1, s3 6473; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6474; GFX7-NEXT: flat_store_dword v[0:1], v2 6475; GFX7-NEXT: s_endpgm 6476; 6477; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load: 6478; GFX10-WGP: ; %bb.0: ; %entry 6479; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6480; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6481; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6482; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6483; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 6484; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 6485; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 6486; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6487; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6488; GFX10-WGP-NEXT: s_endpgm 6489; 6490; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load: 6491; GFX10-CU: ; %bb.0: ; %entry 6492; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6493; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6494; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6495; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6496; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 6497; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 6498; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 6499; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6500; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6501; GFX10-CU-NEXT: s_endpgm 6502; 6503; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_load: 6504; SKIP-CACHE-INV: ; %bb.0: ; %entry 6505; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 6506; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6507; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 6508; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 6509; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 6510; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6511; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6512; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6513; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6514; SKIP-CACHE-INV-NEXT: s_endpgm 6515; 6516; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: 6517; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6518; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6519; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6520; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6521; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6522; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6523; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6524; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6525; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 6526; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6527; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6528; 6529; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: 6530; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6531; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6532; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6533; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6534; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6535; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6536; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6537; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6538; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6539; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6540; GFX90A-TGSPLIT-NEXT: s_endpgm 6541; 6542; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: 6543; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 6544; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 6545; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6546; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6547; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6548; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6549; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6550; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6551; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6552; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6553; GFX940-NOTTGSPLIT-NEXT: s_endpgm 6554; 6555; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: 6556; GFX940-TGSPLIT: ; %bb.0: ; %entry 6557; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 6558; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6559; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6560; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6561; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6562; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6563; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6564; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6565; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6566; GFX940-TGSPLIT-NEXT: s_endpgm 
6567; 6568; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_load: 6569; GFX11-WGP: ; %bb.0: ; %entry 6570; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 6571; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 6572; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 6573; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 6574; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 6575; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6576; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 6577; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6578; GFX11-WGP-NEXT: s_endpgm 6579; 6580; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_load: 6581; GFX11-CU: ; %bb.0: ; %entry 6582; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 6583; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 6584; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 6585; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 6586; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 6587; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6588; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 6589; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6590; GFX11-CU-NEXT: s_endpgm 6591 i32* %in, i32* %out) { 6592entry: 6593 %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4 6594 store i32 %val, i32* %out 6595 ret void 6596} 6597 6598define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( 6599; GFX7-LABEL: flat_singlethread_one_as_acquire_load: 6600; GFX7: ; %bb.0: ; %entry 6601; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6602; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6603; GFX7-NEXT: v_mov_b32_e32 v0, s0 6604; GFX7-NEXT: v_mov_b32_e32 v1, s1 6605; GFX7-NEXT: flat_load_dword v2, v[0:1] 6606; GFX7-NEXT: v_mov_b32_e32 v0, s2 6607; GFX7-NEXT: v_mov_b32_e32 v1, s3 6608; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6609; GFX7-NEXT: flat_store_dword v[0:1], v2 6610; GFX7-NEXT: s_endpgm 6611; 6612; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load: 6613; GFX10-WGP: ; %bb.0: ; %entry 6614; 
GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6615; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6616; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6617; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6618; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 6619; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 6620; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 6621; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6622; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6623; GFX10-WGP-NEXT: s_endpgm 6624; 6625; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load: 6626; GFX10-CU: ; %bb.0: ; %entry 6627; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6628; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6629; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6630; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6631; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 6632; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 6633; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 6634; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6635; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6636; GFX10-CU-NEXT: s_endpgm 6637; 6638; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_load: 6639; SKIP-CACHE-INV: ; %bb.0: ; %entry 6640; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 6641; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6642; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 6643; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 6644; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 6645; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6646; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6647; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6648; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6649; SKIP-CACHE-INV-NEXT: s_endpgm 6650; 6651; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: 6652; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6653; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6654; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6655; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6656; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6657; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, 
v[0:1] 6658; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6659; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6660; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6661; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6662; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6663; 6664; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: 6665; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6666; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6667; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6668; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6669; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6670; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6671; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6672; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6673; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6674; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6675; GFX90A-TGSPLIT-NEXT: s_endpgm 6676; 6677; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: 6678; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 6679; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 6680; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6681; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6682; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6683; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6684; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6685; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6686; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6687; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6688; GFX940-NOTTGSPLIT-NEXT: s_endpgm 6689; 6690; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: 6691; GFX940-TGSPLIT: ; %bb.0: ; %entry 6692; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 6693; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6694; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6695; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6696; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6697; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6698; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6699; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6700; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6701; GFX940-TGSPLIT-NEXT: s_endpgm 6702; 6703; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_load: 6704; GFX11-WGP: ; %bb.0: ; %entry 6705; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 6706; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 6707; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 6708; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 6709; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 6710; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6711; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 6712; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6713; GFX11-WGP-NEXT: s_endpgm 6714; 6715; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_load: 6716; GFX11-CU: ; %bb.0: ; %entry 6717; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 6718; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 6719; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 6720; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 6721; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 6722; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6723; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 6724; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6725; GFX11-CU-NEXT: s_endpgm 6726 i32* %in, i32* %out) { 6727entry: 6728 %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4 6729 store i32 %val, i32* %out 6730 ret void 6731} 6732 6733define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( 6734; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load: 6735; GFX7: ; %bb.0: ; %entry 6736; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6737; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6738; GFX7-NEXT: v_mov_b32_e32 v0, s0 6739; GFX7-NEXT: v_mov_b32_e32 v1, s1 6740; GFX7-NEXT: flat_load_dword v2, v[0:1] 6741; GFX7-NEXT: v_mov_b32_e32 v0, s2 6742; GFX7-NEXT: v_mov_b32_e32 v1, s3 6743; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6744; GFX7-NEXT: flat_store_dword v[0:1], v2 6745; 
GFX7-NEXT: s_endpgm 6746; 6747; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: 6748; GFX10-WGP: ; %bb.0: ; %entry 6749; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6750; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6751; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6752; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6753; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 6754; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 6755; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 6756; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6757; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6758; GFX10-WGP-NEXT: s_endpgm 6759; 6760; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load: 6761; GFX10-CU: ; %bb.0: ; %entry 6762; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6763; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6764; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6765; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6766; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 6767; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 6768; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 6769; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6770; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6771; GFX10-CU-NEXT: s_endpgm 6772; 6773; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_load: 6774; SKIP-CACHE-INV: ; %bb.0: ; %entry 6775; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 6776; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6777; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 6778; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 6779; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 6780; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6781; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6782; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6783; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6784; SKIP-CACHE-INV-NEXT: s_endpgm 6785; 6786; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: 6787; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6788; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6789; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6790; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6791; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6792; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6793; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6794; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6795; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6796; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6797; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6798; 6799; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: 6800; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6801; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6802; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6803; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6804; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6805; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6806; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6807; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6808; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6809; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6810; GFX90A-TGSPLIT-NEXT: s_endpgm 6811; 6812; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: 6813; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 6814; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 6815; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6816; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6817; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6818; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6819; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6820; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6821; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6822; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6823; GFX940-NOTTGSPLIT-NEXT: s_endpgm 6824; 6825; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: 6826; GFX940-TGSPLIT: ; %bb.0: ; %entry 6827; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 6828; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6829; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6830; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 6831; 
GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 6832; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6833; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 6834; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6835; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6836; GFX940-TGSPLIT-NEXT: s_endpgm 6837; 6838; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: 6839; GFX11-WGP: ; %bb.0: ; %entry 6840; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 6841; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 6842; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 6843; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 6844; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 6845; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6846; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 6847; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6848; GFX11-WGP-NEXT: s_endpgm 6849; 6850; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_load: 6851; GFX11-CU: ; %bb.0: ; %entry 6852; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 6853; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 6854; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 6855; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 6856; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 6857; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6858; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 6859; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6860; GFX11-CU-NEXT: s_endpgm 6861 i32* %in, i32* %out) { 6862entry: 6863 %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4 6864 store i32 %val, i32* %out 6865 ret void 6866} 6867 6868define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( 6869; GFX7-LABEL: flat_singlethread_one_as_unordered_store: 6870; GFX7: ; %bb.0: ; %entry 6871; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 6872; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 6873; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6874; GFX7-NEXT: v_mov_b32_e32 v0, s0 6875; GFX7-NEXT: v_mov_b32_e32 v1, s1 6876; GFX7-NEXT: 
v_mov_b32_e32 v2, s2 6877; GFX7-NEXT: flat_store_dword v[0:1], v2 6878; GFX7-NEXT: s_endpgm 6879; 6880; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store: 6881; GFX10-WGP: ; %bb.0: ; %entry 6882; GFX10-WGP-NEXT: s_clause 0x1 6883; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 6884; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 6885; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6886; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6887; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6888; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6889; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6890; GFX10-WGP-NEXT: s_endpgm 6891; 6892; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store: 6893; GFX10-CU: ; %bb.0: ; %entry 6894; GFX10-CU-NEXT: s_clause 0x1 6895; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 6896; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 6897; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6898; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6899; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6900; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6901; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6902; GFX10-CU-NEXT: s_endpgm 6903; 6904; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_store: 6905; SKIP-CACHE-INV: ; %bb.0: ; %entry 6906; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 6907; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0 6908; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6909; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6910; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6911; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6912; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6913; SKIP-CACHE-INV-NEXT: s_endpgm 6914; 6915; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: 6916; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6917; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 6918; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 6919; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6920; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6921; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 6922; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6923; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6924; 6925; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: 6926; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6927; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 6928; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 6929; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6930; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6931; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 6932; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6933; GFX90A-TGSPLIT-NEXT: s_endpgm 6934; 6935; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: 6936; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 6937; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 6938; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 6939; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6940; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 6941; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 6942; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6943; GFX940-NOTTGSPLIT-NEXT: s_endpgm 6944; 6945; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: 6946; GFX940-TGSPLIT: ; %bb.0: ; %entry 6947; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 6948; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 6949; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6950; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 6951; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 6952; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6953; GFX940-TGSPLIT-NEXT: s_endpgm 6954; 6955; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_store: 6956; GFX11-WGP: ; %bb.0: ; %entry 6957; GFX11-WGP-NEXT: s_clause 0x1 6958; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 6959; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 6960; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 6961; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 6962; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, s0 6963; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 6964; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6965; GFX11-WGP-NEXT: s_endpgm 6966; 6967; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_store: 6968; GFX11-CU: ; %bb.0: ; %entry 6969; GFX11-CU-NEXT: s_clause 0x1 6970; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 6971; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 6972; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 6973; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 6974; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 6975; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 6976; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6977; GFX11-CU-NEXT: s_endpgm 6978 i32 %in, i32* %out) { 6979entry: 6980 store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4 6981 ret void 6982} 6983 6984define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( 6985; GFX7-LABEL: flat_singlethread_one_as_monotonic_store: 6986; GFX7: ; %bb.0: ; %entry 6987; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 6988; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 6989; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6990; GFX7-NEXT: v_mov_b32_e32 v0, s0 6991; GFX7-NEXT: v_mov_b32_e32 v1, s1 6992; GFX7-NEXT: v_mov_b32_e32 v2, s2 6993; GFX7-NEXT: flat_store_dword v[0:1], v2 6994; GFX7-NEXT: s_endpgm 6995; 6996; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store: 6997; GFX10-WGP: ; %bb.0: ; %entry 6998; GFX10-WGP-NEXT: s_clause 0x1 6999; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 7000; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 7001; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7002; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7003; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7004; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7005; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7006; GFX10-WGP-NEXT: s_endpgm 7007; 7008; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store: 7009; GFX10-CU: ; %bb.0: ; %entry 7010; GFX10-CU-NEXT: s_clause 0x1 7011; GFX10-CU-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x8 7012; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 7013; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7014; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7015; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7016; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7017; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7018; GFX10-CU-NEXT: s_endpgm 7019; 7020; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_store: 7021; SKIP-CACHE-INV: ; %bb.0: ; %entry 7022; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 7023; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0 7024; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7025; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7026; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7027; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7028; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7029; SKIP-CACHE-INV-NEXT: s_endpgm 7030; 7031; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: 7032; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7033; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 7034; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 7035; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7036; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7037; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7038; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7039; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7040; 7041; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: 7042; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7043; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 7044; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 7045; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7046; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7047; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7048; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7049; GFX90A-TGSPLIT-NEXT: s_endpgm 7050; 7051; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: 7052; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7053; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 7054; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 7055; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7056; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7057; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7058; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7059; GFX940-NOTTGSPLIT-NEXT: s_endpgm 7060; 7061; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: 7062; GFX940-TGSPLIT: ; %bb.0: ; %entry 7063; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 7064; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 7065; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7066; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7067; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7068; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7069; GFX940-TGSPLIT-NEXT: s_endpgm 7070; 7071; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_store: 7072; GFX11-WGP: ; %bb.0: ; %entry 7073; GFX11-WGP-NEXT: s_clause 0x1 7074; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 7075; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 7076; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 7077; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7078; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 7079; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 7080; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7081; GFX11-WGP-NEXT: s_endpgm 7082; 7083; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_store: 7084; GFX11-CU: ; %bb.0: ; %entry 7085; GFX11-CU-NEXT: s_clause 0x1 7086; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 7087; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 7088; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7089; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7090; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 7091; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 7092; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7093; GFX11-CU-NEXT: s_endpgm 7094 i32 %in, i32* %out) { 7095entry: 7096 store atomic i32 %in, i32* %out 
syncscope("singlethread-one-as") monotonic, align 4 7097 ret void 7098} 7099 7100define amdgpu_kernel void @flat_singlethread_one_as_release_store( 7101; GFX7-LABEL: flat_singlethread_one_as_release_store: 7102; GFX7: ; %bb.0: ; %entry 7103; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 7104; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 7105; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7106; GFX7-NEXT: v_mov_b32_e32 v0, s0 7107; GFX7-NEXT: v_mov_b32_e32 v1, s1 7108; GFX7-NEXT: v_mov_b32_e32 v2, s2 7109; GFX7-NEXT: flat_store_dword v[0:1], v2 7110; GFX7-NEXT: s_endpgm 7111; 7112; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store: 7113; GFX10-WGP: ; %bb.0: ; %entry 7114; GFX10-WGP-NEXT: s_clause 0x1 7115; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 7116; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 7117; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7118; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7119; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7120; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7121; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7122; GFX10-WGP-NEXT: s_endpgm 7123; 7124; GFX10-CU-LABEL: flat_singlethread_one_as_release_store: 7125; GFX10-CU: ; %bb.0: ; %entry 7126; GFX10-CU-NEXT: s_clause 0x1 7127; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 7128; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 7129; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7130; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7131; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7132; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7133; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7134; GFX10-CU-NEXT: s_endpgm 7135; 7136; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_store: 7137; SKIP-CACHE-INV: ; %bb.0: ; %entry 7138; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 7139; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0 7140; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7141; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7142; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7143; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7144; SKIP-CACHE-INV-NEXT: 
flat_store_dword v[0:1], v2 7145; SKIP-CACHE-INV-NEXT: s_endpgm 7146; 7147; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: 7148; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7149; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 7150; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 7151; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7152; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7153; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7154; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7155; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7156; 7157; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: 7158; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7159; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 7160; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 7161; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7162; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7163; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7164; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7165; GFX90A-TGSPLIT-NEXT: s_endpgm 7166; 7167; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: 7168; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7169; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 7170; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 7171; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7172; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7173; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7174; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7175; GFX940-NOTTGSPLIT-NEXT: s_endpgm 7176; 7177; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: 7178; GFX940-TGSPLIT: ; %bb.0: ; %entry 7179; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 7180; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 7181; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7182; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7183; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7184; GFX940-TGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 7185; GFX940-TGSPLIT-NEXT: s_endpgm 7186; 7187; GFX11-WGP-LABEL: flat_singlethread_one_as_release_store: 7188; GFX11-WGP: ; %bb.0: ; %entry 7189; GFX11-WGP-NEXT: s_clause 0x1 7190; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 7191; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 7192; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 7193; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7194; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 7195; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 7196; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7197; GFX11-WGP-NEXT: s_endpgm 7198; 7199; GFX11-CU-LABEL: flat_singlethread_one_as_release_store: 7200; GFX11-CU: ; %bb.0: ; %entry 7201; GFX11-CU-NEXT: s_clause 0x1 7202; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 7203; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 7204; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7205; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7206; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 7207; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 7208; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7209; GFX11-CU-NEXT: s_endpgm 7210 i32 %in, i32* %out) { 7211entry: 7212 store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4 7213 ret void 7214} 7215 7216define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( 7217; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store: 7218; GFX7: ; %bb.0: ; %entry 7219; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 7220; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 7221; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7222; GFX7-NEXT: v_mov_b32_e32 v0, s0 7223; GFX7-NEXT: v_mov_b32_e32 v1, s1 7224; GFX7-NEXT: v_mov_b32_e32 v2, s2 7225; GFX7-NEXT: flat_store_dword v[0:1], v2 7226; GFX7-NEXT: s_endpgm 7227; 7228; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: 7229; GFX10-WGP: ; %bb.0: ; %entry 7230; GFX10-WGP-NEXT: s_clause 0x1 7231; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 7232; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 
7233; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7234; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7235; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7236; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7237; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7238; GFX10-WGP-NEXT: s_endpgm 7239; 7240; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store: 7241; GFX10-CU: ; %bb.0: ; %entry 7242; GFX10-CU-NEXT: s_clause 0x1 7243; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 7244; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 7245; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7246; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7247; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7248; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7249; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7250; GFX10-CU-NEXT: s_endpgm 7251; 7252; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_store: 7253; SKIP-CACHE-INV: ; %bb.0: ; %entry 7254; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 7255; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0 7256; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7257; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7258; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7259; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7260; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7261; SKIP-CACHE-INV-NEXT: s_endpgm 7262; 7263; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: 7264; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7265; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 7266; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 7267; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7268; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7269; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7270; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7271; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7272; 7273; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: 7274; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7275; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 7276; GFX90A-TGSPLIT-NEXT: s_load_dword s2, 
s[4:5], 0x0 7277; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7278; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7279; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7280; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7281; GFX90A-TGSPLIT-NEXT: s_endpgm 7282; 7283; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: 7284; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7285; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 7286; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 7287; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7288; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7289; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7290; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7291; GFX940-NOTTGSPLIT-NEXT: s_endpgm 7292; 7293; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: 7294; GFX940-TGSPLIT: ; %bb.0: ; %entry 7295; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 7296; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 7297; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7298; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7299; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7300; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7301; GFX940-TGSPLIT-NEXT: s_endpgm 7302; 7303; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: 7304; GFX11-WGP: ; %bb.0: ; %entry 7305; GFX11-WGP-NEXT: s_clause 0x1 7306; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 7307; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 7308; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 7309; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7310; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 7311; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 7312; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7313; GFX11-WGP-NEXT: s_endpgm 7314; 7315; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_store: 7316; GFX11-CU: ; %bb.0: ; %entry 7317; GFX11-CU-NEXT: s_clause 0x1 7318; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 7319; GFX11-CU-NEXT: 
s_load_b32 s0, s[0:1], 0x0 7320; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7321; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7322; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 7323; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 7324; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7325; GFX11-CU-NEXT: s_endpgm 7326 i32 %in, i32* %out) { 7327entry: 7328 store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4 7329 ret void 7330} 7331 7332define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( 7333; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 7334; GFX7: ; %bb.0: ; %entry 7335; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7336; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 7337; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7338; GFX7-NEXT: v_mov_b32_e32 v0, s0 7339; GFX7-NEXT: v_mov_b32_e32 v1, s1 7340; GFX7-NEXT: v_mov_b32_e32 v2, s2 7341; GFX7-NEXT: flat_atomic_swap v[0:1], v2 7342; GFX7-NEXT: s_endpgm 7343; 7344; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 7345; GFX10-WGP: ; %bb.0: ; %entry 7346; GFX10-WGP-NEXT: s_clause 0x1 7347; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7348; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 7349; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7350; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7351; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7352; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7353; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 7354; GFX10-WGP-NEXT: s_endpgm 7355; 7356; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 7357; GFX10-CU: ; %bb.0: ; %entry 7358; GFX10-CU-NEXT: s_clause 0x1 7359; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7360; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 7361; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7362; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7363; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7364; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7365; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 7366; GFX10-CU-NEXT: s_endpgm 7367; 7368; SKIP-CACHE-INV-LABEL: 
flat_singlethread_one_as_monotonic_atomicrmw: 7369; SKIP-CACHE-INV: ; %bb.0: ; %entry 7370; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7371; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 7372; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7373; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7374; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7375; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7376; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 7377; SKIP-CACHE-INV-NEXT: s_endpgm 7378; 7379; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 7380; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7381; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7382; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 7383; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7384; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7385; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7386; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7387; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7388; 7389; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 7390; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7391; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7392; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 7393; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7394; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7395; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7396; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7397; GFX90A-TGSPLIT-NEXT: s_endpgm 7398; 7399; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 7400; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7401; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7402; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 7403; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7404; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7405; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7406; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7407; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm 7408; 7409; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 7410; GFX940-TGSPLIT: ; %bb.0: ; %entry 7411; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7412; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 7413; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7414; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7415; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7416; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7417; GFX940-TGSPLIT-NEXT: s_endpgm 7418; 7419; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 7420; GFX11-WGP: ; %bb.0: ; %entry 7421; GFX11-WGP-NEXT: s_clause 0x1 7422; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 7423; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 7424; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 7425; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7426; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 7427; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 7428; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7429; GFX11-WGP-NEXT: s_endpgm 7430; 7431; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 7432; GFX11-CU: ; %bb.0: ; %entry 7433; GFX11-CU-NEXT: s_clause 0x1 7434; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 7435; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 7436; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7437; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7438; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 7439; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 7440; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7441; GFX11-CU-NEXT: s_endpgm 7442 i32* %out, i32 %in) { 7443entry: 7444 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic 7445 ret void 7446} 7447 7448define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( 7449; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 7450; GFX7: ; %bb.0: ; %entry 7451; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7452; GFX7-NEXT: 
s_load_dword s2, s[4:5], 0x2 7453; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7454; GFX7-NEXT: v_mov_b32_e32 v0, s0 7455; GFX7-NEXT: v_mov_b32_e32 v1, s1 7456; GFX7-NEXT: v_mov_b32_e32 v2, s2 7457; GFX7-NEXT: flat_atomic_swap v[0:1], v2 7458; GFX7-NEXT: s_endpgm 7459; 7460; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 7461; GFX10-WGP: ; %bb.0: ; %entry 7462; GFX10-WGP-NEXT: s_clause 0x1 7463; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7464; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 7465; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7466; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7467; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7468; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7469; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 7470; GFX10-WGP-NEXT: s_endpgm 7471; 7472; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 7473; GFX10-CU: ; %bb.0: ; %entry 7474; GFX10-CU-NEXT: s_clause 0x1 7475; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7476; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 7477; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7478; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7479; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7480; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7481; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 7482; GFX10-CU-NEXT: s_endpgm 7483; 7484; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 7485; SKIP-CACHE-INV: ; %bb.0: ; %entry 7486; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7487; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 7488; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7489; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7490; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7491; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7492; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 7493; SKIP-CACHE-INV-NEXT: s_endpgm 7494; 7495; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 7496; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7497; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7498; GFX90A-NOTTGSPLIT-NEXT: s_load_dword 
s2, s[4:5], 0x8 7499; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7500; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7501; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7502; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7503; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7504; 7505; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 7506; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7507; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7508; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 7509; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7510; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7511; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7512; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7513; GFX90A-TGSPLIT-NEXT: s_endpgm 7514; 7515; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 7516; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7517; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7518; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 7519; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7520; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7521; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7522; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7523; GFX940-NOTTGSPLIT-NEXT: s_endpgm 7524; 7525; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 7526; GFX940-TGSPLIT: ; %bb.0: ; %entry 7527; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7528; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 7529; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7530; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7531; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7532; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7533; GFX940-TGSPLIT-NEXT: s_endpgm 7534; 7535; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 7536; GFX11-WGP: ; %bb.0: ; %entry 7537; GFX11-WGP-NEXT: s_clause 0x1 7538; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 7539; GFX11-WGP-NEXT: 
s_load_b32 s0, s[0:1], 0x8 7540; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 7541; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7542; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 7543; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 7544; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7545; GFX11-WGP-NEXT: s_endpgm 7546; 7547; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 7548; GFX11-CU: ; %bb.0: ; %entry 7549; GFX11-CU-NEXT: s_clause 0x1 7550; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 7551; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 7552; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7553; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7554; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 7555; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 7556; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7557; GFX11-CU-NEXT: s_endpgm 7558 i32* %out, i32 %in) { 7559entry: 7560 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire 7561 ret void 7562} 7563 7564define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( 7565; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw: 7566; GFX7: ; %bb.0: ; %entry 7567; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7568; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 7569; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7570; GFX7-NEXT: v_mov_b32_e32 v0, s0 7571; GFX7-NEXT: v_mov_b32_e32 v1, s1 7572; GFX7-NEXT: v_mov_b32_e32 v2, s2 7573; GFX7-NEXT: flat_atomic_swap v[0:1], v2 7574; GFX7-NEXT: s_endpgm 7575; 7576; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: 7577; GFX10-WGP: ; %bb.0: ; %entry 7578; GFX10-WGP-NEXT: s_clause 0x1 7579; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7580; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 7581; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7582; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7583; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7584; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7585; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 7586; 
GFX10-WGP-NEXT: s_endpgm 7587; 7588; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: 7589; GFX10-CU: ; %bb.0: ; %entry 7590; GFX10-CU-NEXT: s_clause 0x1 7591; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7592; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 7593; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7594; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7595; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7596; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7597; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 7598; GFX10-CU-NEXT: s_endpgm 7599; 7600; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_atomicrmw: 7601; SKIP-CACHE-INV: ; %bb.0: ; %entry 7602; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7603; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 7604; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7605; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7606; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7607; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7608; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 7609; SKIP-CACHE-INV-NEXT: s_endpgm 7610; 7611; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: 7612; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7613; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7614; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 7615; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7616; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7617; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7618; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7619; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7620; 7621; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: 7622; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7623; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7624; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 7625; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7626; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7627; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7628; GFX90A-TGSPLIT-NEXT: 
flat_atomic_swap v[0:1], v2 7629; GFX90A-TGSPLIT-NEXT: s_endpgm 7630; 7631; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: 7632; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7633; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7634; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 7635; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7636; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7637; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7638; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7639; GFX940-NOTTGSPLIT-NEXT: s_endpgm 7640; 7641; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: 7642; GFX940-TGSPLIT: ; %bb.0: ; %entry 7643; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7644; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 7645; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7646; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7647; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7648; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7649; GFX940-TGSPLIT-NEXT: s_endpgm 7650; 7651; GFX11-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: 7652; GFX11-WGP: ; %bb.0: ; %entry 7653; GFX11-WGP-NEXT: s_clause 0x1 7654; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 7655; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 7656; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 7657; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7658; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 7659; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 7660; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7661; GFX11-WGP-NEXT: s_endpgm 7662; 7663; GFX11-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: 7664; GFX11-CU: ; %bb.0: ; %entry 7665; GFX11-CU-NEXT: s_clause 0x1 7666; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 7667; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 7668; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7669; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7670; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 7671; 
GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 7672; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7673; GFX11-CU-NEXT: s_endpgm 7674 i32* %out, i32 %in) { 7675entry: 7676 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release 7677 ret void 7678} 7679 7680define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( 7681; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 7682; GFX7: ; %bb.0: ; %entry 7683; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7684; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 7685; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7686; GFX7-NEXT: v_mov_b32_e32 v0, s0 7687; GFX7-NEXT: v_mov_b32_e32 v1, s1 7688; GFX7-NEXT: v_mov_b32_e32 v2, s2 7689; GFX7-NEXT: flat_atomic_swap v[0:1], v2 7690; GFX7-NEXT: s_endpgm 7691; 7692; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 7693; GFX10-WGP: ; %bb.0: ; %entry 7694; GFX10-WGP-NEXT: s_clause 0x1 7695; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7696; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 7697; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7698; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7699; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7700; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7701; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 7702; GFX10-WGP-NEXT: s_endpgm 7703; 7704; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 7705; GFX10-CU: ; %bb.0: ; %entry 7706; GFX10-CU-NEXT: s_clause 0x1 7707; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7708; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 7709; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7710; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7711; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7712; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7713; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 7714; GFX10-CU-NEXT: s_endpgm 7715; 7716; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 7717; SKIP-CACHE-INV: ; %bb.0: ; %entry 7718; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7719; SKIP-CACHE-INV-NEXT: s_load_dword 
s0, s[0:1], 0x2 7720; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7721; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7722; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7723; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7724; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 7725; SKIP-CACHE-INV-NEXT: s_endpgm 7726; 7727; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 7728; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7729; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7730; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 7731; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7732; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7733; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7734; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7735; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7736; 7737; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 7738; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7739; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7740; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 7741; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7742; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7743; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7744; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7745; GFX90A-TGSPLIT-NEXT: s_endpgm 7746; 7747; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 7748; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7749; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7750; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 7751; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7752; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7753; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7754; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7755; GFX940-NOTTGSPLIT-NEXT: s_endpgm 7756; 7757; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 7758; GFX940-TGSPLIT: ; %bb.0: ; %entry 7759; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x0 7760; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 7761; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7762; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7763; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7764; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7765; GFX940-TGSPLIT-NEXT: s_endpgm 7766; 7767; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 7768; GFX11-WGP: ; %bb.0: ; %entry 7769; GFX11-WGP-NEXT: s_clause 0x1 7770; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 7771; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 7772; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 7773; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7774; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 7775; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 7776; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7777; GFX11-WGP-NEXT: s_endpgm 7778; 7779; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 7780; GFX11-CU: ; %bb.0: ; %entry 7781; GFX11-CU-NEXT: s_clause 0x1 7782; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 7783; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 7784; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7785; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7786; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 7787; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 7788; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7789; GFX11-CU-NEXT: s_endpgm 7790 i32* %out, i32 %in) { 7791entry: 7792 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel 7793 ret void 7794} 7795 7796define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( 7797; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 7798; GFX7: ; %bb.0: ; %entry 7799; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7800; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 7801; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7802; GFX7-NEXT: v_mov_b32_e32 v0, s0 7803; GFX7-NEXT: v_mov_b32_e32 v1, s1 7804; GFX7-NEXT: v_mov_b32_e32 v2, s2 7805; GFX7-NEXT: flat_atomic_swap 
v[0:1], v2 7806; GFX7-NEXT: s_endpgm 7807; 7808; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 7809; GFX10-WGP: ; %bb.0: ; %entry 7810; GFX10-WGP-NEXT: s_clause 0x1 7811; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7812; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 7813; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7814; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7815; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7816; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7817; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 7818; GFX10-WGP-NEXT: s_endpgm 7819; 7820; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 7821; GFX10-CU: ; %bb.0: ; %entry 7822; GFX10-CU-NEXT: s_clause 0x1 7823; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7824; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 7825; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7826; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7827; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7828; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7829; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 7830; GFX10-CU-NEXT: s_endpgm 7831; 7832; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 7833; SKIP-CACHE-INV: ; %bb.0: ; %entry 7834; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7835; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 7836; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7837; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7838; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7839; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7840; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 7841; SKIP-CACHE-INV-NEXT: s_endpgm 7842; 7843; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 7844; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7845; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7846; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 7847; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7848; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7849; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7850; 
GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7851; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7852; 7853; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 7854; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7855; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7856; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 7857; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7858; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7859; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7860; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7861; GFX90A-TGSPLIT-NEXT: s_endpgm 7862; 7863; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 7864; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7865; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7866; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 7867; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7868; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7869; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7870; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7871; GFX940-NOTTGSPLIT-NEXT: s_endpgm 7872; 7873; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 7874; GFX940-TGSPLIT: ; %bb.0: ; %entry 7875; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7876; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 7877; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7878; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7879; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7880; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 7881; GFX940-TGSPLIT-NEXT: s_endpgm 7882; 7883; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 7884; GFX11-WGP: ; %bb.0: ; %entry 7885; GFX11-WGP-NEXT: s_clause 0x1 7886; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 7887; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 7888; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 7889; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7890; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 7891; GFX11-WGP-NEXT: 
flat_atomic_swap_b32 v[0:1], v2 7892; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7893; GFX11-WGP-NEXT: s_endpgm 7894; 7895; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 7896; GFX11-CU: ; %bb.0: ; %entry 7897; GFX11-CU-NEXT: s_clause 0x1 7898; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 7899; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 7900; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7901; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 7902; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 7903; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 7904; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 7905; GFX11-CU-NEXT: s_endpgm 7906 i32* %out, i32 %in) { 7907entry: 7908 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst 7909 ret void 7910} 7911 7912define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( 7913; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 7914; GFX7: ; %bb.0: ; %entry 7915; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7916; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 7917; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7918; GFX7-NEXT: v_mov_b32_e32 v0, s0 7919; GFX7-NEXT: v_mov_b32_e32 v1, s1 7920; GFX7-NEXT: v_mov_b32_e32 v2, s2 7921; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 7922; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7923; GFX7-NEXT: flat_store_dword v[0:1], v2 7924; GFX7-NEXT: s_endpgm 7925; 7926; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 7927; GFX10-WGP: ; %bb.0: ; %entry 7928; GFX10-WGP-NEXT: s_clause 0x1 7929; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7930; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 7931; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7932; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7933; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7934; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7935; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 7936; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7937; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 
7938; GFX10-WGP-NEXT: s_endpgm 7939; 7940; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 7941; GFX10-CU: ; %bb.0: ; %entry 7942; GFX10-CU-NEXT: s_clause 0x1 7943; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7944; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 7945; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7946; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7947; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7948; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7949; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 7950; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7951; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7952; GFX10-CU-NEXT: s_endpgm 7953; 7954; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 7955; SKIP-CACHE-INV: ; %bb.0: ; %entry 7956; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7957; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 7958; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7959; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7960; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7961; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7962; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 7963; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7964; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7965; SKIP-CACHE-INV-NEXT: s_endpgm 7966; 7967; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 7968; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7969; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7970; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 7971; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7972; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7973; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7974; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 7975; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7976; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7977; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7978; 7979; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_one_as_acquire_ret_atomicrmw: 7980; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7981; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7982; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 7983; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7984; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7985; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 7986; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 7987; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7988; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7989; GFX90A-TGSPLIT-NEXT: s_endpgm 7990; 7991; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 7992; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7993; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 7994; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 7995; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7996; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 7997; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 7998; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 7999; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8000; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8001; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8002; 8003; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 8004; GFX940-TGSPLIT: ; %bb.0: ; %entry 8005; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8006; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 8007; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8008; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 8009; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 8010; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 8011; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8012; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8013; GFX940-TGSPLIT-NEXT: s_endpgm 8014; 8015; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 8016; GFX11-WGP: ; %bb.0: ; %entry 8017; GFX11-WGP-NEXT: s_clause 0x1 8018; GFX11-WGP-NEXT: s_load_b64 s[2:3], 
s[0:1], 0x0 8019; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 8020; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8021; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8022; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 8023; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 8024; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8025; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 8026; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8027; GFX11-WGP-NEXT: s_endpgm 8028; 8029; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 8030; GFX11-CU: ; %bb.0: ; %entry 8031; GFX11-CU-NEXT: s_clause 0x1 8032; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8033; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 8034; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8035; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8036; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 8037; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 8038; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8039; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 8040; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8041; GFX11-CU-NEXT: s_endpgm 8042 i32* %out, i32 %in) { 8043entry: 8044 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire 8045 store i32 %val, i32* %out, align 4 8046 ret void 8047} 8048 8049define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( 8050; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 8051; GFX7: ; %bb.0: ; %entry 8052; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8053; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 8054; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8055; GFX7-NEXT: v_mov_b32_e32 v0, s0 8056; GFX7-NEXT: v_mov_b32_e32 v1, s1 8057; GFX7-NEXT: v_mov_b32_e32 v2, s2 8058; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 8059; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8060; GFX7-NEXT: flat_store_dword v[0:1], v2 8061; GFX7-NEXT: s_endpgm 8062; 8063; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 8064; GFX10-WGP: 
; %bb.0: ; %entry 8065; GFX10-WGP-NEXT: s_clause 0x1 8066; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8067; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 8068; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8069; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8070; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8071; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8072; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 8073; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8074; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8075; GFX10-WGP-NEXT: s_endpgm 8076; 8077; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 8078; GFX10-CU: ; %bb.0: ; %entry 8079; GFX10-CU-NEXT: s_clause 0x1 8080; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8081; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 8082; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8083; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8084; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8085; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 8086; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 8087; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8088; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8089; GFX10-CU-NEXT: s_endpgm 8090; 8091; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 8092; SKIP-CACHE-INV: ; %bb.0: ; %entry 8093; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8094; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 8095; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8096; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8097; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8098; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8099; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 8100; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8101; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8102; SKIP-CACHE-INV-NEXT: s_endpgm 8103; 8104; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 8105; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8106; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8107; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s2, s[4:5], 0x8 8108; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8109; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8110; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 8111; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 8112; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8113; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8114; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8115; 8116; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 8117; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8118; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8119; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 8120; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8121; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8122; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 8123; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 8124; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8125; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8126; GFX90A-TGSPLIT-NEXT: s_endpgm 8127; 8128; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 8129; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 8130; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8131; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 8132; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8133; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 8134; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 8135; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 8136; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8137; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8138; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8139; 8140; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 8141; GFX940-TGSPLIT: ; %bb.0: ; %entry 8142; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8143; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 8144; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8145; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] 8146; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 8147; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 8148; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8149; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8150; GFX940-TGSPLIT-NEXT: s_endpgm 8151; 8152; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 8153; GFX11-WGP: ; %bb.0: ; %entry 8154; GFX11-WGP-NEXT: s_clause 0x1 8155; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8156; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 8157; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8158; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8159; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 8160; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 8161; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8162; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 8163; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8164; GFX11-WGP-NEXT: s_endpgm 8165; 8166; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 8167; GFX11-CU: ; %bb.0: ; %entry 8168; GFX11-CU-NEXT: s_clause 0x1 8169; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8170; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 8171; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8172; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8173; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 8174; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 8175; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8176; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 8177; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8178; GFX11-CU-NEXT: s_endpgm 8179 i32* %out, i32 %in) { 8180entry: 8181 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel 8182 store i32 %val, i32* %out, align 4 8183 ret void 8184} 8185 8186define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( 8187; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 8188; GFX7: ; %bb.0: ; %entry 8189; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 8190; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 8191; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8192; GFX7-NEXT: v_mov_b32_e32 v0, s0 8193; GFX7-NEXT: v_mov_b32_e32 v1, s1 8194; GFX7-NEXT: v_mov_b32_e32 v2, s2 8195; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 8196; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8197; GFX7-NEXT: flat_store_dword v[0:1], v2 8198; GFX7-NEXT: s_endpgm 8199; 8200; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 8201; GFX10-WGP: ; %bb.0: ; %entry 8202; GFX10-WGP-NEXT: s_clause 0x1 8203; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8204; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 8205; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8206; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8207; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8208; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8209; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 8210; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8211; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8212; GFX10-WGP-NEXT: s_endpgm 8213; 8214; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 8215; GFX10-CU: ; %bb.0: ; %entry 8216; GFX10-CU-NEXT: s_clause 0x1 8217; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8218; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 8219; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8220; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8221; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8222; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 8223; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 8224; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8225; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8226; GFX10-CU-NEXT: s_endpgm 8227; 8228; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 8229; SKIP-CACHE-INV: ; %bb.0: ; %entry 8230; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8231; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 8232; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8233; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8234; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8235; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8236; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 8237; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8238; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8239; SKIP-CACHE-INV-NEXT: s_endpgm 8240; 8241; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 8242; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8243; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8244; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 8245; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8246; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8247; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 8248; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 8249; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8250; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8251; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8252; 8253; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 8254; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8255; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8256; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 8257; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8258; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8259; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 8260; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 8261; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8262; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8263; GFX90A-TGSPLIT-NEXT: s_endpgm 8264; 8265; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 8266; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 8267; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8268; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 8269; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8270; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 8271; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 8272; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 
8273; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8274; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8275; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8276; 8277; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 8278; GFX940-TGSPLIT: ; %bb.0: ; %entry 8279; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8280; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 8281; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8282; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 8283; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 8284; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 8285; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8286; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8287; GFX940-TGSPLIT-NEXT: s_endpgm 8288; 8289; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 8290; GFX11-WGP: ; %bb.0: ; %entry 8291; GFX11-WGP-NEXT: s_clause 0x1 8292; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8293; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 8294; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8295; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8296; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 8297; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 8298; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8299; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 8300; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8301; GFX11-WGP-NEXT: s_endpgm 8302; 8303; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 8304; GFX11-CU: ; %bb.0: ; %entry 8305; GFX11-CU-NEXT: s_clause 0x1 8306; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8307; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 8308; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8309; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8310; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 8311; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 8312; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8313; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 8314; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) 8315; GFX11-CU-NEXT: s_endpgm 8316 i32* %out, i32 %in) { 8317entry: 8318 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst 8319 store i32 %val, i32* %out, align 4 8320 ret void 8321} 8322 8323define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( 8324; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 8325; GFX7: ; %bb.0: ; %entry 8326; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8327; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 8328; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8329; GFX7-NEXT: s_add_u32 s0, s0, 16 8330; GFX7-NEXT: s_addc_u32 s1, s1, 0 8331; GFX7-NEXT: v_mov_b32_e32 v0, s0 8332; GFX7-NEXT: v_mov_b32_e32 v2, s2 8333; GFX7-NEXT: v_mov_b32_e32 v1, s1 8334; GFX7-NEXT: v_mov_b32_e32 v3, s3 8335; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8336; GFX7-NEXT: s_endpgm 8337; 8338; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 8339; GFX10-WGP: ; %bb.0: ; %entry 8340; GFX10-WGP-NEXT: s_clause 0x1 8341; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8342; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8343; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8344; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 8345; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 8346; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8347; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8348; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8349; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 8350; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8351; GFX10-WGP-NEXT: s_endpgm 8352; 8353; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 8354; GFX10-CU: ; %bb.0: ; %entry 8355; GFX10-CU-NEXT: s_clause 0x1 8356; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8357; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8358; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8359; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 8360; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 8361; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8362; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, s2 8363; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8364; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 8365; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8366; GFX10-CU-NEXT: s_endpgm 8367; 8368; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 8369; SKIP-CACHE-INV: ; %bb.0: ; %entry 8370; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8371; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 8372; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8373; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 8374; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 8375; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8376; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 8377; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8378; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8379; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8380; SKIP-CACHE-INV-NEXT: s_endpgm 8381; 8382; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 8383; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8384; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8385; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8386; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8387; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8388; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8389; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8390; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8391; 8392; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 8393; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8394; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8395; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8396; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8397; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8398; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8399; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8400; 
GFX90A-TGSPLIT-NEXT: s_endpgm 8401; 8402; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 8403; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 8404; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8405; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 8406; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8407; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 8408; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 8409; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8410; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8411; 8412; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 8413; GFX940-TGSPLIT: ; %bb.0: ; %entry 8414; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8415; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 8416; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8417; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 8418; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 8419; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8420; GFX940-TGSPLIT-NEXT: s_endpgm 8421; 8422; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 8423; GFX11-WGP: ; %bb.0: ; %entry 8424; GFX11-WGP-NEXT: s_clause 0x1 8425; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8426; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 8427; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8428; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8429; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 8430; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 8431; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8432; GFX11-WGP-NEXT: s_endpgm 8433; 8434; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 8435; GFX11-CU: ; %bb.0: ; %entry 8436; GFX11-CU-NEXT: s_clause 0x1 8437; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8438; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 8439; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8440; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8441; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 8442; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 8443; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8444; GFX11-CU-NEXT: s_endpgm 8445 i32* %out, i32 %in, i32 %old) { 8446entry: 8447 %gep = getelementptr i32, i32* %out, i32 4 8448 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic 8449 ret void 8450} 8451 8452define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( 8453; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 8454; GFX7: ; %bb.0: ; %entry 8455; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8456; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 8457; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8458; GFX7-NEXT: s_add_u32 s0, s0, 16 8459; GFX7-NEXT: s_addc_u32 s1, s1, 0 8460; GFX7-NEXT: v_mov_b32_e32 v0, s0 8461; GFX7-NEXT: v_mov_b32_e32 v2, s2 8462; GFX7-NEXT: v_mov_b32_e32 v1, s1 8463; GFX7-NEXT: v_mov_b32_e32 v3, s3 8464; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8465; GFX7-NEXT: s_endpgm 8466; 8467; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 8468; GFX10-WGP: ; %bb.0: ; %entry 8469; GFX10-WGP-NEXT: s_clause 0x1 8470; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8471; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8472; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8473; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 8474; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 8475; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8476; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8477; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8478; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 8479; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8480; GFX10-WGP-NEXT: s_endpgm 8481; 8482; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 8483; GFX10-CU: ; %bb.0: ; %entry 8484; GFX10-CU-NEXT: s_clause 0x1 8485; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 8486; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8487; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8488; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 8489; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 8490; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8491; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 8492; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8493; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 8494; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8495; GFX10-CU-NEXT: s_endpgm 8496; 8497; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 8498; SKIP-CACHE-INV: ; %bb.0: ; %entry 8499; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8500; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 8501; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8502; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 8503; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 8504; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8505; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 8506; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8507; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8508; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8509; SKIP-CACHE-INV-NEXT: s_endpgm 8510; 8511; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 8512; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8513; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8514; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8515; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8516; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8517; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8518; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8519; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8520; 8521; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 8522; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8523; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8524; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8525; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) 8526; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8527; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8528; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8529; GFX90A-TGSPLIT-NEXT: s_endpgm 8530; 8531; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 8532; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 8533; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8534; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 8535; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8536; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 8537; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 8538; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8539; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8540; 8541; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 8542; GFX940-TGSPLIT: ; %bb.0: ; %entry 8543; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8544; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 8545; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8546; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 8547; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 8548; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8549; GFX940-TGSPLIT-NEXT: s_endpgm 8550; 8551; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 8552; GFX11-WGP: ; %bb.0: ; %entry 8553; GFX11-WGP-NEXT: s_clause 0x1 8554; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8555; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 8556; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8557; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8558; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 8559; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 8560; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8561; GFX11-WGP-NEXT: s_endpgm 8562; 8563; GFX11-CU-LABEL: 
flat_singlethread_one_as_acquire_monotonic_cmpxchg: 8564; GFX11-CU: ; %bb.0: ; %entry 8565; GFX11-CU-NEXT: s_clause 0x1 8566; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8567; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 8568; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8569; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8570; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 8571; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 8572; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8573; GFX11-CU-NEXT: s_endpgm 8574 i32* %out, i32 %in, i32 %old) { 8575entry: 8576 %gep = getelementptr i32, i32* %out, i32 4 8577 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic 8578 ret void 8579} 8580 8581define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( 8582; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 8583; GFX7: ; %bb.0: ; %entry 8584; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8585; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 8586; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8587; GFX7-NEXT: s_add_u32 s0, s0, 16 8588; GFX7-NEXT: s_addc_u32 s1, s1, 0 8589; GFX7-NEXT: v_mov_b32_e32 v0, s0 8590; GFX7-NEXT: v_mov_b32_e32 v2, s2 8591; GFX7-NEXT: v_mov_b32_e32 v1, s1 8592; GFX7-NEXT: v_mov_b32_e32 v3, s3 8593; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8594; GFX7-NEXT: s_endpgm 8595; 8596; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 8597; GFX10-WGP: ; %bb.0: ; %entry 8598; GFX10-WGP-NEXT: s_clause 0x1 8599; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8600; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8601; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8602; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 8603; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 8604; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8605; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8606; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8607; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 8608; 
GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8609; GFX10-WGP-NEXT: s_endpgm 8610; 8611; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 8612; GFX10-CU: ; %bb.0: ; %entry 8613; GFX10-CU-NEXT: s_clause 0x1 8614; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8615; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8616; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8617; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 8618; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 8619; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8620; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 8621; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8622; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 8623; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8624; GFX10-CU-NEXT: s_endpgm 8625; 8626; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 8627; SKIP-CACHE-INV: ; %bb.0: ; %entry 8628; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8629; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 8630; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8631; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 8632; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 8633; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8634; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 8635; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8636; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8637; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8638; SKIP-CACHE-INV-NEXT: s_endpgm 8639; 8640; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 8641; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8642; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8643; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8644; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8645; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8646; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8647; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8648; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8649; 
8650; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 8651; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8652; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8653; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8654; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8655; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8656; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8657; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8658; GFX90A-TGSPLIT-NEXT: s_endpgm 8659; 8660; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 8661; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 8662; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8663; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 8664; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8665; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 8666; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 8667; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8668; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8669; 8670; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 8671; GFX940-TGSPLIT: ; %bb.0: ; %entry 8672; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8673; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 8674; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8675; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 8676; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 8677; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8678; GFX940-TGSPLIT-NEXT: s_endpgm 8679; 8680; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 8681; GFX11-WGP: ; %bb.0: ; %entry 8682; GFX11-WGP-NEXT: s_clause 0x1 8683; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8684; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 8685; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8686; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8687; 
GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 8688; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 8689; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8690; GFX11-WGP-NEXT: s_endpgm 8691; 8692; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 8693; GFX11-CU: ; %bb.0: ; %entry 8694; GFX11-CU-NEXT: s_clause 0x1 8695; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8696; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 8697; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8698; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8699; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 8700; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 8701; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8702; GFX11-CU-NEXT: s_endpgm 8703 i32* %out, i32 %in, i32 %old) { 8704entry: 8705 %gep = getelementptr i32, i32* %out, i32 4 8706 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic 8707 ret void 8708} 8709 8710define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( 8711; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 8712; GFX7: ; %bb.0: ; %entry 8713; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8714; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 8715; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8716; GFX7-NEXT: s_add_u32 s0, s0, 16 8717; GFX7-NEXT: s_addc_u32 s1, s1, 0 8718; GFX7-NEXT: v_mov_b32_e32 v0, s0 8719; GFX7-NEXT: v_mov_b32_e32 v2, s2 8720; GFX7-NEXT: v_mov_b32_e32 v1, s1 8721; GFX7-NEXT: v_mov_b32_e32 v3, s3 8722; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8723; GFX7-NEXT: s_endpgm 8724; 8725; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 8726; GFX10-WGP: ; %bb.0: ; %entry 8727; GFX10-WGP-NEXT: s_clause 0x1 8728; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8729; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8730; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8731; 
GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 8732; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 8733; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8734; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8735; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8736; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 8737; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8738; GFX10-WGP-NEXT: s_endpgm 8739; 8740; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 8741; GFX10-CU: ; %bb.0: ; %entry 8742; GFX10-CU-NEXT: s_clause 0x1 8743; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8744; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8745; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8746; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 8747; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 8748; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8749; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 8750; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8751; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 8752; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8753; GFX10-CU-NEXT: s_endpgm 8754; 8755; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 8756; SKIP-CACHE-INV: ; %bb.0: ; %entry 8757; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8758; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 8759; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8760; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 8761; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 8762; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8763; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 8764; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8765; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8766; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8767; SKIP-CACHE-INV-NEXT: s_endpgm 8768; 8769; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 8770; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8771; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8772; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8773; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8774; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8775; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8776; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8777; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8778; 8779; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 8780; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8781; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8782; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8783; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8784; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8785; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8786; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8787; GFX90A-TGSPLIT-NEXT: s_endpgm 8788; 8789; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 8790; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 8791; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8792; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 8793; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8794; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 8795; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 8796; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8797; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8798; 8799; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 8800; GFX940-TGSPLIT: ; %bb.0: ; %entry 8801; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8802; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 8803; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8804; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 8805; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 8806; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8807; GFX940-TGSPLIT-NEXT: s_endpgm 8808; 8809; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 8810; GFX11-WGP: ; %bb.0: ; 
%entry 8811; GFX11-WGP-NEXT: s_clause 0x1 8812; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8813; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 8814; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8815; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8816; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 8817; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 8818; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8819; GFX11-WGP-NEXT: s_endpgm 8820; 8821; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 8822; GFX11-CU: ; %bb.0: ; %entry 8823; GFX11-CU-NEXT: s_clause 0x1 8824; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8825; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 8826; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8827; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8828; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 8829; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 8830; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8831; GFX11-CU-NEXT: s_endpgm 8832 i32* %out, i32 %in, i32 %old) { 8833entry: 8834 %gep = getelementptr i32, i32* %out, i32 4 8835 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic 8836 ret void 8837} 8838 8839define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( 8840; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 8841; GFX7: ; %bb.0: ; %entry 8842; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8843; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 8844; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8845; GFX7-NEXT: s_add_u32 s0, s0, 16 8846; GFX7-NEXT: s_addc_u32 s1, s1, 0 8847; GFX7-NEXT: v_mov_b32_e32 v0, s0 8848; GFX7-NEXT: v_mov_b32_e32 v2, s2 8849; GFX7-NEXT: v_mov_b32_e32 v1, s1 8850; GFX7-NEXT: v_mov_b32_e32 v3, s3 8851; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8852; GFX7-NEXT: s_endpgm 8853; 8854; GFX10-WGP-LABEL: 
flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 8855; GFX10-WGP: ; %bb.0: ; %entry 8856; GFX10-WGP-NEXT: s_clause 0x1 8857; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8858; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8859; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8860; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 8861; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 8862; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8863; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8864; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8865; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 8866; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8867; GFX10-WGP-NEXT: s_endpgm 8868; 8869; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 8870; GFX10-CU: ; %bb.0: ; %entry 8871; GFX10-CU-NEXT: s_clause 0x1 8872; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8873; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8874; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8875; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 8876; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 8877; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8878; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 8879; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8880; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 8881; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8882; GFX10-CU-NEXT: s_endpgm 8883; 8884; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 8885; SKIP-CACHE-INV: ; %bb.0: ; %entry 8886; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8887; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 8888; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8889; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 8890; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 8891; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8892; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 8893; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8894; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8895; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8896; SKIP-CACHE-INV-NEXT: s_endpgm 8897; 8898; GFX90A-NOTTGSPLIT-LABEL: 
flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 8899; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8900; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8901; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8902; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8903; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8904; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8905; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8906; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8907; 8908; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 8909; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8910; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8911; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8912; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8913; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8914; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8915; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8916; GFX90A-TGSPLIT-NEXT: s_endpgm 8917; 8918; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 8919; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 8920; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8921; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 8922; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8923; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 8924; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 8925; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8926; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8927; 8928; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 8929; GFX940-TGSPLIT: ; %bb.0: ; %entry 8930; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 8931; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 8932; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8933; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[2:3] 8934; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 8935; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 8936; GFX940-TGSPLIT-NEXT: s_endpgm 8937; 8938; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 8939; GFX11-WGP: ; %bb.0: ; %entry 8940; GFX11-WGP-NEXT: s_clause 0x1 8941; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8942; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 8943; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8944; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8945; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 8946; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 8947; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8948; GFX11-WGP-NEXT: s_endpgm 8949; 8950; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 8951; GFX11-CU: ; %bb.0: ; %entry 8952; GFX11-CU-NEXT: s_clause 0x1 8953; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 8954; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 8955; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8956; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 8957; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 8958; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 8959; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 8960; GFX11-CU-NEXT: s_endpgm 8961 i32* %out, i32 %in, i32 %old) { 8962entry: 8963 %gep = getelementptr i32, i32* %out, i32 4 8964 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic 8965 ret void 8966} 8967 8968define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( 8969; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 8970; GFX7: ; %bb.0: ; %entry 8971; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8972; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 8973; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8974; GFX7-NEXT: s_add_u32 s0, s0, 16 8975; GFX7-NEXT: s_addc_u32 s1, s1, 0 8976; 
GFX7-NEXT: v_mov_b32_e32 v0, s0 8977; GFX7-NEXT: v_mov_b32_e32 v2, s2 8978; GFX7-NEXT: v_mov_b32_e32 v1, s1 8979; GFX7-NEXT: v_mov_b32_e32 v3, s3 8980; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8981; GFX7-NEXT: s_endpgm 8982; 8983; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 8984; GFX10-WGP: ; %bb.0: ; %entry 8985; GFX10-WGP-NEXT: s_clause 0x1 8986; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8987; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8988; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8989; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 8990; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 8991; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8992; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8993; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8994; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 8995; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 8996; GFX10-WGP-NEXT: s_endpgm 8997; 8998; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 8999; GFX10-CU: ; %bb.0: ; %entry 9000; GFX10-CU-NEXT: s_clause 0x1 9001; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9002; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9003; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9004; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 9005; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 9006; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 9007; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 9008; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 9009; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 9010; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9011; GFX10-CU-NEXT: s_endpgm 9012; 9013; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 9014; SKIP-CACHE-INV: ; %bb.0: ; %entry 9015; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9016; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 9017; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9018; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 9019; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 9020; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9021; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v3, s1 9022; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9023; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 9024; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9025; SKIP-CACHE-INV-NEXT: s_endpgm 9026; 9027; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 9028; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9029; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9030; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9031; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9032; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9033; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9034; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9035; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9036; 9037; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 9038; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9039; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9040; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9041; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9042; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9043; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9044; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9045; GFX90A-TGSPLIT-NEXT: s_endpgm 9046; 9047; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 9048; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 9049; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9050; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9051; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9052; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9053; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9054; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9055; GFX940-NOTTGSPLIT-NEXT: s_endpgm 9056; 9057; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 9058; 
GFX940-TGSPLIT: ; %bb.0: ; %entry 9059; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9060; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9061; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9062; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9063; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9064; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9065; GFX940-TGSPLIT-NEXT: s_endpgm 9066; 9067; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 9068; GFX11-WGP: ; %bb.0: ; %entry 9069; GFX11-WGP-NEXT: s_clause 0x1 9070; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9071; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9072; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 9073; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9074; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9075; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9076; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9077; GFX11-WGP-NEXT: s_endpgm 9078; 9079; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 9080; GFX11-CU: ; %bb.0: ; %entry 9081; GFX11-CU-NEXT: s_clause 0x1 9082; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9083; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9084; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9085; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9086; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9087; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9088; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9089; GFX11-CU-NEXT: s_endpgm 9090 i32* %out, i32 %in, i32 %old) { 9091entry: 9092 %gep = getelementptr i32, i32* %out, i32 4 9093 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire 9094 ret void 9095} 9096 9097define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( 9098; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 9099; 
GFX7: ; %bb.0: ; %entry 9100; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9101; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 9102; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9103; GFX7-NEXT: s_add_u32 s0, s0, 16 9104; GFX7-NEXT: s_addc_u32 s1, s1, 0 9105; GFX7-NEXT: v_mov_b32_e32 v0, s0 9106; GFX7-NEXT: v_mov_b32_e32 v2, s2 9107; GFX7-NEXT: v_mov_b32_e32 v1, s1 9108; GFX7-NEXT: v_mov_b32_e32 v3, s3 9109; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9110; GFX7-NEXT: s_endpgm 9111; 9112; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 9113; GFX10-WGP: ; %bb.0: ; %entry 9114; GFX10-WGP-NEXT: s_clause 0x1 9115; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9116; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9117; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9118; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 9119; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 9120; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 9121; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 9122; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 9123; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 9124; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9125; GFX10-WGP-NEXT: s_endpgm 9126; 9127; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 9128; GFX10-CU: ; %bb.0: ; %entry 9129; GFX10-CU-NEXT: s_clause 0x1 9130; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9131; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9132; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9133; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 9134; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 9135; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 9136; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 9137; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 9138; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 9139; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9140; GFX10-CU-NEXT: s_endpgm 9141; 9142; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 9143; SKIP-CACHE-INV: ; %bb.0: ; %entry 9144; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9145; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x2 9146; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9147; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 9148; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 9149; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9150; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 9151; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9152; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 9153; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9154; SKIP-CACHE-INV-NEXT: s_endpgm 9155; 9156; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 9157; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9158; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9159; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9160; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9161; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9162; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9163; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9164; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9165; 9166; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 9167; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9168; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9169; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9170; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9171; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9172; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9173; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9174; GFX90A-TGSPLIT-NEXT: s_endpgm 9175; 9176; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 9177; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 9178; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9179; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9180; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9181; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9182; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 
v[2:3], s[4:5] 9183; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9184; GFX940-NOTTGSPLIT-NEXT: s_endpgm 9185; 9186; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 9187; GFX940-TGSPLIT: ; %bb.0: ; %entry 9188; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9189; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9190; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9191; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9192; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9193; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9194; GFX940-TGSPLIT-NEXT: s_endpgm 9195; 9196; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 9197; GFX11-WGP: ; %bb.0: ; %entry 9198; GFX11-WGP-NEXT: s_clause 0x1 9199; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9200; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9201; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 9202; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9203; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9204; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9205; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9206; GFX11-WGP-NEXT: s_endpgm 9207; 9208; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 9209; GFX11-CU: ; %bb.0: ; %entry 9210; GFX11-CU-NEXT: s_clause 0x1 9211; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9212; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9213; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9214; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9215; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9216; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9217; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9218; GFX11-CU-NEXT: s_endpgm 9219 i32* %out, i32 %in, i32 %old) { 9220entry: 9221 %gep = getelementptr i32, i32* %out, i32 4 9222 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in 
syncscope("singlethread-one-as") acquire acquire 9223 ret void 9224} 9225 9226define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( 9227; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 9228; GFX7: ; %bb.0: ; %entry 9229; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9230; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 9231; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9232; GFX7-NEXT: s_add_u32 s0, s0, 16 9233; GFX7-NEXT: s_addc_u32 s1, s1, 0 9234; GFX7-NEXT: v_mov_b32_e32 v0, s0 9235; GFX7-NEXT: v_mov_b32_e32 v2, s2 9236; GFX7-NEXT: v_mov_b32_e32 v1, s1 9237; GFX7-NEXT: v_mov_b32_e32 v3, s3 9238; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9239; GFX7-NEXT: s_endpgm 9240; 9241; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 9242; GFX10-WGP: ; %bb.0: ; %entry 9243; GFX10-WGP-NEXT: s_clause 0x1 9244; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9245; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9246; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9247; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 9248; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 9249; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 9250; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 9251; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 9252; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 9253; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9254; GFX10-WGP-NEXT: s_endpgm 9255; 9256; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 9257; GFX10-CU: ; %bb.0: ; %entry 9258; GFX10-CU-NEXT: s_clause 0x1 9259; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9260; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9261; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9262; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 9263; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 9264; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 9265; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 9266; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 9267; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 9268; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9269; GFX10-CU-NEXT: s_endpgm 9270; 
9271; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 9272; SKIP-CACHE-INV: ; %bb.0: ; %entry 9273; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9274; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 9275; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9276; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 9277; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 9278; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9279; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 9280; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9281; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 9282; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9283; SKIP-CACHE-INV-NEXT: s_endpgm 9284; 9285; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 9286; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9287; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9288; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9289; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9290; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9291; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9292; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9293; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9294; 9295; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 9296; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9297; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9298; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9299; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9300; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9301; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9302; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9303; GFX90A-TGSPLIT-NEXT: s_endpgm 9304; 9305; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 9306; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 9307; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x0 9308; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9309; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9310; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9311; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9312; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9313; GFX940-NOTTGSPLIT-NEXT: s_endpgm 9314; 9315; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 9316; GFX940-TGSPLIT: ; %bb.0: ; %entry 9317; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9318; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9319; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9320; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9321; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9322; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9323; GFX940-TGSPLIT-NEXT: s_endpgm 9324; 9325; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 9326; GFX11-WGP: ; %bb.0: ; %entry 9327; GFX11-WGP-NEXT: s_clause 0x1 9328; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9329; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9330; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 9331; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9332; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9333; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9334; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9335; GFX11-WGP-NEXT: s_endpgm 9336; 9337; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 9338; GFX11-CU: ; %bb.0: ; %entry 9339; GFX11-CU-NEXT: s_clause 0x1 9340; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9341; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9342; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9343; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9344; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9345; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9346; GFX11-CU-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9347; GFX11-CU-NEXT: s_endpgm 9348 i32* %out, i32 %in, i32 %old) { 9349entry: 9350 %gep = getelementptr i32, i32* %out, i32 4 9351 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire 9352 ret void 9353} 9354 9355define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( 9356; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 9357; GFX7: ; %bb.0: ; %entry 9358; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9359; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 9360; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9361; GFX7-NEXT: s_add_u32 s0, s0, 16 9362; GFX7-NEXT: s_addc_u32 s1, s1, 0 9363; GFX7-NEXT: v_mov_b32_e32 v0, s0 9364; GFX7-NEXT: v_mov_b32_e32 v2, s2 9365; GFX7-NEXT: v_mov_b32_e32 v1, s1 9366; GFX7-NEXT: v_mov_b32_e32 v3, s3 9367; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9368; GFX7-NEXT: s_endpgm 9369; 9370; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 9371; GFX10-WGP: ; %bb.0: ; %entry 9372; GFX10-WGP-NEXT: s_clause 0x1 9373; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9374; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9375; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9376; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 9377; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 9378; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 9379; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 9380; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 9381; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 9382; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9383; GFX10-WGP-NEXT: s_endpgm 9384; 9385; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 9386; GFX10-CU: ; %bb.0: ; %entry 9387; GFX10-CU-NEXT: s_clause 0x1 9388; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9389; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9390; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9391; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 9392; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 9393; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 
9394; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 9395; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 9396; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 9397; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9398; GFX10-CU-NEXT: s_endpgm 9399; 9400; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 9401; SKIP-CACHE-INV: ; %bb.0: ; %entry 9402; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9403; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 9404; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9405; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 9406; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 9407; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9408; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 9409; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9410; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 9411; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9412; SKIP-CACHE-INV-NEXT: s_endpgm 9413; 9414; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 9415; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9416; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9417; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9418; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9419; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9420; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9421; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9422; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9423; 9424; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 9425; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9426; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9427; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9428; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9429; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9430; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9431; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 
9432; GFX90A-TGSPLIT-NEXT: s_endpgm 9433; 9434; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 9435; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 9436; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9437; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9438; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9439; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9440; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9441; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9442; GFX940-NOTTGSPLIT-NEXT: s_endpgm 9443; 9444; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 9445; GFX940-TGSPLIT: ; %bb.0: ; %entry 9446; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9447; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9448; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9449; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9450; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9451; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9452; GFX940-TGSPLIT-NEXT: s_endpgm 9453; 9454; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 9455; GFX11-WGP: ; %bb.0: ; %entry 9456; GFX11-WGP-NEXT: s_clause 0x1 9457; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9458; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9459; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 9460; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9461; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9462; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9463; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9464; GFX11-WGP-NEXT: s_endpgm 9465; 9466; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 9467; GFX11-CU: ; %bb.0: ; %entry 9468; GFX11-CU-NEXT: s_clause 0x1 9469; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9470; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9471; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9472; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9473; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9474; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9475; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9476; GFX11-CU-NEXT: s_endpgm 9477 i32* %out, i32 %in, i32 %old) { 9478entry: 9479 %gep = getelementptr i32, i32* %out, i32 4 9480 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire 9481 ret void 9482} 9483 9484define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( 9485; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 9486; GFX7: ; %bb.0: ; %entry 9487; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9488; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 9489; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9490; GFX7-NEXT: s_add_u32 s0, s0, 16 9491; GFX7-NEXT: s_addc_u32 s1, s1, 0 9492; GFX7-NEXT: v_mov_b32_e32 v0, s0 9493; GFX7-NEXT: v_mov_b32_e32 v2, s2 9494; GFX7-NEXT: v_mov_b32_e32 v1, s1 9495; GFX7-NEXT: v_mov_b32_e32 v3, s3 9496; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9497; GFX7-NEXT: s_endpgm 9498; 9499; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 9500; GFX10-WGP: ; %bb.0: ; %entry 9501; GFX10-WGP-NEXT: s_clause 0x1 9502; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9503; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9504; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9505; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 9506; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 9507; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 9508; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 9509; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 9510; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 9511; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9512; GFX10-WGP-NEXT: s_endpgm 9513; 9514; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 9515; GFX10-CU: ; %bb.0: ; %entry 9516; GFX10-CU-NEXT: s_clause 0x1 9517; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 
9518; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9519; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9520; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 9521; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 9522; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 9523; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 9524; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 9525; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 9526; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9527; GFX10-CU-NEXT: s_endpgm 9528; 9529; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 9530; SKIP-CACHE-INV: ; %bb.0: ; %entry 9531; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9532; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 9533; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9534; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 9535; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 9536; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9537; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 9538; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9539; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 9540; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9541; SKIP-CACHE-INV-NEXT: s_endpgm 9542; 9543; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 9544; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9545; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9546; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9547; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9548; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9549; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9550; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9551; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9552; 9553; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 9554; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9555; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9556; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9557; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
9558; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9559; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9560; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9561; GFX90A-TGSPLIT-NEXT: s_endpgm 9562; 9563; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 9564; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 9565; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9566; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9567; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9568; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9569; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9570; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9571; GFX940-NOTTGSPLIT-NEXT: s_endpgm 9572; 9573; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 9574; GFX940-TGSPLIT: ; %bb.0: ; %entry 9575; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9576; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9577; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9578; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9579; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9580; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9581; GFX940-TGSPLIT-NEXT: s_endpgm 9582; 9583; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 9584; GFX11-WGP: ; %bb.0: ; %entry 9585; GFX11-WGP-NEXT: s_clause 0x1 9586; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9587; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9588; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 9589; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9590; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9591; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9592; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9593; GFX11-WGP-NEXT: s_endpgm 9594; 9595; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 9596; 
GFX11-CU: ; %bb.0: ; %entry 9597; GFX11-CU-NEXT: s_clause 0x1 9598; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9599; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9600; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9601; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9602; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9603; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9604; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9605; GFX11-CU-NEXT: s_endpgm 9606 i32* %out, i32 %in, i32 %old) { 9607entry: 9608 %gep = getelementptr i32, i32* %out, i32 4 9609 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire 9610 ret void 9611} 9612 9613define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( 9614; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 9615; GFX7: ; %bb.0: ; %entry 9616; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9617; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 9618; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9619; GFX7-NEXT: s_add_u32 s0, s0, 16 9620; GFX7-NEXT: s_addc_u32 s1, s1, 0 9621; GFX7-NEXT: v_mov_b32_e32 v0, s0 9622; GFX7-NEXT: v_mov_b32_e32 v2, s2 9623; GFX7-NEXT: v_mov_b32_e32 v1, s1 9624; GFX7-NEXT: v_mov_b32_e32 v3, s3 9625; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9626; GFX7-NEXT: s_endpgm 9627; 9628; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 9629; GFX10-WGP: ; %bb.0: ; %entry 9630; GFX10-WGP-NEXT: s_clause 0x1 9631; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9632; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9633; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9634; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 9635; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 9636; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 9637; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 9638; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 9639; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 9640; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9641; 
GFX10-WGP-NEXT: s_endpgm 9642; 9643; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 9644; GFX10-CU: ; %bb.0: ; %entry 9645; GFX10-CU-NEXT: s_clause 0x1 9646; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9647; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9648; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9649; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 9650; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 9651; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 9652; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 9653; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 9654; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 9655; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9656; GFX10-CU-NEXT: s_endpgm 9657; 9658; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 9659; SKIP-CACHE-INV: ; %bb.0: ; %entry 9660; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9661; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 9662; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9663; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 9664; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 9665; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9666; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 9667; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9668; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 9669; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9670; SKIP-CACHE-INV-NEXT: s_endpgm 9671; 9672; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 9673; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9674; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9675; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9676; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9677; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9678; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9679; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9680; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9681; 9682; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 9683; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9684; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9685; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9686; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9687; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9688; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9689; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9690; GFX90A-TGSPLIT-NEXT: s_endpgm 9691; 9692; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 9693; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 9694; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9695; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9696; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9697; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9698; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9699; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9700; GFX940-NOTTGSPLIT-NEXT: s_endpgm 9701; 9702; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 9703; GFX940-TGSPLIT: ; %bb.0: ; %entry 9704; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9705; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9706; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9707; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9708; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9709; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9710; GFX940-TGSPLIT-NEXT: s_endpgm 9711; 9712; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 9713; GFX11-WGP: ; %bb.0: ; %entry 9714; GFX11-WGP-NEXT: s_clause 0x1 9715; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9716; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9717; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 9718; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9719; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 
:: v_dual_mov_b32 v2, s0 9720; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9721; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9722; GFX11-WGP-NEXT: s_endpgm 9723; 9724; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 9725; GFX11-CU: ; %bb.0: ; %entry 9726; GFX11-CU-NEXT: s_clause 0x1 9727; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9728; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9729; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9730; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9731; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9732; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9733; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9734; GFX11-CU-NEXT: s_endpgm 9735 i32* %out, i32 %in, i32 %old) { 9736entry: 9737 %gep = getelementptr i32, i32* %out, i32 4 9738 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst 9739 ret void 9740} 9741 9742define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( 9743; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 9744; GFX7: ; %bb.0: ; %entry 9745; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9746; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 9747; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9748; GFX7-NEXT: s_add_u32 s0, s0, 16 9749; GFX7-NEXT: s_addc_u32 s1, s1, 0 9750; GFX7-NEXT: v_mov_b32_e32 v0, s0 9751; GFX7-NEXT: v_mov_b32_e32 v2, s2 9752; GFX7-NEXT: v_mov_b32_e32 v1, s1 9753; GFX7-NEXT: v_mov_b32_e32 v3, s3 9754; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9755; GFX7-NEXT: s_endpgm 9756; 9757; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 9758; GFX10-WGP: ; %bb.0: ; %entry 9759; GFX10-WGP-NEXT: s_clause 0x1 9760; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9761; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9762; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9763; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 9764; 
GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 9765; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 9766; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 9767; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 9768; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 9769; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9770; GFX10-WGP-NEXT: s_endpgm 9771; 9772; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 9773; GFX10-CU: ; %bb.0: ; %entry 9774; GFX10-CU-NEXT: s_clause 0x1 9775; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9776; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9777; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9778; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 9779; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 9780; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 9781; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 9782; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 9783; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 9784; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9785; GFX10-CU-NEXT: s_endpgm 9786; 9787; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 9788; SKIP-CACHE-INV: ; %bb.0: ; %entry 9789; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9790; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 9791; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9792; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 9793; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 9794; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9795; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 9796; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9797; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 9798; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9799; SKIP-CACHE-INV-NEXT: s_endpgm 9800; 9801; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 9802; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9803; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9804; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9805; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9806; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] 9807; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9808; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9809; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9810; 9811; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 9812; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9813; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9814; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9815; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9816; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9817; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9818; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9819; GFX90A-TGSPLIT-NEXT: s_endpgm 9820; 9821; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 9822; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 9823; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9824; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9825; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9826; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9827; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9828; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9829; GFX940-NOTTGSPLIT-NEXT: s_endpgm 9830; 9831; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 9832; GFX940-TGSPLIT: ; %bb.0: ; %entry 9833; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9834; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9835; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9836; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9837; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9838; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9839; GFX940-TGSPLIT-NEXT: s_endpgm 9840; 9841; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 9842; GFX11-WGP: ; %bb.0: ; %entry 9843; GFX11-WGP-NEXT: s_clause 0x1 9844; GFX11-WGP-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x0 9845; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9846; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 9847; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9848; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9849; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9850; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9851; GFX11-WGP-NEXT: s_endpgm 9852; 9853; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 9854; GFX11-CU: ; %bb.0: ; %entry 9855; GFX11-CU-NEXT: s_clause 0x1 9856; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9857; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9858; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9859; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9860; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9861; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9862; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9863; GFX11-CU-NEXT: s_endpgm 9864 i32* %out, i32 %in, i32 %old) { 9865entry: 9866 %gep = getelementptr i32, i32* %out, i32 4 9867 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst 9868 ret void 9869} 9870 9871define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( 9872; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 9873; GFX7: ; %bb.0: ; %entry 9874; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9875; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 9876; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9877; GFX7-NEXT: s_add_u32 s0, s0, 16 9878; GFX7-NEXT: s_addc_u32 s1, s1, 0 9879; GFX7-NEXT: v_mov_b32_e32 v0, s0 9880; GFX7-NEXT: v_mov_b32_e32 v2, s2 9881; GFX7-NEXT: v_mov_b32_e32 v1, s1 9882; GFX7-NEXT: v_mov_b32_e32 v3, s3 9883; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9884; GFX7-NEXT: s_endpgm 9885; 9886; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 9887; GFX10-WGP: ; %bb.0: ; %entry 9888; 
GFX10-WGP-NEXT: s_clause 0x1 9889; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9890; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9891; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9892; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 9893; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 9894; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 9895; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 9896; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 9897; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 9898; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9899; GFX10-WGP-NEXT: s_endpgm 9900; 9901; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 9902; GFX10-CU: ; %bb.0: ; %entry 9903; GFX10-CU-NEXT: s_clause 0x1 9904; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9905; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9906; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9907; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 9908; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 9909; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 9910; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 9911; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 9912; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 9913; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9914; GFX10-CU-NEXT: s_endpgm 9915; 9916; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 9917; SKIP-CACHE-INV: ; %bb.0: ; %entry 9918; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9919; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 9920; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9921; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 9922; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 9923; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9924; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 9925; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9926; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 9927; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 9928; SKIP-CACHE-INV-NEXT: s_endpgm 9929; 9930; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 9931; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9932; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9933; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9934; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9935; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9936; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9937; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9938; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9939; 9940; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 9941; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9942; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9943; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9944; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9945; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9946; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9947; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9948; GFX90A-TGSPLIT-NEXT: s_endpgm 9949; 9950; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 9951; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 9952; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9953; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9954; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9955; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9956; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9957; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9958; GFX940-NOTTGSPLIT-NEXT: s_endpgm 9959; 9960; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 9961; GFX940-TGSPLIT: ; %bb.0: ; %entry 9962; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9963; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 9964; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9965; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9966; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 9967; GFX940-TGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 9968; GFX940-TGSPLIT-NEXT: s_endpgm 9969; 9970; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 9971; GFX11-WGP: ; %bb.0: ; %entry 9972; GFX11-WGP-NEXT: s_clause 0x1 9973; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9974; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9975; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 9976; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9977; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9978; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9979; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9980; GFX11-WGP-NEXT: s_endpgm 9981; 9982; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 9983; GFX11-CU: ; %bb.0: ; %entry 9984; GFX11-CU-NEXT: s_clause 0x1 9985; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 9986; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 9987; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9988; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 9989; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 9990; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 9991; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 9992; GFX11-CU-NEXT: s_endpgm 9993 i32* %out, i32 %in, i32 %old) { 9994entry: 9995 %gep = getelementptr i32, i32* %out, i32 4 9996 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst 9997 ret void 9998} 9999 10000define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( 10001; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 10002; GFX7: ; %bb.0: ; %entry 10003; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10004; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 10005; GFX7-NEXT: s_waitcnt lgkmcnt(0) 10006; GFX7-NEXT: s_add_u32 s0, s0, 16 10007; GFX7-NEXT: s_addc_u32 s1, s1, 0 10008; GFX7-NEXT: v_mov_b32_e32 v0, s0 10009; GFX7-NEXT: v_mov_b32_e32 v2, s2 10010; GFX7-NEXT: 
v_mov_b32_e32 v1, s1 10011; GFX7-NEXT: v_mov_b32_e32 v3, s3 10012; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 10013; GFX7-NEXT: s_endpgm 10014; 10015; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 10016; GFX10-WGP: ; %bb.0: ; %entry 10017; GFX10-WGP-NEXT: s_clause 0x1 10018; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10019; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10020; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 10021; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 10022; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 10023; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 10024; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 10025; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 10026; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 10027; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 10028; GFX10-WGP-NEXT: s_endpgm 10029; 10030; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 10031; GFX10-CU: ; %bb.0: ; %entry 10032; GFX10-CU-NEXT: s_clause 0x1 10033; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10034; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10035; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10036; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 10037; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 10038; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 10039; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 10040; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 10041; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 10042; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 10043; GFX10-CU-NEXT: s_endpgm 10044; 10045; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 10046; SKIP-CACHE-INV: ; %bb.0: ; %entry 10047; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10048; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 10049; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10050; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 10051; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 10052; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10053; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 10054; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 
v1, s3 10055; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 10056; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 10057; SKIP-CACHE-INV-NEXT: s_endpgm 10058; 10059; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 10060; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10061; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10062; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10063; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10064; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10065; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 10066; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 10067; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10068; 10069; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 10070; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10071; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10072; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10073; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10074; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10075; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 10076; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 10077; GFX90A-TGSPLIT-NEXT: s_endpgm 10078; 10079; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 10080; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10081; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10082; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 10083; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10084; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10085; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 10086; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 10087; GFX940-NOTTGSPLIT-NEXT: s_endpgm 10088; 10089; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 10090; GFX940-TGSPLIT: ; %bb.0: ; %entry 
10091; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10092; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 10093; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10094; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10095; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 10096; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 10097; GFX940-TGSPLIT-NEXT: s_endpgm 10098; 10099; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 10100; GFX11-WGP: ; %bb.0: ; %entry 10101; GFX11-WGP-NEXT: s_clause 0x1 10102; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 10103; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 10104; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 10105; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 10106; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 10107; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 10108; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 10109; GFX11-WGP-NEXT: s_endpgm 10110; 10111; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 10112; GFX11-CU: ; %bb.0: ; %entry 10113; GFX11-CU-NEXT: s_clause 0x1 10114; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 10115; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 10116; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 10117; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 10118; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 10119; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 10120; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 10121; GFX11-CU-NEXT: s_endpgm 10122 i32* %out, i32 %in, i32 %old) { 10123entry: 10124 %gep = getelementptr i32, i32* %out, i32 4 10125 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst 10126 ret void 10127} 10128 10129define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( 10130; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 10131; 
GFX7: ; %bb.0: ; %entry 10132; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10133; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 10134; GFX7-NEXT: s_waitcnt lgkmcnt(0) 10135; GFX7-NEXT: s_add_u32 s0, s0, 16 10136; GFX7-NEXT: s_addc_u32 s1, s1, 0 10137; GFX7-NEXT: v_mov_b32_e32 v0, s0 10138; GFX7-NEXT: v_mov_b32_e32 v2, s2 10139; GFX7-NEXT: v_mov_b32_e32 v1, s1 10140; GFX7-NEXT: v_mov_b32_e32 v3, s3 10141; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 10142; GFX7-NEXT: s_endpgm 10143; 10144; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 10145; GFX10-WGP: ; %bb.0: ; %entry 10146; GFX10-WGP-NEXT: s_clause 0x1 10147; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10148; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10149; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 10150; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 10151; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 10152; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 10153; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 10154; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 10155; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 10156; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 10157; GFX10-WGP-NEXT: s_endpgm 10158; 10159; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 10160; GFX10-CU: ; %bb.0: ; %entry 10161; GFX10-CU-NEXT: s_clause 0x1 10162; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10163; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10164; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10165; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 10166; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 10167; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 10168; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 10169; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 10170; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 10171; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 10172; GFX10-CU-NEXT: s_endpgm 10173; 10174; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 10175; SKIP-CACHE-INV: ; %bb.0: ; %entry 10176; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10177; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 10178; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10179; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 10180; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 10181; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10182; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 10183; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10184; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 10185; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 10186; SKIP-CACHE-INV-NEXT: s_endpgm 10187; 10188; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 10189; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10190; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10191; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10192; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10193; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10194; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 10195; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 10196; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10197; 10198; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 10199; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10200; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10201; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10202; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10203; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10204; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 10205; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 10206; GFX90A-TGSPLIT-NEXT: s_endpgm 10207; 10208; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 10209; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10210; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10211; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 10212; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10213; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10214; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 10215; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 10216; GFX940-NOTTGSPLIT-NEXT: s_endpgm 10217; 10218; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 10219; GFX940-TGSPLIT: ; %bb.0: ; %entry 10220; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10221; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 10222; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10223; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10224; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 10225; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 10226; GFX940-TGSPLIT-NEXT: s_endpgm 10227; 10228; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 10229; GFX11-WGP: ; %bb.0: ; %entry 10230; GFX11-WGP-NEXT: s_clause 0x1 10231; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 10232; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 10233; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 10234; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 10235; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 10236; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 10237; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 10238; GFX11-WGP-NEXT: s_endpgm 10239; 10240; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 10241; GFX11-CU: ; %bb.0: ; %entry 10242; GFX11-CU-NEXT: s_clause 0x1 10243; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 10244; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 10245; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 10246; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 10247; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 10248; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 10249; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 10250; GFX11-CU-NEXT: s_endpgm 10251 i32* %out, i32 %in, i32 %old) { 
10252entry: 10253 %gep = getelementptr i32, i32* %out, i32 4 10254 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst 10255 ret void 10256} 10257 10258define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( 10259; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 10260; GFX7: ; %bb.0: ; %entry 10261; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10262; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 10263; GFX7-NEXT: s_waitcnt lgkmcnt(0) 10264; GFX7-NEXT: s_add_u32 s4, s0, 16 10265; GFX7-NEXT: s_addc_u32 s5, s1, 0 10266; GFX7-NEXT: v_mov_b32_e32 v0, s4 10267; GFX7-NEXT: v_mov_b32_e32 v2, s2 10268; GFX7-NEXT: v_mov_b32_e32 v1, s5 10269; GFX7-NEXT: v_mov_b32_e32 v3, s3 10270; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10271; GFX7-NEXT: v_mov_b32_e32 v0, s0 10272; GFX7-NEXT: v_mov_b32_e32 v1, s1 10273; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10274; GFX7-NEXT: flat_store_dword v[0:1], v2 10275; GFX7-NEXT: s_endpgm 10276; 10277; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 10278; GFX10-WGP: ; %bb.0: ; %entry 10279; GFX10-WGP-NEXT: s_clause 0x1 10280; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10281; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10282; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 10283; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 10284; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 10285; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 10286; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 10287; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 10288; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 10289; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10290; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 10291; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 10292; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10293; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 10294; GFX10-WGP-NEXT: s_endpgm 10295; 10296; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 
10297; GFX10-CU: ; %bb.0: ; %entry 10298; GFX10-CU-NEXT: s_clause 0x1 10299; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10300; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10301; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10302; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 10303; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 10304; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 10305; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 10306; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 10307; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 10308; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10309; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 10310; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 10311; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10312; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 10313; GFX10-CU-NEXT: s_endpgm 10314; 10315; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 10316; SKIP-CACHE-INV: ; %bb.0: ; %entry 10317; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10318; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 10319; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10320; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 10321; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 10322; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 10323; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 10324; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 10325; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 10326; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10327; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10328; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10329; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10330; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 10331; SKIP-CACHE-INV-NEXT: s_endpgm 10332; 10333; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 10334; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10335; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10336; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10337; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) 10338; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10339; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 10340; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 10341; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10342; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10343; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10344; 10345; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 10346; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10347; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10348; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10349; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10350; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10351; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 10352; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 10353; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10354; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10355; GFX90A-TGSPLIT-NEXT: s_endpgm 10356; 10357; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 10358; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10359; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10360; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 10361; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10362; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10363; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 10364; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 10365; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10366; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10367; GFX940-NOTTGSPLIT-NEXT: s_endpgm 10368; 10369; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 10370; GFX940-TGSPLIT: ; %bb.0: ; %entry 10371; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 
10372; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 10373; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10374; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10375; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 10376; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 10377; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10378; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10379; GFX940-TGSPLIT-NEXT: s_endpgm 10380; 10381; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 10382; GFX11-WGP: ; %bb.0: ; %entry 10383; GFX11-WGP-NEXT: s_clause 0x1 10384; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 10385; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 10386; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 10387; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 10388; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 10389; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 10390; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10391; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 10392; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 10393; GFX11-WGP-NEXT: s_endpgm 10394; 10395; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 10396; GFX11-CU: ; %bb.0: ; %entry 10397; GFX11-CU-NEXT: s_clause 0x1 10398; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 10399; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 10400; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 10401; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 10402; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 10403; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 10404; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10405; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 10406; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 10407; GFX11-CU-NEXT: s_endpgm 10408 i32* %out, i32 %in, i32 %old) { 10409entry: 10410 %gep = getelementptr i32, i32* %out, i32 4 10411 %val = 
cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic 10412 %val0 = extractvalue { i32, i1 } %val, 0 10413 store i32 %val0, i32* %out, align 4 10414 ret void 10415} 10416 10417define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg( 10418; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 10419; GFX7: ; %bb.0: ; %entry 10420; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10421; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 10422; GFX7-NEXT: s_waitcnt lgkmcnt(0) 10423; GFX7-NEXT: s_add_u32 s4, s0, 16 10424; GFX7-NEXT: s_addc_u32 s5, s1, 0 10425; GFX7-NEXT: v_mov_b32_e32 v0, s4 10426; GFX7-NEXT: v_mov_b32_e32 v2, s2 10427; GFX7-NEXT: v_mov_b32_e32 v1, s5 10428; GFX7-NEXT: v_mov_b32_e32 v3, s3 10429; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10430; GFX7-NEXT: v_mov_b32_e32 v0, s0 10431; GFX7-NEXT: v_mov_b32_e32 v1, s1 10432; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10433; GFX7-NEXT: flat_store_dword v[0:1], v2 10434; GFX7-NEXT: s_endpgm 10435; 10436; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 10437; GFX10-WGP: ; %bb.0: ; %entry 10438; GFX10-WGP-NEXT: s_clause 0x1 10439; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10440; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10441; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 10442; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 10443; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 10444; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 10445; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 10446; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 10447; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 10448; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10449; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 10450; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 10451; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10452; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 10453; GFX10-WGP-NEXT: s_endpgm 10454; 10455; GFX10-CU-LABEL: 
flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 10456; GFX10-CU: ; %bb.0: ; %entry 10457; GFX10-CU-NEXT: s_clause 0x1 10458; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10459; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10460; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10461; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 10462; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 10463; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 10464; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 10465; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 10466; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 10467; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10468; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 10469; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 10470; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10471; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 10472; GFX10-CU-NEXT: s_endpgm 10473; 10474; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 10475; SKIP-CACHE-INV: ; %bb.0: ; %entry 10476; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10477; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 10478; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10479; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 10480; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 10481; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 10482; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 10483; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 10484; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 10485; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10486; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10487; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10488; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10489; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 10490; SKIP-CACHE-INV-NEXT: s_endpgm 10491; 10492; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 10493; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10494; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10495; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 10496; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10497; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10498; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 10499; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 10500; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10501; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10502; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10503; 10504; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 10505; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10506; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10507; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10508; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10509; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10510; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 10511; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 10512; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10513; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10514; GFX90A-TGSPLIT-NEXT: s_endpgm 10515; 10516; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 10517; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10518; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10519; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 10520; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10521; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10522; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 10523; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 10524; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10525; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10526; GFX940-NOTTGSPLIT-NEXT: s_endpgm 10527; 10528; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 10529; GFX940-TGSPLIT: ; %bb.0: ; %entry 10530; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10531; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 10532; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10533; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10534; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 10535; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 10536; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10537; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10538; GFX940-TGSPLIT-NEXT: s_endpgm 10539; 10540; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 10541; GFX11-WGP: ; %bb.0: ; %entry 10542; GFX11-WGP-NEXT: s_clause 0x1 10543; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 10544; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 10545; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 10546; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 10547; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 10548; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 10549; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10550; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 10551; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 10552; GFX11-WGP-NEXT: s_endpgm 10553; 10554; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 10555; GFX11-CU: ; %bb.0: ; %entry 10556; GFX11-CU-NEXT: s_clause 0x1 10557; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 10558; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 10559; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 10560; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 10561; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 10562; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 10563; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10564; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 10565; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 10566; GFX11-CU-NEXT: s_endpgm 10567 i32* %out, i32 %in, i32 %old) { 10568entry: 10569 
%gep = getelementptr i32, i32* %out, i32 4 10570 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic 10571 %val0 = extractvalue { i32, i1 } %val, 0 10572 store i32 %val0, i32* %out, align 4 10573 ret void 10574} 10575 10576define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg( 10577; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 10578; GFX7: ; %bb.0: ; %entry 10579; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10580; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 10581; GFX7-NEXT: s_waitcnt lgkmcnt(0) 10582; GFX7-NEXT: s_add_u32 s4, s0, 16 10583; GFX7-NEXT: s_addc_u32 s5, s1, 0 10584; GFX7-NEXT: v_mov_b32_e32 v0, s4 10585; GFX7-NEXT: v_mov_b32_e32 v2, s2 10586; GFX7-NEXT: v_mov_b32_e32 v1, s5 10587; GFX7-NEXT: v_mov_b32_e32 v3, s3 10588; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10589; GFX7-NEXT: v_mov_b32_e32 v0, s0 10590; GFX7-NEXT: v_mov_b32_e32 v1, s1 10591; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10592; GFX7-NEXT: flat_store_dword v[0:1], v2 10593; GFX7-NEXT: s_endpgm 10594; 10595; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 10596; GFX10-WGP: ; %bb.0: ; %entry 10597; GFX10-WGP-NEXT: s_clause 0x1 10598; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10599; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10600; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 10601; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 10602; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 10603; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 10604; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 10605; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 10606; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 10607; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10608; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 10609; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 10610; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10611; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 10612; GFX10-WGP-NEXT: s_endpgm 10613; 10614; 
GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 10615; GFX10-CU: ; %bb.0: ; %entry 10616; GFX10-CU-NEXT: s_clause 0x1 10617; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10618; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10619; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10620; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 10621; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 10622; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 10623; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 10624; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 10625; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 10626; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10627; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 10628; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 10629; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10630; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 10631; GFX10-CU-NEXT: s_endpgm 10632; 10633; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 10634; SKIP-CACHE-INV: ; %bb.0: ; %entry 10635; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10636; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 10637; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10638; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 10639; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 10640; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 10641; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 10642; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 10643; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 10644; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10645; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10646; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10647; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10648; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 10649; SKIP-CACHE-INV-NEXT: s_endpgm 10650; 10651; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 10652; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10653; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10654; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 10655; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10656; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10657; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 10658; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 10659; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10660; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10661; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10662; 10663; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 10664; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10665; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10666; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10667; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10668; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10669; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 10670; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 10671; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10672; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10673; GFX90A-TGSPLIT-NEXT: s_endpgm 10674; 10675; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 10676; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10677; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10678; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 10679; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10680; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10681; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 10682; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 10683; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10684; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10685; GFX940-NOTTGSPLIT-NEXT: s_endpgm 10686; 10687; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 10688; GFX940-TGSPLIT: ; %bb.0: ; %entry 
10689; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10690; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 10691; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10692; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10693; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 10694; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 10695; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10696; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10697; GFX940-TGSPLIT-NEXT: s_endpgm 10698; 10699; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 10700; GFX11-WGP: ; %bb.0: ; %entry 10701; GFX11-WGP-NEXT: s_clause 0x1 10702; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 10703; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 10704; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 10705; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 10706; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 10707; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 10708; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10709; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 10710; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 10711; GFX11-WGP-NEXT: s_endpgm 10712; 10713; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 10714; GFX11-CU: ; %bb.0: ; %entry 10715; GFX11-CU-NEXT: s_clause 0x1 10716; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 10717; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 10718; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 10719; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 10720; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 10721; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 10722; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10723; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 10724; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 10725; GFX11-CU-NEXT: s_endpgm 10726 i32* %out, i32 %in, i32 %old) { 10727entry: 
10728 %gep = getelementptr i32, i32* %out, i32 4 10729 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic 10730 %val0 = extractvalue { i32, i1 } %val, 0 10731 store i32 %val0, i32* %out, align 4 10732 ret void 10733} 10734 10735define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( 10736; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 10737; GFX7: ; %bb.0: ; %entry 10738; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10739; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 10740; GFX7-NEXT: s_waitcnt lgkmcnt(0) 10741; GFX7-NEXT: s_add_u32 s4, s0, 16 10742; GFX7-NEXT: s_addc_u32 s5, s1, 0 10743; GFX7-NEXT: v_mov_b32_e32 v0, s4 10744; GFX7-NEXT: v_mov_b32_e32 v2, s2 10745; GFX7-NEXT: v_mov_b32_e32 v1, s5 10746; GFX7-NEXT: v_mov_b32_e32 v3, s3 10747; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10748; GFX7-NEXT: v_mov_b32_e32 v0, s0 10749; GFX7-NEXT: v_mov_b32_e32 v1, s1 10750; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10751; GFX7-NEXT: flat_store_dword v[0:1], v2 10752; GFX7-NEXT: s_endpgm 10753; 10754; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 10755; GFX10-WGP: ; %bb.0: ; %entry 10756; GFX10-WGP-NEXT: s_clause 0x1 10757; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10758; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10759; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 10760; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 10761; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 10762; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 10763; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 10764; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 10765; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 10766; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10767; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 10768; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 10769; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10770; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 10771; GFX10-WGP-NEXT: s_endpgm 10772; 10773; 
GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 10774; GFX10-CU: ; %bb.0: ; %entry 10775; GFX10-CU-NEXT: s_clause 0x1 10776; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10777; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10778; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10779; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 10780; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 10781; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 10782; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 10783; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 10784; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 10785; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10786; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 10787; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 10788; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10789; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 10790; GFX10-CU-NEXT: s_endpgm 10791; 10792; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 10793; SKIP-CACHE-INV: ; %bb.0: ; %entry 10794; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10795; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 10796; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10797; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 10798; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 10799; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 10800; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 10801; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 10802; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 10803; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10804; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10805; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10806; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10807; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 10808; SKIP-CACHE-INV-NEXT: s_endpgm 10809; 10810; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 10811; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10812; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10813; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 10814; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10815; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10816; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 10817; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 10818; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10819; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10820; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10821; 10822; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 10823; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10824; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10825; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10826; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10827; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10828; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 10829; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 10830; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10831; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10832; GFX90A-TGSPLIT-NEXT: s_endpgm 10833; 10834; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 10835; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10836; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10837; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 10838; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10839; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10840; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 10841; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 10842; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10843; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10844; GFX940-NOTTGSPLIT-NEXT: s_endpgm 10845; 10846; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 10847; GFX940-TGSPLIT: ; %bb.0: ; %entry 
10848; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10849; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 10850; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10851; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10852; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 10853; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 10854; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10855; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10856; GFX940-TGSPLIT-NEXT: s_endpgm 10857; 10858; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 10859; GFX11-WGP: ; %bb.0: ; %entry 10860; GFX11-WGP-NEXT: s_clause 0x1 10861; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 10862; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 10863; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 10864; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 10865; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 10866; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 10867; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10868; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 10869; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 10870; GFX11-WGP-NEXT: s_endpgm 10871; 10872; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 10873; GFX11-CU: ; %bb.0: ; %entry 10874; GFX11-CU-NEXT: s_clause 0x1 10875; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 10876; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 10877; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 10878; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 10879; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 10880; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 10881; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10882; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 10883; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 10884; GFX11-CU-NEXT: s_endpgm 10885 i32* %out, i32 %in, i32 %old) { 10886entry: 
10887 %gep = getelementptr i32, i32* %out, i32 4 10888 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic 10889 %val0 = extractvalue { i32, i1 } %val, 0 10890 store i32 %val0, i32* %out, align 4 10891 ret void 10892} 10893
; Returning cmpxchg with syncscope("singlethread-one-as"), seq_cst success
; ordering and monotonic failure ordering. The kernel performs the exchange on
; element 4 of %out (byte offset 16 in the expected output), comparing against
; %old and swapping in %in, then stores the loaded value back to %out.
; The assertions that follow are autogenerated by update_llc_test_checks.py, so
; regenerate them instead of editing them by hand.
10894define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( 10895; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 10896; GFX7: ; %bb.0: ; %entry 10897; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10898; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 10899; GFX7-NEXT: s_waitcnt lgkmcnt(0) 10900; GFX7-NEXT: s_add_u32 s4, s0, 16 10901; GFX7-NEXT: s_addc_u32 s5, s1, 0 10902; GFX7-NEXT: v_mov_b32_e32 v0, s4 10903; GFX7-NEXT: v_mov_b32_e32 v2, s2 10904; GFX7-NEXT: v_mov_b32_e32 v1, s5 10905; GFX7-NEXT: v_mov_b32_e32 v3, s3 10906; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10907; GFX7-NEXT: v_mov_b32_e32 v0, s0 10908; GFX7-NEXT: v_mov_b32_e32 v1, s1 10909; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10910; GFX7-NEXT: flat_store_dword v[0:1], v2 10911; GFX7-NEXT: s_endpgm 10912; 10913; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 10914; GFX10-WGP: ; %bb.0: ; %entry 10915; GFX10-WGP-NEXT: s_clause 0x1 10916; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10917; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10918; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 10919; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 10920; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 10921; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 10922; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 10923; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 10924; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 10925; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10926; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 10927; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 10928; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10929; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 10930; GFX10-WGP-NEXT: s_endpgm 10931; 10932; 
GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 10933; GFX10-CU: ; %bb.0: ; %entry 10934; GFX10-CU-NEXT: s_clause 0x1 10935; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10936; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10937; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10938; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 10939; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 10940; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 10941; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 10942; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 10943; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 10944; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10945; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 10946; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 10947; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10948; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 10949; GFX10-CU-NEXT: s_endpgm 10950; 10951; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 10952; SKIP-CACHE-INV: ; %bb.0: ; %entry 10953; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10954; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 10955; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10956; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 10957; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 10958; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 10959; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 10960; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 10961; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 10962; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10963; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10964; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10965; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10966; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 10967; SKIP-CACHE-INV-NEXT: s_endpgm 10968; 10969; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 10970; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10971; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10972; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 10973; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10974; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10975; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 10976; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 10977; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10978; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10979; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10980; 10981; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 10982; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10983; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 10984; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10985; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10986; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10987; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 10988; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 10989; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10990; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10991; GFX90A-TGSPLIT-NEXT: s_endpgm 10992; 10993; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 10994; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10995; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10996; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 10997; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10998; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10999; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 11000; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 11001; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11002; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11003; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11004; 11005; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 11006; GFX940-TGSPLIT: ; %bb.0: ; %entry 
11007; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11008; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 11009; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11010; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11011; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 11012; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 11013; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11014; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11015; GFX940-TGSPLIT-NEXT: s_endpgm 11016; 11017; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 11018; GFX11-WGP: ; %bb.0: ; %entry 11019; GFX11-WGP-NEXT: s_clause 0x1 11020; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 11021; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 11022; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11023; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 11024; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 11025; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 11026; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11027; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 11028; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 11029; GFX11-WGP-NEXT: s_endpgm 11030; 11031; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 11032; GFX11-CU: ; %bb.0: ; %entry 11033; GFX11-CU-NEXT: s_clause 0x1 11034; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 11035; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 11036; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11037; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 11038; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 11039; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 11040; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11041; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 11042; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 11043; GFX11-CU-NEXT: s_endpgm 11044 i32* %out, i32 %in, i32 %old) { 11045entry: 
11046 %gep = getelementptr i32, i32* %out, i32 4 11047 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic 11048 %val0 = extractvalue { i32, i1 } %val, 0 11049 store i32 %val0, i32* %out, align 4 11050 ret void 11051} 11052
; Returning cmpxchg with syncscope("singlethread-one-as"), monotonic success
; ordering and acquire failure ordering. The kernel performs the exchange on
; element 4 of %out (byte offset 16 in the expected output), comparing against
; %old and swapping in %in, then stores the loaded value back to %out.
; The assertions that follow are autogenerated by update_llc_test_checks.py, so
; regenerate them instead of editing them by hand.
11053define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg( 11054; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 11055; GFX7: ; %bb.0: ; %entry 11056; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11057; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 11058; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11059; GFX7-NEXT: s_add_u32 s4, s0, 16 11060; GFX7-NEXT: s_addc_u32 s5, s1, 0 11061; GFX7-NEXT: v_mov_b32_e32 v0, s4 11062; GFX7-NEXT: v_mov_b32_e32 v2, s2 11063; GFX7-NEXT: v_mov_b32_e32 v1, s5 11064; GFX7-NEXT: v_mov_b32_e32 v3, s3 11065; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11066; GFX7-NEXT: v_mov_b32_e32 v0, s0 11067; GFX7-NEXT: v_mov_b32_e32 v1, s1 11068; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11069; GFX7-NEXT: flat_store_dword v[0:1], v2 11070; GFX7-NEXT: s_endpgm 11071; 11072; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 11073; GFX10-WGP: ; %bb.0: ; %entry 11074; GFX10-WGP-NEXT: s_clause 0x1 11075; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11076; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11077; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11078; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 11079; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 11080; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 11081; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 11082; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 11083; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 11084; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11085; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 11086; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 11087; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11088; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 11089; GFX10-WGP-NEXT: s_endpgm 11090; 11091; 
GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 11092; GFX10-CU: ; %bb.0: ; %entry 11093; GFX10-CU-NEXT: s_clause 0x1 11094; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11095; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11096; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11097; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 11098; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 11099; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 11100; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 11101; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 11102; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 11103; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11104; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 11105; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 11106; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11107; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 11108; GFX10-CU-NEXT: s_endpgm 11109; 11110; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 11111; SKIP-CACHE-INV: ; %bb.0: ; %entry 11112; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11113; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 11114; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11115; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 11116; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 11117; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 11118; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 11119; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 11120; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11121; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11122; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11123; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11124; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11125; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 11126; SKIP-CACHE-INV-NEXT: s_endpgm 11127; 11128; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 11129; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11130; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11131; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 11132; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11133; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 11134; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 11135; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 11136; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11137; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11138; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11139; 11140; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 11141; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11142; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11143; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11144; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11145; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 11146; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 11147; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 11148; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11149; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11150; GFX90A-TGSPLIT-NEXT: s_endpgm 11151; 11152; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 11153; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11154; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11155; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 11156; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11157; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11158; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 11159; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 11160; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11161; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11162; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11163; 11164; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 11165; GFX940-TGSPLIT: ; %bb.0: ; %entry 
11166; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11167; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 11168; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11169; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11170; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 11171; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 11172; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11173; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11174; GFX940-TGSPLIT-NEXT: s_endpgm 11175; 11176; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 11177; GFX11-WGP: ; %bb.0: ; %entry 11178; GFX11-WGP-NEXT: s_clause 0x1 11179; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 11180; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 11181; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11182; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 11183; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 11184; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 11185; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11186; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 11187; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 11188; GFX11-WGP-NEXT: s_endpgm 11189; 11190; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 11191; GFX11-CU: ; %bb.0: ; %entry 11192; GFX11-CU-NEXT: s_clause 0x1 11193; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 11194; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 11195; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11196; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 11197; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 11198; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 11199; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11200; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 11201; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 11202; GFX11-CU-NEXT: s_endpgm 11203 i32* %out, i32 %in, i32 %old) { 11204entry: 
11205 %gep = getelementptr i32, i32* %out, i32 4 11206 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire 11207 %val0 = extractvalue { i32, i1 } %val, 0 11208 store i32 %val0, i32* %out, align 4 11209 ret void 11210} 11211
; Returning cmpxchg with syncscope("singlethread-one-as"), acquire success
; ordering and acquire failure ordering. The kernel performs the exchange on
; element 4 of %out (byte offset 16 in the expected output), comparing against
; %old and swapping in %in, then stores the loaded value back to %out.
; The assertions that follow are autogenerated by update_llc_test_checks.py, so
; regenerate them instead of editing them by hand.
11212define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( 11213; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 11214; GFX7: ; %bb.0: ; %entry 11215; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11216; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 11217; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11218; GFX7-NEXT: s_add_u32 s4, s0, 16 11219; GFX7-NEXT: s_addc_u32 s5, s1, 0 11220; GFX7-NEXT: v_mov_b32_e32 v0, s4 11221; GFX7-NEXT: v_mov_b32_e32 v2, s2 11222; GFX7-NEXT: v_mov_b32_e32 v1, s5 11223; GFX7-NEXT: v_mov_b32_e32 v3, s3 11224; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11225; GFX7-NEXT: v_mov_b32_e32 v0, s0 11226; GFX7-NEXT: v_mov_b32_e32 v1, s1 11227; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11228; GFX7-NEXT: flat_store_dword v[0:1], v2 11229; GFX7-NEXT: s_endpgm 11230; 11231; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 11232; GFX10-WGP: ; %bb.0: ; %entry 11233; GFX10-WGP-NEXT: s_clause 0x1 11234; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11235; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11236; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11237; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 11238; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 11239; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 11240; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 11241; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 11242; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 11243; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11244; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 11245; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 11246; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11247; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 11248; GFX10-WGP-NEXT: s_endpgm 11249; 11250; 
GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 11251; GFX10-CU: ; %bb.0: ; %entry 11252; GFX10-CU-NEXT: s_clause 0x1 11253; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11254; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11255; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11256; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 11257; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 11258; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 11259; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 11260; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 11261; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 11262; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11263; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 11264; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 11265; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11266; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 11267; GFX10-CU-NEXT: s_endpgm 11268; 11269; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 11270; SKIP-CACHE-INV: ; %bb.0: ; %entry 11271; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11272; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 11273; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11274; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 11275; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 11276; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 11277; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 11278; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 11279; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11280; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11281; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11282; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11283; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11284; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 11285; SKIP-CACHE-INV-NEXT: s_endpgm 11286; 11287; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 11288; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11289; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11290; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 11291; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11292; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 11293; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 11294; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 11295; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11296; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11297; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11298; 11299; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 11300; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11301; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11302; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11303; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11304; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 11305; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 11306; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 11307; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11308; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11309; GFX90A-TGSPLIT-NEXT: s_endpgm 11310; 11311; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 11312; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11313; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11314; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 11315; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11316; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11317; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 11318; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 11319; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11320; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11321; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11322; 11323; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 11324; GFX940-TGSPLIT: ; %bb.0: ; %entry 11325; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11326; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 11327; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11328; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11329; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 11330; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 11331; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11332; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11333; GFX940-TGSPLIT-NEXT: s_endpgm 11334; 11335; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 11336; GFX11-WGP: ; %bb.0: ; %entry 11337; GFX11-WGP-NEXT: s_clause 0x1 11338; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 11339; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 11340; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11341; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 11342; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 11343; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 11344; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11345; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 11346; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 11347; GFX11-WGP-NEXT: s_endpgm 11348; 11349; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 11350; GFX11-CU: ; %bb.0: ; %entry 11351; GFX11-CU-NEXT: s_clause 0x1 11352; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 11353; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 11354; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11355; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 11356; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 11357; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 11358; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11359; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 11360; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 11361; GFX11-CU-NEXT: s_endpgm 11362 i32* %out, i32 %in, i32 %old) { 11363entry: 11364 %gep = 
getelementptr i32, i32* %out, i32 4 11365 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire 11366 %val0 = extractvalue { i32, i1 } %val, 0 11367 store i32 %val0, i32* %out, align 4 11368 ret void 11369} 11370
; Returning cmpxchg with syncscope("singlethread-one-as"), release success
; ordering and acquire failure ordering. The kernel performs the exchange on
; element 4 of %out (byte offset 16 in the expected output), comparing against
; %old and swapping in %in, then stores the loaded value back to %out.
; The assertions that follow are autogenerated by update_llc_test_checks.py, so
; regenerate them instead of editing them by hand.
11371define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( 11372; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 11373; GFX7: ; %bb.0: ; %entry 11374; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11375; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 11376; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11377; GFX7-NEXT: s_add_u32 s4, s0, 16 11378; GFX7-NEXT: s_addc_u32 s5, s1, 0 11379; GFX7-NEXT: v_mov_b32_e32 v0, s4 11380; GFX7-NEXT: v_mov_b32_e32 v2, s2 11381; GFX7-NEXT: v_mov_b32_e32 v1, s5 11382; GFX7-NEXT: v_mov_b32_e32 v3, s3 11383; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11384; GFX7-NEXT: v_mov_b32_e32 v0, s0 11385; GFX7-NEXT: v_mov_b32_e32 v1, s1 11386; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11387; GFX7-NEXT: flat_store_dword v[0:1], v2 11388; GFX7-NEXT: s_endpgm 11389; 11390; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 11391; GFX10-WGP: ; %bb.0: ; %entry 11392; GFX10-WGP-NEXT: s_clause 0x1 11393; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11394; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11395; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11396; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 11397; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 11398; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 11399; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 11400; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 11401; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 11402; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11403; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 11404; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 11405; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11406; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 11407; GFX10-WGP-NEXT: s_endpgm 11408; 11409; GFX10-CU-LABEL: 
flat_singlethread_one_as_release_acquire_ret_cmpxchg: 11410; GFX10-CU: ; %bb.0: ; %entry 11411; GFX10-CU-NEXT: s_clause 0x1 11412; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11413; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11414; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11415; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 11416; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 11417; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 11418; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 11419; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 11420; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 11421; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11422; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 11423; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 11424; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11425; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 11426; GFX10-CU-NEXT: s_endpgm 11427; 11428; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 11429; SKIP-CACHE-INV: ; %bb.0: ; %entry 11430; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11431; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 11432; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11433; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 11434; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 11435; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 11436; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 11437; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 11438; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11439; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11440; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11441; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11442; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11443; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 11444; SKIP-CACHE-INV-NEXT: s_endpgm 11445; 11446; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 11447; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11448; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11449; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 11450; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11451; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 11452; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 11453; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 11454; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11455; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11456; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11457; 11458; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 11459; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11460; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11461; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11462; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11463; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 11464; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 11465; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 11466; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11467; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11468; GFX90A-TGSPLIT-NEXT: s_endpgm 11469; 11470; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 11471; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11472; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11473; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 11474; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11475; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11476; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 11477; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 11478; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11479; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11480; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11481; 11482; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 11483; GFX940-TGSPLIT: ; %bb.0: ; %entry 11484; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 11485; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 11486; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11487; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11488; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 11489; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 11490; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11491; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11492; GFX940-TGSPLIT-NEXT: s_endpgm 11493; 11494; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 11495; GFX11-WGP: ; %bb.0: ; %entry 11496; GFX11-WGP-NEXT: s_clause 0x1 11497; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 11498; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 11499; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11500; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 11501; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 11502; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 11503; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11504; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 11505; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 11506; GFX11-WGP-NEXT: s_endpgm 11507; 11508; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 11509; GFX11-CU: ; %bb.0: ; %entry 11510; GFX11-CU-NEXT: s_clause 0x1 11511; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 11512; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 11513; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11514; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 11515; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 11516; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 11517; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11518; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 11519; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 11520; GFX11-CU-NEXT: s_endpgm 11521 i32* %out, i32 %in, i32 %old) { 11522entry: 11523 %gep = getelementptr i32, 
i32* %out, i32 4 11524 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire 11525 %val0 = extractvalue { i32, i1 } %val, 0 11526 store i32 %val0, i32* %out, align 4 11527 ret void 11528} 11529 11530define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( 11531; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 11532; GFX7: ; %bb.0: ; %entry 11533; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11534; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 11535; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11536; GFX7-NEXT: s_add_u32 s4, s0, 16 11537; GFX7-NEXT: s_addc_u32 s5, s1, 0 11538; GFX7-NEXT: v_mov_b32_e32 v0, s4 11539; GFX7-NEXT: v_mov_b32_e32 v2, s2 11540; GFX7-NEXT: v_mov_b32_e32 v1, s5 11541; GFX7-NEXT: v_mov_b32_e32 v3, s3 11542; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11543; GFX7-NEXT: v_mov_b32_e32 v0, s0 11544; GFX7-NEXT: v_mov_b32_e32 v1, s1 11545; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11546; GFX7-NEXT: flat_store_dword v[0:1], v2 11547; GFX7-NEXT: s_endpgm 11548; 11549; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 11550; GFX10-WGP: ; %bb.0: ; %entry 11551; GFX10-WGP-NEXT: s_clause 0x1 11552; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11553; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11554; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11555; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 11556; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 11557; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 11558; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 11559; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 11560; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 11561; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11562; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 11563; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 11564; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11565; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 11566; GFX10-WGP-NEXT: s_endpgm 11567; 11568; GFX10-CU-LABEL: 
flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 11569; GFX10-CU: ; %bb.0: ; %entry 11570; GFX10-CU-NEXT: s_clause 0x1 11571; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11572; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11573; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11574; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 11575; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 11576; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 11577; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 11578; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 11579; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 11580; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11581; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 11582; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 11583; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11584; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 11585; GFX10-CU-NEXT: s_endpgm 11586; 11587; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 11588; SKIP-CACHE-INV: ; %bb.0: ; %entry 11589; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11590; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 11591; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11592; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 11593; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 11594; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 11595; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 11596; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 11597; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11598; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11599; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11600; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11601; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11602; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 11603; SKIP-CACHE-INV-NEXT: s_endpgm 11604; 11605; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 11606; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11607; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11608; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 11609; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11610; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 11611; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 11612; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 11613; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11614; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11615; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11616; 11617; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 11618; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11619; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11620; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11621; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11622; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 11623; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 11624; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 11625; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11626; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11627; GFX90A-TGSPLIT-NEXT: s_endpgm 11628; 11629; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 11630; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11631; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11632; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 11633; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11634; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11635; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 11636; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 11637; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11638; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11639; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11640; 11641; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 11642; GFX940-TGSPLIT: ; %bb.0: ; %entry 11643; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 11644; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 11645; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11646; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11647; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 11648; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 11649; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11650; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11651; GFX940-TGSPLIT-NEXT: s_endpgm 11652; 11653; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 11654; GFX11-WGP: ; %bb.0: ; %entry 11655; GFX11-WGP-NEXT: s_clause 0x1 11656; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 11657; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 11658; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11659; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 11660; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 11661; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 11662; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11663; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 11664; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 11665; GFX11-WGP-NEXT: s_endpgm 11666; 11667; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 11668; GFX11-CU: ; %bb.0: ; %entry 11669; GFX11-CU-NEXT: s_clause 0x1 11670; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 11671; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 11672; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11673; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 11674; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 11675; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 11676; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11677; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 11678; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 11679; GFX11-CU-NEXT: s_endpgm 11680 i32* %out, i32 %in, i32 %old) { 11681entry: 11682 %gep = getelementptr i32, 
i32* %out, i32 4 11683 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire 11684 %val0 = extractvalue { i32, i1 } %val, 0 11685 store i32 %val0, i32* %out, align 4 11686 ret void 11687} 11688 11689define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( 11690; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 11691; GFX7: ; %bb.0: ; %entry 11692; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11693; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 11694; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11695; GFX7-NEXT: s_add_u32 s4, s0, 16 11696; GFX7-NEXT: s_addc_u32 s5, s1, 0 11697; GFX7-NEXT: v_mov_b32_e32 v0, s4 11698; GFX7-NEXT: v_mov_b32_e32 v2, s2 11699; GFX7-NEXT: v_mov_b32_e32 v1, s5 11700; GFX7-NEXT: v_mov_b32_e32 v3, s3 11701; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11702; GFX7-NEXT: v_mov_b32_e32 v0, s0 11703; GFX7-NEXT: v_mov_b32_e32 v1, s1 11704; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11705; GFX7-NEXT: flat_store_dword v[0:1], v2 11706; GFX7-NEXT: s_endpgm 11707; 11708; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 11709; GFX10-WGP: ; %bb.0: ; %entry 11710; GFX10-WGP-NEXT: s_clause 0x1 11711; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11712; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11713; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11714; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 11715; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 11716; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 11717; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 11718; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 11719; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 11720; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11721; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 11722; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 11723; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11724; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 11725; GFX10-WGP-NEXT: s_endpgm 11726; 11727; GFX10-CU-LABEL: 
flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 11728; GFX10-CU: ; %bb.0: ; %entry 11729; GFX10-CU-NEXT: s_clause 0x1 11730; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11731; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11732; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11733; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 11734; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 11735; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 11736; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 11737; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 11738; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 11739; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11740; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 11741; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 11742; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11743; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 11744; GFX10-CU-NEXT: s_endpgm 11745; 11746; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 11747; SKIP-CACHE-INV: ; %bb.0: ; %entry 11748; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11749; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 11750; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11751; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 11752; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 11753; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 11754; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 11755; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 11756; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11757; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11758; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11759; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11760; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11761; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 11762; SKIP-CACHE-INV-NEXT: s_endpgm 11763; 11764; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 11765; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11766; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11767; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 11768; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11769; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 11770; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 11771; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 11772; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11773; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11774; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11775; 11776; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 11777; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11778; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11779; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11780; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11781; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 11782; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 11783; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 11784; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11785; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11786; GFX90A-TGSPLIT-NEXT: s_endpgm 11787; 11788; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 11789; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11790; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11791; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 11792; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11793; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11794; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 11795; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 11796; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11797; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11798; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11799; 11800; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 11801; GFX940-TGSPLIT: ; %bb.0: ; %entry 11802; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 11803; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 11804; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11805; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11806; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 11807; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 11808; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11809; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11810; GFX940-TGSPLIT-NEXT: s_endpgm 11811; 11812; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 11813; GFX11-WGP: ; %bb.0: ; %entry 11814; GFX11-WGP-NEXT: s_clause 0x1 11815; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 11816; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 11817; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11818; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 11819; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 11820; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 11821; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11822; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 11823; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 11824; GFX11-WGP-NEXT: s_endpgm 11825; 11826; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 11827; GFX11-CU: ; %bb.0: ; %entry 11828; GFX11-CU-NEXT: s_clause 0x1 11829; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 11830; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 11831; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11832; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 11833; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 11834; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 11835; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11836; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 11837; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 11838; GFX11-CU-NEXT: s_endpgm 11839 i32* %out, i32 %in, i32 %old) { 11840entry: 11841 %gep = getelementptr i32, 
i32* %out, i32 4 11842 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire 11843 %val0 = extractvalue { i32, i1 } %val, 0 11844 store i32 %val0, i32* %out, align 4 11845 ret void 11846} 11847 11848define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( 11849; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 11850; GFX7: ; %bb.0: ; %entry 11851; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11852; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 11853; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11854; GFX7-NEXT: s_add_u32 s4, s0, 16 11855; GFX7-NEXT: s_addc_u32 s5, s1, 0 11856; GFX7-NEXT: v_mov_b32_e32 v0, s4 11857; GFX7-NEXT: v_mov_b32_e32 v2, s2 11858; GFX7-NEXT: v_mov_b32_e32 v1, s5 11859; GFX7-NEXT: v_mov_b32_e32 v3, s3 11860; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11861; GFX7-NEXT: v_mov_b32_e32 v0, s0 11862; GFX7-NEXT: v_mov_b32_e32 v1, s1 11863; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11864; GFX7-NEXT: flat_store_dword v[0:1], v2 11865; GFX7-NEXT: s_endpgm 11866; 11867; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 11868; GFX10-WGP: ; %bb.0: ; %entry 11869; GFX10-WGP-NEXT: s_clause 0x1 11870; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11871; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11872; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11873; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 11874; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 11875; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 11876; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 11877; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 11878; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 11879; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11880; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 11881; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 11882; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11883; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 11884; GFX10-WGP-NEXT: s_endpgm 11885; 11886; GFX10-CU-LABEL: 
flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 11887; GFX10-CU: ; %bb.0: ; %entry 11888; GFX10-CU-NEXT: s_clause 0x1 11889; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11890; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11891; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11892; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 11893; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 11894; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 11895; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 11896; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 11897; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 11898; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11899; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 11900; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 11901; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11902; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 11903; GFX10-CU-NEXT: s_endpgm 11904; 11905; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 11906; SKIP-CACHE-INV: ; %bb.0: ; %entry 11907; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11908; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 11909; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11910; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 11911; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 11912; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 11913; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 11914; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 11915; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11916; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11917; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11918; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11919; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11920; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 11921; SKIP-CACHE-INV-NEXT: s_endpgm 11922; 11923; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 11924; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11925; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11926; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 11927; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11928; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 11929; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 11930; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 11931; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11932; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11933; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11934; 11935; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 11936; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11937; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11938; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11939; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11940; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 11941; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 11942; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 11943; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11944; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11945; GFX90A-TGSPLIT-NEXT: s_endpgm 11946; 11947; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 11948; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11949; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11950; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 11951; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11952; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11953; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 11954; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 11955; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11956; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11957; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11958; 11959; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 11960; GFX940-TGSPLIT: ; %bb.0: ; %entry 11961; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 11962; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 11963; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11964; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11965; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 11966; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 11967; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11968; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11969; GFX940-TGSPLIT-NEXT: s_endpgm 11970; 11971; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 11972; GFX11-WGP: ; %bb.0: ; %entry 11973; GFX11-WGP-NEXT: s_clause 0x1 11974; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 11975; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 11976; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11977; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 11978; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 11979; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 11980; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11981; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 11982; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 11983; GFX11-WGP-NEXT: s_endpgm 11984; 11985; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 11986; GFX11-CU: ; %bb.0: ; %entry 11987; GFX11-CU-NEXT: s_clause 0x1 11988; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 11989; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 11990; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11991; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 11992; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 11993; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 11994; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11995; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 11996; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 11997; GFX11-CU-NEXT: s_endpgm 11998 i32* %out, i32 %in, i32 %old) { 11999entry: 12000 
%gep = getelementptr i32, i32* %out, i32 4 12001 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst 12002 %val0 = extractvalue { i32, i1 } %val, 0 12003 store i32 %val0, i32* %out, align 4 12004 ret void 12005} 12006 12007define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( 12008; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 12009; GFX7: ; %bb.0: ; %entry 12010; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12011; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 12012; GFX7-NEXT: s_waitcnt lgkmcnt(0) 12013; GFX7-NEXT: s_add_u32 s4, s0, 16 12014; GFX7-NEXT: s_addc_u32 s5, s1, 0 12015; GFX7-NEXT: v_mov_b32_e32 v0, s4 12016; GFX7-NEXT: v_mov_b32_e32 v2, s2 12017; GFX7-NEXT: v_mov_b32_e32 v1, s5 12018; GFX7-NEXT: v_mov_b32_e32 v3, s3 12019; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12020; GFX7-NEXT: v_mov_b32_e32 v0, s0 12021; GFX7-NEXT: v_mov_b32_e32 v1, s1 12022; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12023; GFX7-NEXT: flat_store_dword v[0:1], v2 12024; GFX7-NEXT: s_endpgm 12025; 12026; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 12027; GFX10-WGP: ; %bb.0: ; %entry 12028; GFX10-WGP-NEXT: s_clause 0x1 12029; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12030; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 12031; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 12032; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 12033; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 12034; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 12035; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 12036; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 12037; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 12038; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12039; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 12040; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 12041; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12042; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 12043; GFX10-WGP-NEXT: s_endpgm 12044; 12045; GFX10-CU-LABEL: 
flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 12046; GFX10-CU: ; %bb.0: ; %entry 12047; GFX10-CU-NEXT: s_clause 0x1 12048; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12049; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 12050; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 12051; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 12052; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 12053; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 12054; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 12055; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 12056; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 12057; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12058; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 12059; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 12060; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12061; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 12062; GFX10-CU-NEXT: s_endpgm 12063; 12064; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 12065; SKIP-CACHE-INV: ; %bb.0: ; %entry 12066; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 12067; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 12068; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 12069; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 12070; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 12071; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 12072; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 12073; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 12074; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 12075; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12076; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 12077; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 12078; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12079; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 12080; SKIP-CACHE-INV-NEXT: s_endpgm 12081; 12082; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 12083; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 12084; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12085; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 12086; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12087; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 12088; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 12089; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 12090; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12091; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 12092; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 12093; 12094; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 12095; GFX90A-TGSPLIT: ; %bb.0: ; %entry 12096; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12097; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 12098; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12099; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 12100; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 12101; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 12102; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 12103; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 12104; GFX90A-TGSPLIT-NEXT: s_endpgm 12105; 12106; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 12107; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 12108; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 12109; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 12110; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12111; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 12112; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 12113; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 12114; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12115; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 12116; GFX940-NOTTGSPLIT-NEXT: s_endpgm 12117; 12118; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 12119; GFX940-TGSPLIT: ; %bb.0: ; %entry 12120; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 12121; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 12122; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12123; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 12124; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 12125; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 12126; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 12127; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 12128; GFX940-TGSPLIT-NEXT: s_endpgm 12129; 12130; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 12131; GFX11-WGP: ; %bb.0: ; %entry 12132; GFX11-WGP-NEXT: s_clause 0x1 12133; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 12134; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 12135; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 12136; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 12137; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 12138; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 12139; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12140; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 12141; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 12142; GFX11-WGP-NEXT: s_endpgm 12143; 12144; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 12145; GFX11-CU: ; %bb.0: ; %entry 12146; GFX11-CU-NEXT: s_clause 0x1 12147; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 12148; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 12149; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 12150; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 12151; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 12152; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 12153; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12154; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 12155; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 12156; GFX11-CU-NEXT: s_endpgm 12157 i32* %out, i32 %in, i32 %old) { 12158entry: 12159 %gep = getelementptr i32, 
i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}

; Value-returning cmpxchg through a flat pointer at syncscope
; "singlethread-one-as" with success ordering release and failure ordering
; seq_cst. The value read by the cmpxchg is stored back to %out.
; The assertion lines inside the signature are tool-generated (see the NOTE at
; the top of the file); regenerate them instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Element index 4 of an i32 array, i.e. 16 bytes past %out; the expected
  ; assembly shows this either as an immediate offset of 16 or as a 64-bit
  ; scalar add of 16 to the base address.
  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare the word at %gep with %old and write %in on a match.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
  ; Field 0 of the { i32, i1 } result is the value that was read from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}

; Value-returning cmpxchg through a flat pointer at syncscope
; "singlethread-one-as" with success ordering acq_rel and failure ordering
; seq_cst. The value read by the cmpxchg is stored back to %out.
; The assertion lines inside the signature are tool-generated (see the NOTE at
; the top of the file); regenerate them instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Element index 4 of an i32 array, i.e. 16 bytes past %out; the expected
  ; assembly shows this either as an immediate offset of 16 or as a 64-bit
  ; scalar add of 16 to the base address.
  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare the word at %gep with %old and write %in on a match.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
  ; Field 0 of the { i32, i1 } result is the value that was read from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}

; Value-returning cmpxchg through a flat pointer at syncscope
; "singlethread-one-as" with success ordering seq_cst and failure ordering
; seq_cst. The value read by the cmpxchg is stored back to %out.
; The assertion lines inside the signature are tool-generated (see the NOTE at
; the top of the file); regenerate them instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Element index 4 of an i32 array, i.e. 16 bytes past %out; the expected
  ; assembly shows this either as an immediate offset of 16 or as a 64-bit
  ; scalar add of 16 to the base address.
  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare the word at %gep with %old and write %in on a match.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
  ; Field 0 of the { i32, i1 } result is the value that was read from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
