1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s 3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s 7; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s 8; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s 9 10declare i32 @llvm.amdgcn.workitem.id.x() 11 12; Show what the atomic optimization pass will do for global pointers. 
13 14define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 15; GFX7LESS-LABEL: add_i32_constant: 16; GFX7LESS: ; %bb.0: ; %entry 17; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 18; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 19; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 20; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 21; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 22; GFX7LESS-NEXT: ; implicit-def: $vgpr1 23; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 24; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 25; GFX7LESS-NEXT: ; %bb.1: 26; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 27; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 28; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 29; GFX7LESS-NEXT: s_mov_b32 s10, -1 30; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 31; GFX7LESS-NEXT: s_mov_b32 s8, s2 32; GFX7LESS-NEXT: s_mov_b32 s9, s3 33; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 34; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 35; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 36; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 37; GFX7LESS-NEXT: buffer_wbinvl1 38; GFX7LESS-NEXT: .LBB0_2: 39; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 40; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 41; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 42; GFX7LESS-NEXT: s_mov_b32 s2, -1 43; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 44; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 45; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 46; GFX7LESS-NEXT: s_endpgm 47; 48; GFX89-LABEL: add_i32_constant: 49; GFX89: ; %bb.0: ; %entry 50; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 51; GFX89-NEXT: s_mov_b64 s[6:7], exec 52; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 53; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 54; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 55; GFX89-NEXT: ; implicit-def: $vgpr1 56; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc 57; GFX89-NEXT: s_cbranch_execz .LBB0_2 58; GFX89-NEXT: ; %bb.1: 59; GFX89-NEXT: s_waitcnt lgkmcnt(0) 60; GFX89-NEXT: s_mov_b32 s8, s2 61; GFX89-NEXT: s_bcnt1_i32_b64 s2, 
s[6:7] 62; GFX89-NEXT: s_mul_i32 s2, s2, 5 63; GFX89-NEXT: s_mov_b32 s11, 0xf000 64; GFX89-NEXT: s_mov_b32 s10, -1 65; GFX89-NEXT: s_mov_b32 s9, s3 66; GFX89-NEXT: v_mov_b32_e32 v1, s2 67; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 68; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 69; GFX89-NEXT: s_waitcnt vmcnt(0) 70; GFX89-NEXT: buffer_wbinvl1_vol 71; GFX89-NEXT: .LBB0_2: 72; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] 73; GFX89-NEXT: v_readfirstlane_b32 s4, v1 74; GFX89-NEXT: s_waitcnt lgkmcnt(0) 75; GFX89-NEXT: s_mov_b32 s3, 0xf000 76; GFX89-NEXT: s_mov_b32 s2, -1 77; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 78; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 79; GFX89-NEXT: s_endpgm 80; 81; GFX1064-LABEL: add_i32_constant: 82; GFX1064: ; %bb.0: ; %entry 83; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 84; GFX1064-NEXT: s_mov_b64 s[6:7], exec 85; GFX1064-NEXT: ; implicit-def: $vgpr1 86; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 87; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 88; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 89; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 90; GFX1064-NEXT: s_cbranch_execz .LBB0_2 91; GFX1064-NEXT: ; %bb.1: 92; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 93; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 94; GFX1064-NEXT: s_mul_i32 s6, s6, 5 95; GFX1064-NEXT: s_mov_b32 s10, -1 96; GFX1064-NEXT: v_mov_b32_e32 v1, s6 97; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 98; GFX1064-NEXT: s_mov_b32 s8, s2 99; GFX1064-NEXT: s_mov_b32 s9, s3 100; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 101; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 102; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 103; GFX1064-NEXT: s_waitcnt vmcnt(0) 104; GFX1064-NEXT: buffer_gl0_inv 105; GFX1064-NEXT: buffer_gl1_inv 106; GFX1064-NEXT: .LBB0_2: 107; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 108; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 109; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 110; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 111; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 112; 
GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 113; GFX1064-NEXT: s_mov_b32 s2, -1 114; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 115; GFX1064-NEXT: s_endpgm 116; 117; GFX1032-LABEL: add_i32_constant: 118; GFX1032: ; %bb.0: ; %entry 119; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 120; GFX1032-NEXT: s_mov_b32 s5, exec_lo 121; GFX1032-NEXT: ; implicit-def: $vgpr1 122; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 123; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 124; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 125; GFX1032-NEXT: s_cbranch_execz .LBB0_2 126; GFX1032-NEXT: ; %bb.1: 127; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 128; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 129; GFX1032-NEXT: s_mul_i32 s5, s5, 5 130; GFX1032-NEXT: s_mov_b32 s10, -1 131; GFX1032-NEXT: v_mov_b32_e32 v1, s5 132; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 133; GFX1032-NEXT: s_mov_b32 s8, s2 134; GFX1032-NEXT: s_mov_b32 s9, s3 135; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 136; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 137; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 138; GFX1032-NEXT: s_waitcnt vmcnt(0) 139; GFX1032-NEXT: buffer_gl0_inv 140; GFX1032-NEXT: buffer_gl1_inv 141; GFX1032-NEXT: .LBB0_2: 142; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 143; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 144; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 145; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 146; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 147; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 148; GFX1032-NEXT: s_mov_b32 s2, -1 149; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 150; GFX1032-NEXT: s_endpgm 151; 152; GFX1164-LABEL: add_i32_constant: 153; GFX1164: ; %bb.0: ; %entry 154; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 155; GFX1164-NEXT: s_mov_b64 s[6:7], exec 156; GFX1164-NEXT: s_mov_b64 s[4:5], exec 157; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 158; GFX1164-NEXT: ; implicit-def: $vgpr1 159; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 160; GFX1164-NEXT: 
v_mbcnt_hi_u32_b32 v0, s7, v0 161; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 162; GFX1164-NEXT: s_cbranch_execz .LBB0_2 163; GFX1164-NEXT: ; %bb.1: 164; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 165; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 166; GFX1164-NEXT: s_mul_i32 s6, s6, 5 167; GFX1164-NEXT: s_mov_b32 s10, -1 168; GFX1164-NEXT: v_mov_b32_e32 v1, s6 169; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 170; GFX1164-NEXT: s_mov_b32 s8, s2 171; GFX1164-NEXT: s_mov_b32 s9, s3 172; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 173; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 174; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 175; GFX1164-NEXT: s_waitcnt vmcnt(0) 176; GFX1164-NEXT: buffer_gl0_inv 177; GFX1164-NEXT: buffer_gl1_inv 178; GFX1164-NEXT: .LBB0_2: 179; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 180; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 181; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 182; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 183; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 184; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 185; GFX1164-NEXT: s_mov_b32 s2, -1 186; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 187; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 188; GFX1164-NEXT: s_endpgm 189; 190; GFX1132-LABEL: add_i32_constant: 191; GFX1132: ; %bb.0: ; %entry 192; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 193; GFX1132-NEXT: s_mov_b32 s5, exec_lo 194; GFX1132-NEXT: s_mov_b32 s4, exec_lo 195; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 196; GFX1132-NEXT: ; implicit-def: $vgpr1 197; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 198; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 199; GFX1132-NEXT: s_cbranch_execz .LBB0_2 200; GFX1132-NEXT: ; %bb.1: 201; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 202; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 203; GFX1132-NEXT: s_mul_i32 s5, s5, 5 204; GFX1132-NEXT: s_mov_b32 s10, -1 205; GFX1132-NEXT: v_mov_b32_e32 v1, s5 206; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 207; GFX1132-NEXT: s_mov_b32 s8, s2 208; GFX1132-NEXT: s_mov_b32 s9, s3 209; 
GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 210; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 211; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 212; GFX1132-NEXT: s_waitcnt vmcnt(0) 213; GFX1132-NEXT: buffer_gl0_inv 214; GFX1132-NEXT: buffer_gl1_inv 215; GFX1132-NEXT: .LBB0_2: 216; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 217; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 218; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 219; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 220; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 221; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 222; GFX1132-NEXT: s_mov_b32 s2, -1 223; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 224; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 225; GFX1132-NEXT: s_endpgm 226entry: 227 %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel 228 store i32 %old, i32 addrspace(1)* %out 229 ret void 230} 231 232define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) { 233; GFX7LESS-LABEL: add_i32_uniform: 234; GFX7LESS: ; %bb.0: ; %entry 235; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 236; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 237; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd 238; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 239; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 240; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 241; GFX7LESS-NEXT: ; implicit-def: $vgpr1 242; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 243; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 244; GFX7LESS-NEXT: ; %bb.1: 245; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 246; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 247; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 248; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 249; GFX7LESS-NEXT: s_mov_b32 s14, -1 250; GFX7LESS-NEXT: s_mov_b32 s12, s6 251; GFX7LESS-NEXT: s_mov_b32 s13, s7 252; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 253; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 254; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 255; GFX7LESS-NEXT: 
s_waitcnt vmcnt(0) 256; GFX7LESS-NEXT: buffer_wbinvl1 257; GFX7LESS-NEXT: .LBB1_2: 258; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 259; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 260; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 261; GFX7LESS-NEXT: s_mov_b32 s6, -1 262; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 263; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 264; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 265; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 266; GFX7LESS-NEXT: s_endpgm 267; 268; GFX8-LABEL: add_i32_uniform: 269; GFX8: ; %bb.0: ; %entry 270; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 271; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 272; GFX8-NEXT: s_mov_b64 s[2:3], exec 273; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 274; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 275; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 276; GFX8-NEXT: ; implicit-def: $vgpr1 277; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 278; GFX8-NEXT: s_cbranch_execz .LBB1_2 279; GFX8-NEXT: ; %bb.1: 280; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 281; GFX8-NEXT: s_waitcnt lgkmcnt(0) 282; GFX8-NEXT: s_mul_i32 s2, s8, s2 283; GFX8-NEXT: s_mov_b32 s15, 0xf000 284; GFX8-NEXT: s_mov_b32 s14, -1 285; GFX8-NEXT: s_mov_b32 s12, s6 286; GFX8-NEXT: s_mov_b32 s13, s7 287; GFX8-NEXT: v_mov_b32_e32 v1, s2 288; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 289; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 290; GFX8-NEXT: s_waitcnt vmcnt(0) 291; GFX8-NEXT: buffer_wbinvl1_vol 292; GFX8-NEXT: .LBB1_2: 293; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 294; GFX8-NEXT: s_waitcnt lgkmcnt(0) 295; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 296; GFX8-NEXT: v_readfirstlane_b32 s0, v1 297; GFX8-NEXT: s_mov_b32 s7, 0xf000 298; GFX8-NEXT: s_mov_b32 s6, -1 299; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 300; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 301; GFX8-NEXT: s_endpgm 302; 303; GFX9-LABEL: add_i32_uniform: 304; GFX9: ; %bb.0: ; %entry 305; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 306; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 307; 
GFX9-NEXT: s_mov_b64 s[2:3], exec 308; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 309; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 310; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 311; GFX9-NEXT: ; implicit-def: $vgpr1 312; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 313; GFX9-NEXT: s_cbranch_execz .LBB1_2 314; GFX9-NEXT: ; %bb.1: 315; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 316; GFX9-NEXT: s_waitcnt lgkmcnt(0) 317; GFX9-NEXT: s_mul_i32 s2, s8, s2 318; GFX9-NEXT: s_mov_b32 s15, 0xf000 319; GFX9-NEXT: s_mov_b32 s14, -1 320; GFX9-NEXT: s_mov_b32 s12, s6 321; GFX9-NEXT: s_mov_b32 s13, s7 322; GFX9-NEXT: v_mov_b32_e32 v1, s2 323; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 324; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 325; GFX9-NEXT: s_waitcnt vmcnt(0) 326; GFX9-NEXT: buffer_wbinvl1_vol 327; GFX9-NEXT: .LBB1_2: 328; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 329; GFX9-NEXT: s_waitcnt lgkmcnt(0) 330; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 331; GFX9-NEXT: v_readfirstlane_b32 s0, v1 332; GFX9-NEXT: s_mov_b32 s7, 0xf000 333; GFX9-NEXT: s_mov_b32 s6, -1 334; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 335; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 336; GFX9-NEXT: s_endpgm 337; 338; GFX1064-LABEL: add_i32_uniform: 339; GFX1064: ; %bb.0: ; %entry 340; GFX1064-NEXT: s_clause 0x1 341; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 342; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 343; GFX1064-NEXT: s_mov_b64 s[2:3], exec 344; GFX1064-NEXT: ; implicit-def: $vgpr1 345; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 346; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 347; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 348; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 349; GFX1064-NEXT: s_cbranch_execz .LBB1_2 350; GFX1064-NEXT: ; %bb.1: 351; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 352; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 353; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 354; GFX1064-NEXT: s_mul_i32 s2, s8, s2 355; GFX1064-NEXT: s_mov_b32 s14, -1 356; GFX1064-NEXT: v_mov_b32_e32 v1, s2 357; GFX1064-NEXT: 
s_mov_b32 s12, s6 358; GFX1064-NEXT: s_mov_b32 s13, s7 359; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 360; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 361; GFX1064-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 362; GFX1064-NEXT: s_waitcnt vmcnt(0) 363; GFX1064-NEXT: buffer_gl0_inv 364; GFX1064-NEXT: buffer_gl1_inv 365; GFX1064-NEXT: .LBB1_2: 366; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 367; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 368; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 369; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 370; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 371; GFX1064-NEXT: s_mov_b32 s6, -1 372; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1] 373; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 374; GFX1064-NEXT: s_endpgm 375; 376; GFX1032-LABEL: add_i32_uniform: 377; GFX1032: ; %bb.0: ; %entry 378; GFX1032-NEXT: s_clause 0x1 379; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 380; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 381; GFX1032-NEXT: s_mov_b32 s3, exec_lo 382; GFX1032-NEXT: ; implicit-def: $vgpr1 383; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 384; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 385; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 386; GFX1032-NEXT: s_cbranch_execz .LBB1_2 387; GFX1032-NEXT: ; %bb.1: 388; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 389; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 390; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 391; GFX1032-NEXT: s_mul_i32 s1, s2, s1 392; GFX1032-NEXT: s_mov_b32 s10, -1 393; GFX1032-NEXT: v_mov_b32_e32 v1, s1 394; GFX1032-NEXT: s_mov_b32 s8, s6 395; GFX1032-NEXT: s_mov_b32 s9, s7 396; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 397; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 398; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 399; GFX1032-NEXT: s_waitcnt vmcnt(0) 400; GFX1032-NEXT: buffer_gl0_inv 401; GFX1032-NEXT: buffer_gl1_inv 402; GFX1032-NEXT: .LBB1_2: 403; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 404; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 405; GFX1032-NEXT: v_readfirstlane_b32 
s0, v1 406; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 407; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 408; GFX1032-NEXT: s_mov_b32 s6, -1 409; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1] 410; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 411; GFX1032-NEXT: s_endpgm 412; 413; GFX1164-LABEL: add_i32_uniform: 414; GFX1164: ; %bb.0: ; %entry 415; GFX1164-NEXT: s_clause 0x1 416; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 417; GFX1164-NEXT: s_load_b32 s8, s[0:1], 0x34 418; GFX1164-NEXT: s_mov_b64 s[2:3], exec 419; GFX1164-NEXT: s_mov_b64 s[0:1], exec 420; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 421; GFX1164-NEXT: ; implicit-def: $vgpr1 422; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 423; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 424; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 425; GFX1164-NEXT: s_cbranch_execz .LBB1_2 426; GFX1164-NEXT: ; %bb.1: 427; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 428; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 429; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 430; GFX1164-NEXT: s_mul_i32 s2, s8, s2 431; GFX1164-NEXT: s_mov_b32 s14, -1 432; GFX1164-NEXT: v_mov_b32_e32 v1, s2 433; GFX1164-NEXT: s_mov_b32 s12, s6 434; GFX1164-NEXT: s_mov_b32 s13, s7 435; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 436; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 437; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc 438; GFX1164-NEXT: s_waitcnt vmcnt(0) 439; GFX1164-NEXT: buffer_gl0_inv 440; GFX1164-NEXT: buffer_gl1_inv 441; GFX1164-NEXT: .LBB1_2: 442; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 443; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 444; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 445; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 446; GFX1164-NEXT: s_mov_b32 s6, -1 447; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 448; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1] 449; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 450; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 451; GFX1164-NEXT: s_endpgm 452; 453; 
GFX1132-LABEL: add_i32_uniform: 454; GFX1132: ; %bb.0: ; %entry 455; GFX1132-NEXT: s_clause 0x1 456; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 457; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x34 458; GFX1132-NEXT: s_mov_b32 s2, exec_lo 459; GFX1132-NEXT: s_mov_b32 s1, exec_lo 460; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 461; GFX1132-NEXT: ; implicit-def: $vgpr1 462; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 463; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 464; GFX1132-NEXT: s_cbranch_execz .LBB1_2 465; GFX1132-NEXT: ; %bb.1: 466; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 467; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 468; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 469; GFX1132-NEXT: s_mul_i32 s2, s0, s2 470; GFX1132-NEXT: s_mov_b32 s10, -1 471; GFX1132-NEXT: v_mov_b32_e32 v1, s2 472; GFX1132-NEXT: s_mov_b32 s8, s6 473; GFX1132-NEXT: s_mov_b32 s9, s7 474; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 475; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 476; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 477; GFX1132-NEXT: s_waitcnt vmcnt(0) 478; GFX1132-NEXT: buffer_gl0_inv 479; GFX1132-NEXT: buffer_gl1_inv 480; GFX1132-NEXT: .LBB1_2: 481; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 482; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 483; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 484; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 485; GFX1132-NEXT: s_mov_b32 s6, -1 486; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 487; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] 488; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 489; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 490; GFX1132-NEXT: s_endpgm 491entry: 492 %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel 493 store i32 %old, i32 addrspace(1)* %out 494 ret void 495} 496 497define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 498; GFX7LESS-LABEL: add_i32_varying: 499; GFX7LESS: ; %bb.0: ; %entry 500; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 501; GFX7LESS-NEXT: 
s_mov_b32 s7, 0xf000 502; GFX7LESS-NEXT: s_mov_b32 s6, -1 503; GFX7LESS-NEXT: s_mov_b32 s10, s6 504; GFX7LESS-NEXT: s_mov_b32 s11, s7 505; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 506; GFX7LESS-NEXT: s_mov_b32 s8, s2 507; GFX7LESS-NEXT: s_mov_b32 s9, s3 508; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 509; GFX7LESS-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 510; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 511; GFX7LESS-NEXT: buffer_wbinvl1 512; GFX7LESS-NEXT: s_mov_b32 s4, s0 513; GFX7LESS-NEXT: s_mov_b32 s5, s1 514; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 515; GFX7LESS-NEXT: s_endpgm 516; 517; GFX8-LABEL: add_i32_varying: 518; GFX8: ; %bb.0: ; %entry 519; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 520; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 521; GFX8-NEXT: v_mov_b32_e32 v1, 0 522; GFX8-NEXT: s_mov_b64 exec, s[4:5] 523; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 524; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 525; GFX8-NEXT: v_mov_b32_e32 v2, v0 526; GFX8-NEXT: s_not_b64 exec, exec 527; GFX8-NEXT: v_mov_b32_e32 v2, 0 528; GFX8-NEXT: s_not_b64 exec, exec 529; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 530; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 531; GFX8-NEXT: s_nop 1 532; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 533; GFX8-NEXT: s_nop 1 534; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 535; GFX8-NEXT: s_nop 1 536; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 537; GFX8-NEXT: s_nop 1 538; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 539; GFX8-NEXT: s_nop 1 540; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 541; GFX8-NEXT: v_readlane_b32 s6, v2, 63 542; GFX8-NEXT: s_nop 0 543; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 544; GFX8-NEXT: s_mov_b64 exec, s[4:5] 545; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 546; GFX8-NEXT: ; implicit-def: $vgpr0 547; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 548; GFX8-NEXT: s_cbranch_execz .LBB2_2 549; GFX8-NEXT: ; %bb.1: 550; GFX8-NEXT: s_mov_b32 s11, 0xf000 551; GFX8-NEXT: s_mov_b32 s10, -1 552; GFX8-NEXT: s_waitcnt lgkmcnt(0) 553; GFX8-NEXT: s_mov_b32 s8, s2 554; GFX8-NEXT: s_mov_b32 s9, s3 555; GFX8-NEXT: v_mov_b32_e32 v0, s6 556; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 557; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 558; GFX8-NEXT: s_waitcnt vmcnt(0) 559; GFX8-NEXT: buffer_wbinvl1_vol 560; GFX8-NEXT: .LBB2_2: 561; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 562; GFX8-NEXT: v_readfirstlane_b32 s4, v0 563; GFX8-NEXT: v_mov_b32_e32 v0, v1 564; GFX8-NEXT: s_waitcnt lgkmcnt(0) 565; GFX8-NEXT: s_mov_b32 s3, 0xf000 566; GFX8-NEXT: s_mov_b32 s2, -1 567; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 568; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 569; GFX8-NEXT: s_endpgm 570; 571; GFX9-LABEL: add_i32_varying: 572; GFX9: ; %bb.0: ; %entry 573; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 574; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 575; GFX9-NEXT: v_mov_b32_e32 v1, 0 576; GFX9-NEXT: s_mov_b64 exec, s[4:5] 577; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 578; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 579; GFX9-NEXT: v_mov_b32_e32 v2, v0 580; GFX9-NEXT: s_not_b64 exec, exec 581; GFX9-NEXT: v_mov_b32_e32 v2, 0 582; GFX9-NEXT: s_not_b64 exec, exec 583; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 584; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 585; GFX9-NEXT: s_nop 1 586; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 587; GFX9-NEXT: s_nop 1 588; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 589; GFX9-NEXT: s_nop 1 590; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 591; GFX9-NEXT: s_nop 1 592; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 
row_bcast:15 row_mask:0xa bank_mask:0xf 593; GFX9-NEXT: s_nop 1 594; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 595; GFX9-NEXT: v_readlane_b32 s6, v2, 63 596; GFX9-NEXT: s_nop 0 597; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 598; GFX9-NEXT: s_mov_b64 exec, s[4:5] 599; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 600; GFX9-NEXT: ; implicit-def: $vgpr0 601; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 602; GFX9-NEXT: s_cbranch_execz .LBB2_2 603; GFX9-NEXT: ; %bb.1: 604; GFX9-NEXT: s_mov_b32 s11, 0xf000 605; GFX9-NEXT: s_mov_b32 s10, -1 606; GFX9-NEXT: s_waitcnt lgkmcnt(0) 607; GFX9-NEXT: s_mov_b32 s8, s2 608; GFX9-NEXT: s_mov_b32 s9, s3 609; GFX9-NEXT: v_mov_b32_e32 v0, s6 610; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 611; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 612; GFX9-NEXT: s_waitcnt vmcnt(0) 613; GFX9-NEXT: buffer_wbinvl1_vol 614; GFX9-NEXT: .LBB2_2: 615; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 616; GFX9-NEXT: v_readfirstlane_b32 s4, v0 617; GFX9-NEXT: v_mov_b32_e32 v0, v1 618; GFX9-NEXT: s_waitcnt lgkmcnt(0) 619; GFX9-NEXT: s_mov_b32 s3, 0xf000 620; GFX9-NEXT: s_mov_b32 s2, -1 621; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 622; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 623; GFX9-NEXT: s_endpgm 624; 625; GFX1064-LABEL: add_i32_varying: 626; GFX1064: ; %bb.0: ; %entry 627; GFX1064-NEXT: v_mov_b32_e32 v1, v0 628; GFX1064-NEXT: s_not_b64 exec, exec 629; GFX1064-NEXT: v_mov_b32_e32 v1, 0 630; GFX1064-NEXT: s_not_b64 exec, exec 631; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 632; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 633; GFX1064-NEXT: v_mov_b32_e32 v3, 0 634; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 635; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 636; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 637; 
GFX1064-NEXT: v_mov_b32_e32 v2, v1 638; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 639; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 640; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 641; GFX1064-NEXT: v_mov_b32_e32 v2, s4 642; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 643; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 644; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 645; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 646; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 647; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 648; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 649; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 650; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 651; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 652; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 653; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 654; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 655; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 656; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 657; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 658; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 659; GFX1064-NEXT: s_mov_b32 s4, s9 660; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 661; GFX1064-NEXT: s_mov_b64 exec, s[6:7] 662; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 663; GFX1064-NEXT: s_mov_b32 s6, -1 664; GFX1064-NEXT: ; implicit-def: $vgpr0 665; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc 666; GFX1064-NEXT: s_cbranch_execz .LBB2_2 667; GFX1064-NEXT: ; %bb.1: 668; GFX1064-NEXT: v_mov_b32_e32 v0, s4 669; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 670; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 671; GFX1064-NEXT: s_mov_b32 s4, s2 672; GFX1064-NEXT: s_mov_b32 s5, s3 673; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 674; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 675; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 676; GFX1064-NEXT: s_waitcnt vmcnt(0) 677; GFX1064-NEXT: buffer_gl0_inv 678; GFX1064-NEXT: buffer_gl1_inv 679; GFX1064-NEXT: .LBB2_2: 680; 
GFX1064-NEXT: s_waitcnt_depctr 0xffe3 681; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] 682; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 683; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 684; GFX1064-NEXT: v_mov_b32_e32 v0, v3 685; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 686; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v0 687; GFX1064-NEXT: s_mov_b32 s2, s6 688; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 689; GFX1064-NEXT: s_endpgm 690; 691; GFX1032-LABEL: add_i32_varying: 692; GFX1032: ; %bb.0: ; %entry 693; GFX1032-NEXT: v_mov_b32_e32 v1, v0 694; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 695; GFX1032-NEXT: v_mov_b32_e32 v1, 0 696; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 697; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 698; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 699; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 700; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 701; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 702; GFX1032-NEXT: v_mov_b32_e32 v2, v1 703; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 704; GFX1032-NEXT: s_mov_b32 exec_lo, s2 705; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 706; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 707; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 708; GFX1032-NEXT: v_mov_b32_e32 v3, 0 709; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 710; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 711; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 712; GFX1032-NEXT: s_mov_b32 exec_lo, s4 713; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 714; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 715; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 716; GFX1032-NEXT: s_mov_b32 exec_lo, s4 717; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 718; GFX1032-NEXT: s_mov_b32 s4, s6 719; GFX1032-NEXT: s_mov_b32 s6, -1 720; GFX1032-NEXT: ; 
implicit-def: $vgpr0 721; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo 722; GFX1032-NEXT: s_cbranch_execz .LBB2_2 723; GFX1032-NEXT: ; %bb.1: 724; GFX1032-NEXT: v_mov_b32_e32 v0, s4 725; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 726; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 727; GFX1032-NEXT: s_mov_b32 s4, s2 728; GFX1032-NEXT: s_mov_b32 s5, s3 729; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 730; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 731; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 732; GFX1032-NEXT: s_waitcnt vmcnt(0) 733; GFX1032-NEXT: buffer_gl0_inv 734; GFX1032-NEXT: buffer_gl1_inv 735; GFX1032-NEXT: .LBB2_2: 736; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 737; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 738; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 739; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 740; GFX1032-NEXT: v_mov_b32_e32 v0, v3 741; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 742; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v0 743; GFX1032-NEXT: s_mov_b32 s2, s6 744; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 745; GFX1032-NEXT: s_endpgm 746; 747; GFX1164-LABEL: add_i32_varying: 748; GFX1164: ; %bb.0: ; %entry 749; GFX1164-NEXT: v_mov_b32_e32 v1, v0 750; GFX1164-NEXT: s_not_b64 exec, exec 751; GFX1164-NEXT: v_mov_b32_e32 v1, 0 752; GFX1164-NEXT: s_not_b64 exec, exec 753; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 754; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 755; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 756; GFX1164-NEXT: v_mov_b32_e32 v3, 0 757; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 758; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 759; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 760; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 761; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) 762; GFX1164-NEXT: v_mov_b32_e32 v2, v1 763; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 764; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 765; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 766; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 767; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 768; GFX1164-NEXT: v_mov_b32_e32 v2, s4 769; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 770; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 771; GFX1164-NEXT: v_readlane_b32 s6, v1, 15 772; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 773; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 774; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 775; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 776; GFX1164-NEXT: v_readlane_b32 s7, v1, 31 777; GFX1164-NEXT: v_writelane_b32 v3, s6, 16 778; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 779; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 780; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 781; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 782; GFX1164-NEXT: v_readlane_b32 s8, v1, 47 783; GFX1164-NEXT: v_readlane_b32 s9, v1, 63 784; GFX1164-NEXT: v_writelane_b32 v3, s7, 32 785; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 786; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 787; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 788; GFX1164-NEXT: s_or_saveexec_b64 s[6:7], -1 789; GFX1164-NEXT: s_mov_b32 s4, s9 790; GFX1164-NEXT: v_writelane_b32 v3, s8, 48 791; GFX1164-NEXT: s_mov_b64 exec, s[6:7] 792; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 793; GFX1164-NEXT: s_mov_b32 s6, -1 794; GFX1164-NEXT: ; implicit-def: $vgpr0 795; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], vcc 796; GFX1164-NEXT: s_cbranch_execz .LBB2_2 797; GFX1164-NEXT: ; %bb.1: 798; GFX1164-NEXT: v_mov_b32_e32 v0, s4 799; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 800; GFX1164-NEXT: s_waitcnt 
lgkmcnt(0) 801; GFX1164-NEXT: s_mov_b32 s4, s2 802; GFX1164-NEXT: s_mov_b32 s5, s3 803; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 804; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 805; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc 806; GFX1164-NEXT: s_waitcnt vmcnt(0) 807; GFX1164-NEXT: buffer_gl0_inv 808; GFX1164-NEXT: buffer_gl1_inv 809; GFX1164-NEXT: .LBB2_2: 810; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] 811; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 812; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 813; GFX1164-NEXT: v_mov_b32_e32 v0, v3 814; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 815; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 816; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 817; GFX1164-NEXT: s_mov_b32 s2, s6 818; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 819; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 820; GFX1164-NEXT: s_endpgm 821; 822; GFX1132-LABEL: add_i32_varying: 823; GFX1132: ; %bb.0: ; %entry 824; GFX1132-NEXT: v_mov_b32_e32 v1, v0 825; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 826; GFX1132-NEXT: v_mov_b32_e32 v1, 0 827; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 828; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 829; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 830; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 831; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 832; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 833; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 834; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 835; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 836; GFX1132-NEXT: v_mov_b32_e32 v2, v1 837; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 838; GFX1132-NEXT: s_mov_b32 exec_lo, s2 839; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 
840; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 841; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 842; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 843; GFX1132-NEXT: v_mov_b32_e32 v3, 0 844; GFX1132-NEXT: v_readlane_b32 s5, v1, 15 845; GFX1132-NEXT: v_readlane_b32 s6, v1, 31 846; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 847; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 848; GFX1132-NEXT: s_mov_b32 exec_lo, s4 849; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 850; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 851; GFX1132-NEXT: v_writelane_b32 v3, s5, 16 852; GFX1132-NEXT: s_mov_b32 exec_lo, s4 853; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 854; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 855; GFX1132-NEXT: s_mov_b32 s4, s6 856; GFX1132-NEXT: s_mov_b32 s6, -1 857; GFX1132-NEXT: ; implicit-def: $vgpr0 858; GFX1132-NEXT: s_and_saveexec_b32 s8, vcc_lo 859; GFX1132-NEXT: s_cbranch_execz .LBB2_2 860; GFX1132-NEXT: ; %bb.1: 861; GFX1132-NEXT: v_mov_b32_e32 v0, s4 862; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 863; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 864; GFX1132-NEXT: s_mov_b32 s4, s2 865; GFX1132-NEXT: s_mov_b32 s5, s3 866; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 867; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 868; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc 869; GFX1132-NEXT: s_waitcnt vmcnt(0) 870; GFX1132-NEXT: buffer_gl0_inv 871; GFX1132-NEXT: buffer_gl1_inv 872; GFX1132-NEXT: .LBB2_2: 873; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 874; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 875; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 876; GFX1132-NEXT: v_mov_b32_e32 v0, v3 877; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 878; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 879; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 880; GFX1132-NEXT: s_mov_b32 s2, s6 881; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 882; 
GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; 64-bit atomic add of the constant 5 to a global (addrspace 1) pointer; the
; value returned by the atomic is stored to %out.
; What the checks below expect on every target:
;   * only the lane whose mbcnt result is 0 enters the block that issues the
;     atomic, and its operand is 5 * bcnt(saved exec mask);
;   * after exec is restored, each lane rebuilds its own result as
;     readfirstlane(returned value) + 5 * (its mbcnt result).
define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
; GFX7LESS-LABEL: add_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec
; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB3_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5
; GFX7LESS-NEXT: s_mov_b32 s10, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s8, s2
; GFX7LESS-NEXT: s_mov_b32 s9, s3
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: .LBB3_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX89-LABEL: add_i64_constant:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX89-NEXT: s_mov_b64 s[6:7], exec
; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX89-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX89-NEXT: s_cbranch_execz .LBB3_2
; GFX89-NEXT: ; %bb.1:
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s8, s2
; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
; GFX89-NEXT: s_mul_i32 s2, s2, 5
; GFX89-NEXT: s_mov_b32 s11, 0xf000
; GFX89-NEXT: s_mov_b32 s10, -1
; GFX89-NEXT: s_mov_b32 s9, s3
; GFX89-NEXT: v_mov_b32_e32 v0, s2
; GFX89-NEXT: v_mov_b32_e32 v1, 0
; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
; GFX89-NEXT: .LBB3_2:
; GFX89-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_readfirstlane_b32 s2, v0
; GFX89-NEXT: v_readfirstlane_b32 s3, v1
; GFX89-NEXT: v_mov_b32_e32 v0, s2
; GFX89-NEXT: v_mov_b32_e32 v1, s3
; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
; GFX89-NEXT: s_mov_b32 s3, 0xf000
; GFX89-NEXT: s_mov_b32 s2, -1
; GFX89-NEXT: s_nop 2
; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB3_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_mul_i32 s6, s6, 5
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
; GFX1064-NEXT: v_mov_b32_e32 v0, s6
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mov_b32 s8, s2
; GFX1064-NEXT: s_mov_b32 s9, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: .LBB3_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s5, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB3_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_mul_i32 s5, s5, 5
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
; GFX1032-NEXT: v_mov_b32_e32 v0, s5
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mov_b32 s8, s2
; GFX1032-NEXT: s_mov_b32 s9, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: .LBB3_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i64_constant:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX1164-NEXT: s_mov_b64 s[6:7], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB3_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_mul_i32 s6, s6, 5
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: v_mov_b32_e32 v0, s6
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_mov_b32 s8, s2
; GFX1164-NEXT: s_mov_b32 s9, s3
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: .LBB3_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i64_constant:
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX1132-NEXT: s_mov_b32 s5, exec_lo
; GFX1132-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB3_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-NEXT: s_mul_i32 s5, s5, 5
; GFX1132-NEXT: s_mov_b32 s10, -1
; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_mov_b32 s8, s2
; GFX1132-NEXT: s_mov_b32 s9, s3
; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: .LBB3_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; 64-bit atomic add whose addend is the kernel argument %additive; the value
; returned by the atomic is stored to %out.
; The checks below have the same single-lane shape as the constant-addend test:
; one lane (mbcnt result 0) issues the atomic with %additive * bcnt(saved exec
; mask), and every lane then computes
; readfirstlane(returned value) + %additive * (its mbcnt result).
; The 64-bit multiplies appear as 32-bit mul / mul_hi / add sequences or as
; v_mad_u64_u32, depending on the target.
define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {
; GFX7LESS-LABEL: add_i64_uniform:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec
; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s14, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6
; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: .LBB4_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s6, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2
; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2
; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2
; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0
; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i64_uniform:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b64 s[8:9], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB4_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s12, s6
; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
; GFX8-NEXT: s_mul_i32 s6, s1, s6
; GFX8-NEXT: s_mov_b32 s15, 0xf000
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s13, s7
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v3, s1, v2
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1]
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i64_uniform:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: s_mov_b64 s[8:9], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB4_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mul_i32 s7, s3, s6
; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6
; GFX9-NEXT: s_add_i32 s8, s8, s7
; GFX9-NEXT: s_mul_i32 s6, s2, s6
; GFX9-NEXT: s_mov_b32 s15, 0xf000
; GFX9-NEXT: s_mov_b32 s14, -1
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2]
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i64_uniform:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_clause 0x1
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1064-NEXT: s_mov_b64 s[8:9], exec
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB4_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mul_i32 s9, s3, s8
; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8
; GFX1064-NEXT: s_mul_i32 s8, s2, s8
; GFX1064-NEXT: s_add_i32 s10, s10, s9
; GFX1064-NEXT: v_mov_b32_e32 v0, s8
; GFX1064-NEXT: v_mov_b32_e32 v1, s10
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_mov_b32 s8, s6
; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: .LBB4_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, s[0:1]
; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2]
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i64_uniform:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_clause 0x1
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1032-NEXT: s_mov_b32 s8, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mul_i32 s8, s3, s1
; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1
; GFX1032-NEXT: s_mul_i32 s1, s2, s1
; GFX1032-NEXT: s_add_i32 s9, s9, s8
; GFX1032-NEXT: v_mov_b32_e32 v0, s1
; GFX1032-NEXT: v_mov_b32_e32 v1, s9
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s8, s6
; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: .LBB4_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v2, s[0:1]
; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s3, v2, v[1:2]
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i64_uniform:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX1164-NEXT: s_mov_b64 s[8:9], exec
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB4_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_mul_i32 s9, s1, s8
; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8
; GFX1164-NEXT: s_mul_i32 s8, s0, s8
; GFX1164-NEXT: s_add_i32 s10, s10, s9
; GFX1164-NEXT: v_mov_b32_e32 v0, s8
; GFX1164-NEXT: v_mov_b32_e32 v1, s10
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_mov_b32 s8, s6
; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: .LBB4_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i64_uniform:
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB4_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_mul_i32 s8, s1, s3
; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3
; GFX1132-NEXT: s_mul_i32 s3, s0, s3
; GFX1132-NEXT: s_add_i32 s9, s9, s8
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9
; GFX1132-NEXT: s_mov_b32 s10, -1
; GFX1132-NEXT: s_mov_b32 s8, s6
; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: .LBB4_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; 64-bit atomic add whose addend is zext(workitem.id.x); the value returned by
; the atomic is stored to %out.
; Unlike the constant and uniform i64 tests in this file, the checks below
; contain no mbcnt / saveexec single-lane sequence: every prefix expects just a
; plain buffer_atomic_add_x2 (buffer_atomic_add_u64 on GFX11) followed by the
; store. The shared GFX89, GFX10 and GFX11 prefixes are used for this function.
; NOTE(review): presumably the optimization is simply not applied to this
; case - confirm against the pass before relying on that reading.
define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
; GFX7LESS-LABEL: add_i64_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s6, -1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 s10, s6
; GFX7LESS-NEXT: s_mov_b32 s11, s7
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s8, s2
; GFX7LESS-NEXT: s_mov_b32 s9, s3
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: s_mov_b32 s4, s0
; GFX7LESS-NEXT: s_mov_b32 s5, s1
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX89-LABEL: add_i64_varying:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: s_mov_b32 s10, s6
; GFX89-NEXT: s_mov_b32 s11, s7
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s8, s2
; GFX89-NEXT: s_mov_b32 s9, s3
; GFX89-NEXT: v_mov_b32_e32 v1, 0
; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
; GFX89-NEXT: s_mov_b32 s4, s0
; GFX89-NEXT: s_mov_b32 s5, s1
; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
; GFX10-LABEL: add_i64_varying:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_mov_b32 s11, s7
; GFX10-NEXT: s_mov_b32 s10, s6
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s8, s2
; GFX10-NEXT: s_mov_b32 s9, s3
; GFX10-NEXT: s_mov_b32 s4, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_mov_b32 s5, s1
; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i64_varying:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s11, s7
; GFX11-NEXT: s_mov_b32 s10, s6
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s8, s2
; GFX11-NEXT: s_mov_b32 s9, s3
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
; GFX7LESS-LABEL:
sub_i32_constant: 1540; GFX7LESS: ; %bb.0: ; %entry 1541; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1542; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1543; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1544; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1545; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1546; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1547; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1548; GFX7LESS-NEXT: s_cbranch_execz .LBB6_2 1549; GFX7LESS-NEXT: ; %bb.1: 1550; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 1551; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1552; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 1553; GFX7LESS-NEXT: s_mov_b32 s10, -1 1554; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1555; GFX7LESS-NEXT: s_mov_b32 s8, s2 1556; GFX7LESS-NEXT: s_mov_b32 s9, s3 1557; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1558; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1559; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1560; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1561; GFX7LESS-NEXT: buffer_wbinvl1 1562; GFX7LESS-NEXT: .LBB6_2: 1563; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1564; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1565; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1566; GFX7LESS-NEXT: s_mov_b32 s2, -1 1567; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1568; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1569; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1570; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1571; GFX7LESS-NEXT: s_endpgm 1572; 1573; GFX8-LABEL: sub_i32_constant: 1574; GFX8: ; %bb.0: ; %entry 1575; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1576; GFX8-NEXT: s_mov_b64 s[6:7], exec 1577; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1578; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1579; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1580; GFX8-NEXT: ; implicit-def: $vgpr1 1581; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1582; GFX8-NEXT: s_cbranch_execz .LBB6_2 1583; GFX8-NEXT: ; %bb.1: 1584; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1585; GFX8-NEXT: s_mov_b32 s8, s2 1586; GFX8-NEXT: 
s_bcnt1_i32_b64 s2, s[6:7] 1587; GFX8-NEXT: s_mul_i32 s2, s2, 5 1588; GFX8-NEXT: s_mov_b32 s11, 0xf000 1589; GFX8-NEXT: s_mov_b32 s10, -1 1590; GFX8-NEXT: s_mov_b32 s9, s3 1591; GFX8-NEXT: v_mov_b32_e32 v1, s2 1592; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1593; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1594; GFX8-NEXT: s_waitcnt vmcnt(0) 1595; GFX8-NEXT: buffer_wbinvl1_vol 1596; GFX8-NEXT: .LBB6_2: 1597; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1598; GFX8-NEXT: v_readfirstlane_b32 s4, v1 1599; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1600; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1601; GFX8-NEXT: s_mov_b32 s3, 0xf000 1602; GFX8-NEXT: s_mov_b32 s2, -1 1603; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 1604; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1605; GFX8-NEXT: s_endpgm 1606; 1607; GFX9-LABEL: sub_i32_constant: 1608; GFX9: ; %bb.0: ; %entry 1609; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1610; GFX9-NEXT: s_mov_b64 s[6:7], exec 1611; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1612; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1613; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1614; GFX9-NEXT: ; implicit-def: $vgpr1 1615; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1616; GFX9-NEXT: s_cbranch_execz .LBB6_2 1617; GFX9-NEXT: ; %bb.1: 1618; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1619; GFX9-NEXT: s_mov_b32 s8, s2 1620; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1621; GFX9-NEXT: s_mul_i32 s2, s2, 5 1622; GFX9-NEXT: s_mov_b32 s11, 0xf000 1623; GFX9-NEXT: s_mov_b32 s10, -1 1624; GFX9-NEXT: s_mov_b32 s9, s3 1625; GFX9-NEXT: v_mov_b32_e32 v1, s2 1626; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1627; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1628; GFX9-NEXT: s_waitcnt vmcnt(0) 1629; GFX9-NEXT: buffer_wbinvl1_vol 1630; GFX9-NEXT: .LBB6_2: 1631; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1632; GFX9-NEXT: v_readfirstlane_b32 s4, v1 1633; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1634; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1635; GFX9-NEXT: s_mov_b32 s3, 0xf000 1636; GFX9-NEXT: s_mov_b32 s2, -1 1637; 
GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1638; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1639; GFX9-NEXT: s_endpgm 1640; 1641; GFX1064-LABEL: sub_i32_constant: 1642; GFX1064: ; %bb.0: ; %entry 1643; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1644; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1645; GFX1064-NEXT: ; implicit-def: $vgpr1 1646; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1647; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1648; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1649; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1650; GFX1064-NEXT: s_cbranch_execz .LBB6_2 1651; GFX1064-NEXT: ; %bb.1: 1652; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1653; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 1654; GFX1064-NEXT: s_mul_i32 s6, s6, 5 1655; GFX1064-NEXT: s_mov_b32 s10, -1 1656; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1657; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1658; GFX1064-NEXT: s_mov_b32 s8, s2 1659; GFX1064-NEXT: s_mov_b32 s9, s3 1660; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1661; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1662; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1663; GFX1064-NEXT: s_waitcnt vmcnt(0) 1664; GFX1064-NEXT: buffer_gl0_inv 1665; GFX1064-NEXT: buffer_gl1_inv 1666; GFX1064-NEXT: .LBB6_2: 1667; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1668; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1669; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1670; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1671; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1672; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1673; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1674; GFX1064-NEXT: s_mov_b32 s2, -1 1675; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1676; GFX1064-NEXT: s_endpgm 1677; 1678; GFX1032-LABEL: sub_i32_constant: 1679; GFX1032: ; %bb.0: ; %entry 1680; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1681; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1682; GFX1032-NEXT: ; implicit-def: $vgpr1 1683; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 1684; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 
1685; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1686; GFX1032-NEXT: s_cbranch_execz .LBB6_2 1687; GFX1032-NEXT: ; %bb.1: 1688; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1689; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 1690; GFX1032-NEXT: s_mul_i32 s5, s5, 5 1691; GFX1032-NEXT: s_mov_b32 s10, -1 1692; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1693; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1694; GFX1032-NEXT: s_mov_b32 s8, s2 1695; GFX1032-NEXT: s_mov_b32 s9, s3 1696; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1697; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1698; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1699; GFX1032-NEXT: s_waitcnt vmcnt(0) 1700; GFX1032-NEXT: buffer_gl0_inv 1701; GFX1032-NEXT: buffer_gl1_inv 1702; GFX1032-NEXT: .LBB6_2: 1703; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1704; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1705; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1706; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1707; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1708; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1709; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1710; GFX1032-NEXT: s_mov_b32 s2, -1 1711; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1712; GFX1032-NEXT: s_endpgm 1713; 1714; GFX1164-LABEL: sub_i32_constant: 1715; GFX1164: ; %bb.0: ; %entry 1716; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1717; GFX1164-NEXT: s_mov_b64 s[6:7], exec 1718; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1719; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1720; GFX1164-NEXT: ; implicit-def: $vgpr1 1721; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1722; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1723; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 1724; GFX1164-NEXT: s_cbranch_execz .LBB6_2 1725; GFX1164-NEXT: ; %bb.1: 1726; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1727; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 1728; GFX1164-NEXT: s_mul_i32 s6, s6, 5 1729; GFX1164-NEXT: s_mov_b32 s10, -1 1730; GFX1164-NEXT: v_mov_b32_e32 v1, s6 1731; GFX1164-NEXT: s_waitcnt 
lgkmcnt(0) 1732; GFX1164-NEXT: s_mov_b32 s8, s2 1733; GFX1164-NEXT: s_mov_b32 s9, s3 1734; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1735; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1736; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1737; GFX1164-NEXT: s_waitcnt vmcnt(0) 1738; GFX1164-NEXT: buffer_gl0_inv 1739; GFX1164-NEXT: buffer_gl1_inv 1740; GFX1164-NEXT: .LBB6_2: 1741; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1742; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1743; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 1744; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1745; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1746; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 1747; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1748; GFX1164-NEXT: s_mov_b32 s2, -1 1749; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1750; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1751; GFX1164-NEXT: s_endpgm 1752; 1753; GFX1132-LABEL: sub_i32_constant: 1754; GFX1132: ; %bb.0: ; %entry 1755; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1756; GFX1132-NEXT: s_mov_b32 s5, exec_lo 1757; GFX1132-NEXT: s_mov_b32 s4, exec_lo 1758; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 1759; GFX1132-NEXT: ; implicit-def: $vgpr1 1760; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1761; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 1762; GFX1132-NEXT: s_cbranch_execz .LBB6_2 1763; GFX1132-NEXT: ; %bb.1: 1764; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 1765; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 1766; GFX1132-NEXT: s_mul_i32 s5, s5, 5 1767; GFX1132-NEXT: s_mov_b32 s10, -1 1768; GFX1132-NEXT: v_mov_b32_e32 v1, s5 1769; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1770; GFX1132-NEXT: s_mov_b32 s8, s2 1771; GFX1132-NEXT: s_mov_b32 s9, s3 1772; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1773; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1774; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1775; GFX1132-NEXT: s_waitcnt vmcnt(0) 1776; GFX1132-NEXT: buffer_gl0_inv 1777; GFX1132-NEXT: buffer_gl1_inv 1778; GFX1132-NEXT: .LBB6_2: 1779; 
GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 1780; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1781; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 1782; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1783; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1784; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1785; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1786; GFX1132-NEXT: s_mov_b32 s2, -1 1787; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1788; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1789; GFX1132-NEXT: s_endpgm 1790entry: 1791 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel 1792 store i32 %old, i32 addrspace(1)* %out 1793 ret void 1794} 1795 1796define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) { 1797; GFX7LESS-LABEL: sub_i32_uniform: 1798; GFX7LESS: ; %bb.0: ; %entry 1799; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1800; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1801; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd 1802; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1803; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1804; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1805; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1806; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1807; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 1808; GFX7LESS-NEXT: ; %bb.1: 1809; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 1810; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1811; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1812; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 1813; GFX7LESS-NEXT: s_mov_b32 s14, -1 1814; GFX7LESS-NEXT: s_mov_b32 s12, s6 1815; GFX7LESS-NEXT: s_mov_b32 s13, s7 1816; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 1817; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1818; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1819; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1820; GFX7LESS-NEXT: buffer_wbinvl1 1821; GFX7LESS-NEXT: .LBB7_2: 1822; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1823; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1824; GFX7LESS-NEXT: s_mov_b32 s7, 
0xf000 1825; GFX7LESS-NEXT: s_mov_b32 s6, -1 1826; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1827; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 1828; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1829; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1830; GFX7LESS-NEXT: s_endpgm 1831; 1832; GFX8-LABEL: sub_i32_uniform: 1833; GFX8: ; %bb.0: ; %entry 1834; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1835; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 1836; GFX8-NEXT: s_mov_b64 s[2:3], exec 1837; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1838; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1839; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1840; GFX8-NEXT: ; implicit-def: $vgpr1 1841; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1842; GFX8-NEXT: s_cbranch_execz .LBB7_2 1843; GFX8-NEXT: ; %bb.1: 1844; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1845; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1846; GFX8-NEXT: s_mul_i32 s2, s8, s2 1847; GFX8-NEXT: s_mov_b32 s15, 0xf000 1848; GFX8-NEXT: s_mov_b32 s14, -1 1849; GFX8-NEXT: s_mov_b32 s12, s6 1850; GFX8-NEXT: s_mov_b32 s13, s7 1851; GFX8-NEXT: v_mov_b32_e32 v1, s2 1852; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1853; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1854; GFX8-NEXT: s_waitcnt vmcnt(0) 1855; GFX8-NEXT: buffer_wbinvl1_vol 1856; GFX8-NEXT: .LBB7_2: 1857; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1858; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1859; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 1860; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1861; GFX8-NEXT: s_mov_b32 s7, 0xf000 1862; GFX8-NEXT: s_mov_b32 s6, -1 1863; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1864; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1865; GFX8-NEXT: s_endpgm 1866; 1867; GFX9-LABEL: sub_i32_uniform: 1868; GFX9: ; %bb.0: ; %entry 1869; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1870; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 1871; GFX9-NEXT: s_mov_b64 s[2:3], exec 1872; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1873; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1874; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v0 1875; GFX9-NEXT: ; implicit-def: $vgpr1 1876; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1877; GFX9-NEXT: s_cbranch_execz .LBB7_2 1878; GFX9-NEXT: ; %bb.1: 1879; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1880; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1881; GFX9-NEXT: s_mul_i32 s2, s8, s2 1882; GFX9-NEXT: s_mov_b32 s15, 0xf000 1883; GFX9-NEXT: s_mov_b32 s14, -1 1884; GFX9-NEXT: s_mov_b32 s12, s6 1885; GFX9-NEXT: s_mov_b32 s13, s7 1886; GFX9-NEXT: v_mov_b32_e32 v1, s2 1887; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1888; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1889; GFX9-NEXT: s_waitcnt vmcnt(0) 1890; GFX9-NEXT: buffer_wbinvl1_vol 1891; GFX9-NEXT: .LBB7_2: 1892; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1893; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1894; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 1895; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1896; GFX9-NEXT: s_mov_b32 s7, 0xf000 1897; GFX9-NEXT: s_mov_b32 s6, -1 1898; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1899; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1900; GFX9-NEXT: s_endpgm 1901; 1902; GFX1064-LABEL: sub_i32_uniform: 1903; GFX1064: ; %bb.0: ; %entry 1904; GFX1064-NEXT: s_clause 0x1 1905; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1906; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 1907; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1908; GFX1064-NEXT: ; implicit-def: $vgpr1 1909; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1910; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1911; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1912; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1913; GFX1064-NEXT: s_cbranch_execz .LBB7_2 1914; GFX1064-NEXT: ; %bb.1: 1915; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1916; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 1917; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1918; GFX1064-NEXT: s_mul_i32 s2, s8, s2 1919; GFX1064-NEXT: s_mov_b32 s14, -1 1920; GFX1064-NEXT: v_mov_b32_e32 v1, s2 1921; GFX1064-NEXT: s_mov_b32 s12, s6 1922; GFX1064-NEXT: s_mov_b32 s13, s7 1923; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1924; 
GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1925; GFX1064-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1926; GFX1064-NEXT: s_waitcnt vmcnt(0) 1927; GFX1064-NEXT: buffer_gl0_inv 1928; GFX1064-NEXT: buffer_gl1_inv 1929; GFX1064-NEXT: .LBB7_2: 1930; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1931; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1932; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1933; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 1934; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1935; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1936; GFX1064-NEXT: s_mov_b32 s6, -1 1937; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1938; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1939; GFX1064-NEXT: s_endpgm 1940; 1941; GFX1032-LABEL: sub_i32_uniform: 1942; GFX1032: ; %bb.0: ; %entry 1943; GFX1032-NEXT: s_clause 0x1 1944; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1945; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 1946; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1947; GFX1032-NEXT: ; implicit-def: $vgpr1 1948; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1949; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1950; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1951; GFX1032-NEXT: s_cbranch_execz .LBB7_2 1952; GFX1032-NEXT: ; %bb.1: 1953; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1954; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 1955; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1956; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1957; GFX1032-NEXT: s_mov_b32 s10, -1 1958; GFX1032-NEXT: v_mov_b32_e32 v1, s1 1959; GFX1032-NEXT: s_mov_b32 s8, s6 1960; GFX1032-NEXT: s_mov_b32 s9, s7 1961; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1962; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1963; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1964; GFX1032-NEXT: s_waitcnt vmcnt(0) 1965; GFX1032-NEXT: buffer_gl0_inv 1966; GFX1032-NEXT: buffer_gl1_inv 1967; GFX1032-NEXT: .LBB7_2: 1968; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1969; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1970; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1971; GFX1032-NEXT: v_mul_lo_u32 v0, 
s2, v0 1972; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1973; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1974; GFX1032-NEXT: s_mov_b32 s6, -1 1975; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1976; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1977; GFX1032-NEXT: s_endpgm 1978; 1979; GFX1164-LABEL: sub_i32_uniform: 1980; GFX1164: ; %bb.0: ; %entry 1981; GFX1164-NEXT: s_clause 0x1 1982; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 1983; GFX1164-NEXT: s_load_b32 s8, s[0:1], 0x34 1984; GFX1164-NEXT: s_mov_b64 s[2:3], exec 1985; GFX1164-NEXT: s_mov_b64 s[0:1], exec 1986; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1987; GFX1164-NEXT: ; implicit-def: $vgpr1 1988; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1989; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1990; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 1991; GFX1164-NEXT: s_cbranch_execz .LBB7_2 1992; GFX1164-NEXT: ; %bb.1: 1993; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1994; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 1995; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1996; GFX1164-NEXT: s_mul_i32 s2, s8, s2 1997; GFX1164-NEXT: s_mov_b32 s14, -1 1998; GFX1164-NEXT: v_mov_b32_e32 v1, s2 1999; GFX1164-NEXT: s_mov_b32 s12, s6 2000; GFX1164-NEXT: s_mov_b32 s13, s7 2001; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2002; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2003; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc 2004; GFX1164-NEXT: s_waitcnt vmcnt(0) 2005; GFX1164-NEXT: buffer_gl0_inv 2006; GFX1164-NEXT: buffer_gl1_inv 2007; GFX1164-NEXT: .LBB7_2: 2008; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 2009; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2010; GFX1164-NEXT: v_mul_lo_u32 v0, s8, v0 2011; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 2012; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 2013; GFX1164-NEXT: s_mov_b32 s6, -1 2014; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 2015; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2016; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 2017; GFX1164-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) 2018; GFX1164-NEXT: s_endpgm 2019; 2020; GFX1132-LABEL: sub_i32_uniform: 2021; GFX1132: ; %bb.0: ; %entry 2022; GFX1132-NEXT: s_clause 0x1 2023; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 2024; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x34 2025; GFX1132-NEXT: s_mov_b32 s2, exec_lo 2026; GFX1132-NEXT: s_mov_b32 s1, exec_lo 2027; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2028; GFX1132-NEXT: ; implicit-def: $vgpr1 2029; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2030; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 2031; GFX1132-NEXT: s_cbranch_execz .LBB7_2 2032; GFX1132-NEXT: ; %bb.1: 2033; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 2034; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 2035; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2036; GFX1132-NEXT: s_mul_i32 s2, s0, s2 2037; GFX1132-NEXT: s_mov_b32 s10, -1 2038; GFX1132-NEXT: v_mov_b32_e32 v1, s2 2039; GFX1132-NEXT: s_mov_b32 s8, s6 2040; GFX1132-NEXT: s_mov_b32 s9, s7 2041; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2042; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2043; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 2044; GFX1132-NEXT: s_waitcnt vmcnt(0) 2045; GFX1132-NEXT: buffer_gl0_inv 2046; GFX1132-NEXT: buffer_gl1_inv 2047; GFX1132-NEXT: .LBB7_2: 2048; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 2049; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2050; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 2051; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 2052; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 2053; GFX1132-NEXT: s_mov_b32 s6, -1 2054; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2055; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2056; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 2057; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2058; GFX1132-NEXT: s_endpgm 2059entry: 2060 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel 2061 store i32 %old, i32 addrspace(1)* %out 2062 ret void 2063} 2064 2065define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 
2066; GFX7LESS-LABEL: sub_i32_varying: 2067; GFX7LESS: ; %bb.0: ; %entry 2068; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2069; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2070; GFX7LESS-NEXT: s_mov_b32 s6, -1 2071; GFX7LESS-NEXT: s_mov_b32 s10, s6 2072; GFX7LESS-NEXT: s_mov_b32 s11, s7 2073; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2074; GFX7LESS-NEXT: s_mov_b32 s8, s2 2075; GFX7LESS-NEXT: s_mov_b32 s9, s3 2076; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2077; GFX7LESS-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 2078; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 2079; GFX7LESS-NEXT: buffer_wbinvl1 2080; GFX7LESS-NEXT: s_mov_b32 s4, s0 2081; GFX7LESS-NEXT: s_mov_b32 s5, s1 2082; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 2083; GFX7LESS-NEXT: s_endpgm 2084; 2085; GFX8-LABEL: sub_i32_varying: 2086; GFX8: ; %bb.0: ; %entry 2087; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2088; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 2089; GFX8-NEXT: v_mov_b32_e32 v1, 0 2090; GFX8-NEXT: s_mov_b64 exec, s[4:5] 2091; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2092; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2093; GFX8-NEXT: v_mov_b32_e32 v2, v0 2094; GFX8-NEXT: s_not_b64 exec, exec 2095; GFX8-NEXT: v_mov_b32_e32 v2, 0 2096; GFX8-NEXT: s_not_b64 exec, exec 2097; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 2098; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2099; GFX8-NEXT: s_nop 1 2100; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2101; GFX8-NEXT: s_nop 1 2102; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2103; GFX8-NEXT: s_nop 1 2104; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2105; GFX8-NEXT: s_nop 1 2106; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2107; GFX8-NEXT: s_nop 1 2108; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2109; 
GFX8-NEXT: v_readlane_b32 s6, v2, 63 2110; GFX8-NEXT: s_nop 0 2111; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2112; GFX8-NEXT: s_mov_b64 exec, s[4:5] 2113; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2114; GFX8-NEXT: ; implicit-def: $vgpr0 2115; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2116; GFX8-NEXT: s_cbranch_execz .LBB8_2 2117; GFX8-NEXT: ; %bb.1: 2118; GFX8-NEXT: s_mov_b32 s11, 0xf000 2119; GFX8-NEXT: s_mov_b32 s10, -1 2120; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2121; GFX8-NEXT: s_mov_b32 s8, s2 2122; GFX8-NEXT: s_mov_b32 s9, s3 2123; GFX8-NEXT: v_mov_b32_e32 v0, s6 2124; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2125; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 2126; GFX8-NEXT: s_waitcnt vmcnt(0) 2127; GFX8-NEXT: buffer_wbinvl1_vol 2128; GFX8-NEXT: .LBB8_2: 2129; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2130; GFX8-NEXT: v_readfirstlane_b32 s4, v0 2131; GFX8-NEXT: v_mov_b32_e32 v0, v1 2132; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2133; GFX8-NEXT: s_mov_b32 s3, 0xf000 2134; GFX8-NEXT: s_mov_b32 s2, -1 2135; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 2136; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2137; GFX8-NEXT: s_endpgm 2138; 2139; GFX9-LABEL: sub_i32_varying: 2140; GFX9: ; %bb.0: ; %entry 2141; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2142; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2143; GFX9-NEXT: v_mov_b32_e32 v1, 0 2144; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2145; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2146; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2147; GFX9-NEXT: v_mov_b32_e32 v2, v0 2148; GFX9-NEXT: s_not_b64 exec, exec 2149; GFX9-NEXT: v_mov_b32_e32 v2, 0 2150; GFX9-NEXT: s_not_b64 exec, exec 2151; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2152; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2153; GFX9-NEXT: s_nop 1 2154; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2155; GFX9-NEXT: s_nop 1 2156; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 
row_mask:0xf bank_mask:0xf bound_ctrl:1 2157; GFX9-NEXT: s_nop 1 2158; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2159; GFX9-NEXT: s_nop 1 2160; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2161; GFX9-NEXT: s_nop 1 2162; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2163; GFX9-NEXT: v_readlane_b32 s6, v2, 63 2164; GFX9-NEXT: s_nop 0 2165; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2166; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2167; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2168; GFX9-NEXT: ; implicit-def: $vgpr0 2169; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2170; GFX9-NEXT: s_cbranch_execz .LBB8_2 2171; GFX9-NEXT: ; %bb.1: 2172; GFX9-NEXT: s_mov_b32 s11, 0xf000 2173; GFX9-NEXT: s_mov_b32 s10, -1 2174; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2175; GFX9-NEXT: s_mov_b32 s8, s2 2176; GFX9-NEXT: s_mov_b32 s9, s3 2177; GFX9-NEXT: v_mov_b32_e32 v0, s6 2178; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2179; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 2180; GFX9-NEXT: s_waitcnt vmcnt(0) 2181; GFX9-NEXT: buffer_wbinvl1_vol 2182; GFX9-NEXT: .LBB8_2: 2183; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2184; GFX9-NEXT: v_readfirstlane_b32 s4, v0 2185; GFX9-NEXT: v_mov_b32_e32 v0, v1 2186; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2187; GFX9-NEXT: s_mov_b32 s3, 0xf000 2188; GFX9-NEXT: s_mov_b32 s2, -1 2189; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 2190; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2191; GFX9-NEXT: s_endpgm 2192; 2193; GFX1064-LABEL: sub_i32_varying: 2194; GFX1064: ; %bb.0: ; %entry 2195; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2196; GFX1064-NEXT: s_not_b64 exec, exec 2197; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2198; GFX1064-NEXT: s_not_b64 exec, exec 2199; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2200; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2201; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2202; GFX1064-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2203; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2204; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2205; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2206; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2207; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2208; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2209; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2210; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2211; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 2212; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2213; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2214; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2215; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2216; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 2217; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 2218; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2219; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2220; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2221; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 2222; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 2223; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 2224; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2225; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2226; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 2227; GFX1064-NEXT: s_mov_b32 s4, s9 2228; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 2229; GFX1064-NEXT: s_mov_b64 exec, s[6:7] 2230; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2231; GFX1064-NEXT: s_mov_b32 s6, -1 2232; GFX1064-NEXT: ; implicit-def: $vgpr0 2233; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc 2234; GFX1064-NEXT: s_cbranch_execz .LBB8_2 2235; GFX1064-NEXT: ; %bb.1: 2236; GFX1064-NEXT: v_mov_b32_e32 v0, s4 2237; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 2238; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2239; GFX1064-NEXT: s_mov_b32 s4, s2 2240; 
GFX1064-NEXT: s_mov_b32 s5, s3 2241; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2242; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2243; GFX1064-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 2244; GFX1064-NEXT: s_waitcnt vmcnt(0) 2245; GFX1064-NEXT: buffer_gl0_inv 2246; GFX1064-NEXT: buffer_gl1_inv 2247; GFX1064-NEXT: .LBB8_2: 2248; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2249; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] 2250; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2251; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 2252; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2253; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2254; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 2255; GFX1064-NEXT: s_mov_b32 s2, s6 2256; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2257; GFX1064-NEXT: s_endpgm 2258; 2259; GFX1032-LABEL: sub_i32_varying: 2260; GFX1032: ; %bb.0: ; %entry 2261; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2262; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2263; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2264; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2265; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2266; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2267; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2268; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2269; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2270; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2271; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2272; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2273; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2274; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 2275; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2276; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2277; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 2278; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 2279; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 
2280; GFX1032-NEXT: s_mov_b32 exec_lo, s4 2281; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2282; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 2283; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 2284; GFX1032-NEXT: s_mov_b32 exec_lo, s4 2285; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2286; GFX1032-NEXT: s_mov_b32 s4, s6 2287; GFX1032-NEXT: s_mov_b32 s6, -1 2288; GFX1032-NEXT: ; implicit-def: $vgpr0 2289; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo 2290; GFX1032-NEXT: s_cbranch_execz .LBB8_2 2291; GFX1032-NEXT: ; %bb.1: 2292; GFX1032-NEXT: v_mov_b32_e32 v0, s4 2293; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 2294; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2295; GFX1032-NEXT: s_mov_b32 s4, s2 2296; GFX1032-NEXT: s_mov_b32 s5, s3 2297; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2298; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2299; GFX1032-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 2300; GFX1032-NEXT: s_waitcnt vmcnt(0) 2301; GFX1032-NEXT: buffer_gl0_inv 2302; GFX1032-NEXT: buffer_gl1_inv 2303; GFX1032-NEXT: .LBB8_2: 2304; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2305; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 2306; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2307; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2308; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2309; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2310; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 2311; GFX1032-NEXT: s_mov_b32 s2, s6 2312; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2313; GFX1032-NEXT: s_endpgm 2314; 2315; GFX1164-LABEL: sub_i32_varying: 2316; GFX1164: ; %bb.0: ; %entry 2317; GFX1164-NEXT: v_mov_b32_e32 v1, v0 2318; GFX1164-NEXT: s_not_b64 exec, exec 2319; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2320; GFX1164-NEXT: s_not_b64 exec, exec 2321; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2322; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2323; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2324; GFX1164-NEXT: v_mov_b32_e32 v3, 0 2325; GFX1164-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2326; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2327; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2328; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2329; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2330; GFX1164-NEXT: v_mov_b32_e32 v2, v1 2331; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2332; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2333; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2334; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 2335; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2336; GFX1164-NEXT: v_mov_b32_e32 v2, s4 2337; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2338; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 2339; GFX1164-NEXT: v_readlane_b32 s6, v1, 15 2340; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2341; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2342; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 2343; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 2344; GFX1164-NEXT: v_readlane_b32 s7, v1, 31 2345; GFX1164-NEXT: v_writelane_b32 v3, s6, 16 2346; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 2347; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2348; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2349; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 2350; GFX1164-NEXT: v_readlane_b32 s8, v1, 47 2351; GFX1164-NEXT: v_readlane_b32 s9, v1, 63 2352; GFX1164-NEXT: v_writelane_b32 v3, s7, 32 2353; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 2354; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2355; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2356; GFX1164-NEXT: s_or_saveexec_b64 s[6:7], -1 2357; GFX1164-NEXT: s_mov_b32 s4, s9 
2358; GFX1164-NEXT: v_writelane_b32 v3, s8, 48 2359; GFX1164-NEXT: s_mov_b64 exec, s[6:7] 2360; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2361; GFX1164-NEXT: s_mov_b32 s6, -1 2362; GFX1164-NEXT: ; implicit-def: $vgpr0 2363; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], vcc 2364; GFX1164-NEXT: s_cbranch_execz .LBB8_2 2365; GFX1164-NEXT: ; %bb.1: 2366; GFX1164-NEXT: v_mov_b32_e32 v0, s4 2367; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 2368; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2369; GFX1164-NEXT: s_mov_b32 s4, s2 2370; GFX1164-NEXT: s_mov_b32 s5, s3 2371; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2372; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2373; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc 2374; GFX1164-NEXT: s_waitcnt vmcnt(0) 2375; GFX1164-NEXT: buffer_gl0_inv 2376; GFX1164-NEXT: buffer_gl1_inv 2377; GFX1164-NEXT: .LBB8_2: 2378; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] 2379; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2380; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 2381; GFX1164-NEXT: v_mov_b32_e32 v0, v3 2382; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2383; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 2384; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 2385; GFX1164-NEXT: s_mov_b32 s2, s6 2386; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2387; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2388; GFX1164-NEXT: s_endpgm 2389; 2390; GFX1132-LABEL: sub_i32_varying: 2391; GFX1132: ; %bb.0: ; %entry 2392; GFX1132-NEXT: v_mov_b32_e32 v1, v0 2393; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2394; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2395; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2396; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2397; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2398; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2399; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2400; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) 2401; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2402; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2403; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2404; GFX1132-NEXT: v_mov_b32_e32 v2, v1 2405; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2406; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2407; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 2408; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 2409; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2410; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2411; GFX1132-NEXT: v_mov_b32_e32 v3, 0 2412; GFX1132-NEXT: v_readlane_b32 s5, v1, 15 2413; GFX1132-NEXT: v_readlane_b32 s6, v1, 31 2414; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2415; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2416; GFX1132-NEXT: s_mov_b32 exec_lo, s4 2417; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2418; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 2419; GFX1132-NEXT: v_writelane_b32 v3, s5, 16 2420; GFX1132-NEXT: s_mov_b32 exec_lo, s4 2421; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 2422; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2423; GFX1132-NEXT: s_mov_b32 s4, s6 2424; GFX1132-NEXT: s_mov_b32 s6, -1 2425; GFX1132-NEXT: ; implicit-def: $vgpr0 2426; GFX1132-NEXT: s_and_saveexec_b32 s8, vcc_lo 2427; GFX1132-NEXT: s_cbranch_execz .LBB8_2 2428; GFX1132-NEXT: ; %bb.1: 2429; GFX1132-NEXT: v_mov_b32_e32 v0, s4 2430; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 2431; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2432; GFX1132-NEXT: s_mov_b32 s4, s2 2433; GFX1132-NEXT: s_mov_b32 s5, s3 2434; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2435; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2436; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc 2437; GFX1132-NEXT: s_waitcnt 
vmcnt(0) 2438; GFX1132-NEXT: buffer_gl0_inv 2439; GFX1132-NEXT: buffer_gl1_inv 2440; GFX1132-NEXT: .LBB8_2: 2441; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 2442; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2443; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 2444; GFX1132-NEXT: v_mov_b32_e32 v0, v3 2445; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2446; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2447; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 2448; GFX1132-NEXT: s_mov_b32 s2, s6 2449; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2450; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2451; GFX1132-NEXT: s_endpgm 2452entry: 2453 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2454 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel 2455 store i32 %old, i32 addrspace(1)* %out 2456 ret void 2457} 2458 2459define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 2460; GFX7LESS-LABEL: sub_i64_constant: 2461; GFX7LESS: ; %bb.0: ; %entry 2462; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2463; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2464; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2465; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 2466; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2467; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2468; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2469; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2 2470; GFX7LESS-NEXT: ; %bb.1: 2471; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 2472; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2473; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 2474; GFX7LESS-NEXT: s_mov_b32 s10, -1 2475; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2476; GFX7LESS-NEXT: s_mov_b32 s8, s2 2477; GFX7LESS-NEXT: s_mov_b32 s9, s3 2478; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2479; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2480; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2481; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2482; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 2483; GFX7LESS-NEXT: buffer_wbinvl1 2484; 
GFX7LESS-NEXT: .LBB9_2: 2485; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2486; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2487; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2488; GFX7LESS-NEXT: s_mov_b32 s2, -1 2489; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 2490; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 2491; GFX7LESS-NEXT: s_waitcnt expcnt(0) 2492; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2493; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2494; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 2495; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 2496; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2497; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2498; GFX7LESS-NEXT: s_endpgm 2499; 2500; GFX8-LABEL: sub_i64_constant: 2501; GFX8: ; %bb.0: ; %entry 2502; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2503; GFX8-NEXT: s_mov_b64 s[6:7], exec 2504; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2505; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2506; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2507; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 2508; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2509; GFX8-NEXT: s_cbranch_execz .LBB9_2 2510; GFX8-NEXT: ; %bb.1: 2511; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2512; GFX8-NEXT: s_mov_b32 s8, s2 2513; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 2514; GFX8-NEXT: s_mul_i32 s2, s2, 5 2515; GFX8-NEXT: s_mov_b32 s11, 0xf000 2516; GFX8-NEXT: s_mov_b32 s10, -1 2517; GFX8-NEXT: s_mov_b32 s9, s3 2518; GFX8-NEXT: v_mov_b32_e32 v0, s2 2519; GFX8-NEXT: v_mov_b32_e32 v1, 0 2520; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2521; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2522; GFX8-NEXT: s_waitcnt vmcnt(0) 2523; GFX8-NEXT: buffer_wbinvl1_vol 2524; GFX8-NEXT: .LBB9_2: 2525; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2526; GFX8-NEXT: v_readfirstlane_b32 s4, v0 2527; GFX8-NEXT: v_readfirstlane_b32 s5, v1 2528; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2529; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2530; GFX8-NEXT: v_mov_b32_e32 v2, s5 2531; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 
2532; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2533; GFX8-NEXT: s_mov_b32 s3, 0xf000 2534; GFX8-NEXT: s_mov_b32 s2, -1 2535; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2536; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2537; GFX8-NEXT: s_endpgm 2538; 2539; GFX9-LABEL: sub_i64_constant: 2540; GFX9: ; %bb.0: ; %entry 2541; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2542; GFX9-NEXT: s_mov_b64 s[6:7], exec 2543; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2544; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2545; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2546; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2547; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2548; GFX9-NEXT: s_cbranch_execz .LBB9_2 2549; GFX9-NEXT: ; %bb.1: 2550; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2551; GFX9-NEXT: s_mov_b32 s8, s2 2552; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 2553; GFX9-NEXT: s_mul_i32 s2, s2, 5 2554; GFX9-NEXT: s_mov_b32 s11, 0xf000 2555; GFX9-NEXT: s_mov_b32 s10, -1 2556; GFX9-NEXT: s_mov_b32 s9, s3 2557; GFX9-NEXT: v_mov_b32_e32 v0, s2 2558; GFX9-NEXT: v_mov_b32_e32 v1, 0 2559; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2560; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2561; GFX9-NEXT: s_waitcnt vmcnt(0) 2562; GFX9-NEXT: buffer_wbinvl1_vol 2563; GFX9-NEXT: .LBB9_2: 2564; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2565; GFX9-NEXT: v_readfirstlane_b32 s4, v0 2566; GFX9-NEXT: v_readfirstlane_b32 s5, v1 2567; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2568; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2569; GFX9-NEXT: v_mov_b32_e32 v2, s5 2570; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 2571; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2572; GFX9-NEXT: s_mov_b32 s3, 0xf000 2573; GFX9-NEXT: s_mov_b32 s2, -1 2574; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2575; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2576; GFX9-NEXT: s_endpgm 2577; 2578; GFX1064-LABEL: sub_i64_constant: 2579; GFX1064: ; %bb.0: ; %entry 2580; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2581; GFX1064-NEXT: s_mov_b64 s[6:7], 
exec 2582; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2583; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2584; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2585; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2586; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2587; GFX1064-NEXT: s_cbranch_execz .LBB9_2 2588; GFX1064-NEXT: ; %bb.1: 2589; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2590; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2591; GFX1064-NEXT: s_mul_i32 s6, s6, 5 2592; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 2593; GFX1064-NEXT: v_mov_b32_e32 v0, s6 2594; GFX1064-NEXT: s_mov_b32 s10, -1 2595; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2596; GFX1064-NEXT: s_mov_b32 s8, s2 2597; GFX1064-NEXT: s_mov_b32 s9, s3 2598; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2599; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2600; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2601; GFX1064-NEXT: s_waitcnt vmcnt(0) 2602; GFX1064-NEXT: buffer_gl0_inv 2603; GFX1064-NEXT: buffer_gl1_inv 2604; GFX1064-NEXT: .LBB9_2: 2605; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2606; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2607; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2608; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 2609; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2610; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 2611; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2612; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2613; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2614; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2615; GFX1064-NEXT: s_mov_b32 s2, -1 2616; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2617; GFX1064-NEXT: s_endpgm 2618; 2619; GFX1032-LABEL: sub_i64_constant: 2620; GFX1032: ; %bb.0: ; %entry 2621; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2622; GFX1032-NEXT: s_mov_b32 s5, exec_lo 2623; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2624; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 2625; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2626; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2627; 
GFX1032-NEXT: s_cbranch_execz .LBB9_2 2628; GFX1032-NEXT: ; %bb.1: 2629; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2630; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2631; GFX1032-NEXT: s_mul_i32 s5, s5, 5 2632; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 2633; GFX1032-NEXT: v_mov_b32_e32 v0, s5 2634; GFX1032-NEXT: s_mov_b32 s10, -1 2635; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2636; GFX1032-NEXT: s_mov_b32 s8, s2 2637; GFX1032-NEXT: s_mov_b32 s9, s3 2638; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2639; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2640; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2641; GFX1032-NEXT: s_waitcnt vmcnt(0) 2642; GFX1032-NEXT: buffer_gl0_inv 2643; GFX1032-NEXT: buffer_gl1_inv 2644; GFX1032-NEXT: .LBB9_2: 2645; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2646; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2647; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2648; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2649; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2650; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 2651; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2652; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2653; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2654; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2655; GFX1032-NEXT: s_mov_b32 s2, -1 2656; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2657; GFX1032-NEXT: s_endpgm 2658; 2659; GFX1164-LABEL: sub_i64_constant: 2660; GFX1164: ; %bb.0: ; %entry 2661; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 2662; GFX1164-NEXT: s_mov_b64 s[6:7], exec 2663; GFX1164-NEXT: s_mov_b64 s[4:5], exec 2664; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2665; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2666; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2667; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 2668; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 2669; GFX1164-NEXT: s_cbranch_execz .LBB9_2 2670; GFX1164-NEXT: ; %bb.1: 2671; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2672; GFX1164-NEXT: 
v_mov_b32_e32 v1, 0 2673; GFX1164-NEXT: s_mul_i32 s6, s6, 5 2674; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 2675; GFX1164-NEXT: v_mov_b32_e32 v0, s6 2676; GFX1164-NEXT: s_mov_b32 s10, -1 2677; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2678; GFX1164-NEXT: s_mov_b32 s8, s2 2679; GFX1164-NEXT: s_mov_b32 s9, s3 2680; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2681; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2682; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc 2683; GFX1164-NEXT: s_waitcnt vmcnt(0) 2684; GFX1164-NEXT: buffer_gl0_inv 2685; GFX1164-NEXT: buffer_gl1_inv 2686; GFX1164-NEXT: .LBB9_2: 2687; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 2688; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2689; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 2690; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2691; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 2692; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2693; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2694; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2695; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2696; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2697; GFX1164-NEXT: s_mov_b32 s2, -1 2698; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2699; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2700; GFX1164-NEXT: s_endpgm 2701; 2702; GFX1132-LABEL: sub_i64_constant: 2703; GFX1132: ; %bb.0: ; %entry 2704; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 2705; GFX1132-NEXT: s_mov_b32 s5, exec_lo 2706; GFX1132-NEXT: s_mov_b32 s4, exec_lo 2707; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 2708; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 2709; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2710; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 2711; GFX1132-NEXT: s_cbranch_execz .LBB9_2 2712; GFX1132-NEXT: ; %bb.1: 2713; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 2714; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 2715; GFX1132-NEXT: s_mul_i32 s5, s5, 5 2716; GFX1132-NEXT: s_mov_b32 s10, -1 2717; GFX1132-NEXT: v_dual_mov_b32 
v0, s5 :: v_dual_mov_b32 v1, 0 2718; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2719; GFX1132-NEXT: s_mov_b32 s8, s2 2720; GFX1132-NEXT: s_mov_b32 s9, s3 2721; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2722; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2723; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc 2724; GFX1132-NEXT: s_waitcnt vmcnt(0) 2725; GFX1132-NEXT: buffer_gl0_inv 2726; GFX1132-NEXT: buffer_gl1_inv 2727; GFX1132-NEXT: .LBB9_2: 2728; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 2729; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2730; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 2731; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2732; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 2733; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2734; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2735; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2736; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2737; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2738; GFX1132-NEXT: s_mov_b32 s2, -1 2739; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2740; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2741; GFX1132-NEXT: s_endpgm 2742entry: 2743 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel 2744 store i64 %old, i64 addrspace(1)* %out 2745 ret void 2746} 2747 2748define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) { 2749; GFX7LESS-LABEL: sub_i64_uniform: 2750; GFX7LESS: ; %bb.0: ; %entry 2751; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec 2752; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2753; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2754; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 2755; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 2756; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2757; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2758; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2759; GFX7LESS-NEXT: s_cbranch_execz .LBB10_2 2760; GFX7LESS-NEXT: ; %bb.1: 
2761; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 2762; GFX7LESS-NEXT: s_mov_b32 s14, -1 2763; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2764; GFX7LESS-NEXT: s_mov_b32 s12, s6 2765; GFX7LESS-NEXT: s_mov_b32 s13, s7 2766; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 2767; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 2768; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2769; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 2770; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 2771; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 2772; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2773; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2774; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc 2775; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 2776; GFX7LESS-NEXT: buffer_wbinvl1 2777; GFX7LESS-NEXT: .LBB10_2: 2778; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2779; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2780; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2781; GFX7LESS-NEXT: s_mov_b32 s6, -1 2782; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 2783; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 2784; GFX7LESS-NEXT: s_waitcnt expcnt(0) 2785; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 2786; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 2787; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 2788; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 2789; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 2790; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v2 2791; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 2792; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2793; GFX7LESS-NEXT: s_endpgm 2794; 2795; GFX8-LABEL: sub_i64_uniform: 2796; GFX8: ; %bb.0: ; %entry 2797; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2798; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2799; GFX8-NEXT: s_mov_b64 s[8:9], exec 2800; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 2801; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 2802; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2803; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 2804; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2805; GFX8-NEXT: s_cbranch_execz .LBB10_2 2806; GFX8-NEXT: ; %bb.1: 
2807; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2808; GFX8-NEXT: s_mov_b32 s12, s6 2809; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 2810; GFX8-NEXT: v_mov_b32_e32 v0, s6 2811; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0 2812; GFX8-NEXT: s_mul_i32 s6, s1, s6 2813; GFX8-NEXT: s_mov_b32 s15, 0xf000 2814; GFX8-NEXT: s_mov_b32 s14, -1 2815; GFX8-NEXT: s_mov_b32 s13, s7 2816; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 2817; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2818; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc 2819; GFX8-NEXT: s_waitcnt vmcnt(0) 2820; GFX8-NEXT: buffer_wbinvl1_vol 2821; GFX8-NEXT: .LBB10_2: 2822; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2823; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2824; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2 2825; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0 2826; GFX8-NEXT: v_readfirstlane_b32 s0, v0 2827; GFX8-NEXT: v_readfirstlane_b32 s1, v1 2828; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 2829; GFX8-NEXT: v_mov_b32_e32 v3, s1 2830; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 2831; GFX8-NEXT: s_mov_b32 s7, 0xf000 2832; GFX8-NEXT: s_mov_b32 s6, -1 2833; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 2834; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2835; GFX8-NEXT: s_endpgm 2836; 2837; GFX9-LABEL: sub_i64_uniform: 2838; GFX9: ; %bb.0: ; %entry 2839; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2840; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2841; GFX9-NEXT: s_mov_b64 s[8:9], exec 2842; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 2843; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 2844; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2845; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2846; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 2847; GFX9-NEXT: s_cbranch_execz .LBB10_2 2848; GFX9-NEXT: ; %bb.1: 2849; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2850; GFX9-NEXT: s_mov_b32 s12, s6 2851; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 2852; GFX9-NEXT: s_mov_b32 s13, s7 2853; GFX9-NEXT: s_mul_i32 s7, s3, s6 2854; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2855; GFX9-NEXT: 
s_add_i32 s8, s8, s7 2856; GFX9-NEXT: s_mul_i32 s6, s2, s6 2857; GFX9-NEXT: s_mov_b32 s15, 0xf000 2858; GFX9-NEXT: s_mov_b32 s14, -1 2859; GFX9-NEXT: v_mov_b32_e32 v0, s6 2860; GFX9-NEXT: v_mov_b32_e32 v1, s8 2861; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2862; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc 2863; GFX9-NEXT: s_waitcnt vmcnt(0) 2864; GFX9-NEXT: buffer_wbinvl1_vol 2865; GFX9-NEXT: .LBB10_2: 2866; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 2867; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2868; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 2869; GFX9-NEXT: s_mov_b32 s7, 0xf000 2870; GFX9-NEXT: s_mov_b32 s6, -1 2871; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5] 2872; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2873; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2874; GFX9-NEXT: v_mov_b32_e32 v1, v4 2875; GFX9-NEXT: v_mov_b32_e32 v2, s1 2876; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 2877; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2878; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2879; GFX9-NEXT: s_endpgm 2880; 2881; GFX1064-LABEL: sub_i64_uniform: 2882; GFX1064: ; %bb.0: ; %entry 2883; GFX1064-NEXT: s_clause 0x1 2884; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2885; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2886; GFX1064-NEXT: s_mov_b64 s[8:9], exec 2887; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 2888; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 2889; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2890; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2891; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 2892; GFX1064-NEXT: s_cbranch_execz .LBB10_2 2893; GFX1064-NEXT: ; %bb.1: 2894; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] 2895; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 2896; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2897; GFX1064-NEXT: s_mul_i32 s9, s3, s8 2898; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 2899; GFX1064-NEXT: s_mul_i32 s8, s2, s8 2900; GFX1064-NEXT: s_add_i32 s10, s10, s9 2901; GFX1064-NEXT: v_mov_b32_e32 v0, s8 2902; 
GFX1064-NEXT: v_mov_b32_e32 v1, s10 2903; GFX1064-NEXT: s_mov_b32 s10, -1 2904; GFX1064-NEXT: s_mov_b32 s8, s6 2905; GFX1064-NEXT: s_mov_b32 s9, s7 2906; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2907; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2908; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2909; GFX1064-NEXT: s_waitcnt vmcnt(0) 2910; GFX1064-NEXT: buffer_gl0_inv 2911; GFX1064-NEXT: buffer_gl1_inv 2912; GFX1064-NEXT: .LBB10_2: 2913; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2914; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 2915; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2916; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 2917; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 2918; GFX1064-NEXT: s_mov_b32 s6, -1 2919; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5] 2920; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 2921; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 2922; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3 2923; GFX1064-NEXT: v_mov_b32_e32 v1, v4 2924; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc 2925; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2926; GFX1064-NEXT: s_endpgm 2927; 2928; GFX1032-LABEL: sub_i64_uniform: 2929; GFX1032: ; %bb.0: ; %entry 2930; GFX1032-NEXT: s_clause 0x1 2931; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2932; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2933; GFX1032-NEXT: s_mov_b32 s8, exec_lo 2934; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2935; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 2936; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2937; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 2938; GFX1032-NEXT: s_cbranch_execz .LBB10_2 2939; GFX1032-NEXT: ; %bb.1: 2940; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 2941; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 2942; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2943; GFX1032-NEXT: s_mul_i32 s8, s3, s1 2944; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 2945; GFX1032-NEXT: s_mul_i32 s1, s2, s1 2946; GFX1032-NEXT: s_add_i32 s9, s9, s8 2947; GFX1032-NEXT: v_mov_b32_e32 
v0, s1 2948; GFX1032-NEXT: v_mov_b32_e32 v1, s9 2949; GFX1032-NEXT: s_mov_b32 s10, -1 2950; GFX1032-NEXT: s_mov_b32 s8, s6 2951; GFX1032-NEXT: s_mov_b32 s9, s7 2952; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2953; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2954; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2955; GFX1032-NEXT: s_waitcnt vmcnt(0) 2956; GFX1032-NEXT: buffer_gl0_inv 2957; GFX1032-NEXT: buffer_gl1_inv 2958; GFX1032-NEXT: .LBB10_2: 2959; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2960; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 2961; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2962; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s2, v2, 0 2963; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 2964; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 2965; GFX1032-NEXT: s_mov_b32 s6, -1 2966; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s3, v2, v[4:5] 2967; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 2968; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 2969; GFX1032-NEXT: v_mov_b32_e32 v1, v4 2970; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 2971; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2972; GFX1032-NEXT: s_endpgm 2973; 2974; GFX1164-LABEL: sub_i64_uniform: 2975; GFX1164: ; %bb.0: ; %entry 2976; GFX1164-NEXT: s_clause 0x1 2977; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 2978; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 2979; GFX1164-NEXT: s_mov_b64 s[8:9], exec 2980; GFX1164-NEXT: s_mov_b64 s[2:3], exec 2981; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 2982; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2983; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 2984; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 2985; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 2986; GFX1164-NEXT: s_cbranch_execz .LBB10_2 2987; GFX1164-NEXT: ; %bb.1: 2988; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9] 2989; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 2990; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2991; GFX1164-NEXT: s_mul_i32 s9, s1, s8 2992; GFX1164-NEXT: 
s_mul_hi_u32 s10, s0, s8 2993; GFX1164-NEXT: s_mul_i32 s8, s0, s8 2994; GFX1164-NEXT: s_add_i32 s10, s10, s9 2995; GFX1164-NEXT: v_mov_b32_e32 v0, s8 2996; GFX1164-NEXT: v_mov_b32_e32 v1, s10 2997; GFX1164-NEXT: s_mov_b32 s10, -1 2998; GFX1164-NEXT: s_mov_b32 s8, s6 2999; GFX1164-NEXT: s_mov_b32 s9, s7 3000; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3001; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3002; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc 3003; GFX1164-NEXT: s_waitcnt vmcnt(0) 3004; GFX1164-NEXT: buffer_gl0_inv 3005; GFX1164-NEXT: buffer_gl1_inv 3006; GFX1164-NEXT: .LBB10_2: 3007; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 3008; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3009; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0 3010; GFX1164-NEXT: v_readfirstlane_b32 s0, v0 3011; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 3012; GFX1164-NEXT: s_mov_b32 s6, -1 3013; GFX1164-NEXT: s_waitcnt_depctr 0xfff 3014; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5] 3015; GFX1164-NEXT: v_readfirstlane_b32 s1, v1 3016; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v3 3017; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 3018; GFX1164-NEXT: v_mov_b32_e32 v1, v5 3019; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc 3020; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 3021; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3022; GFX1164-NEXT: s_endpgm 3023; 3024; GFX1132-LABEL: sub_i64_uniform: 3025; GFX1132: ; %bb.0: ; %entry 3026; GFX1132-NEXT: s_clause 0x1 3027; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 3028; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 3029; GFX1132-NEXT: s_mov_b32 s3, exec_lo 3030; GFX1132-NEXT: s_mov_b32 s2, exec_lo 3031; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 3032; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 3033; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 3034; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 3035; GFX1132-NEXT: s_cbranch_execz .LBB10_2 3036; GFX1132-NEXT: ; %bb.1: 3037; 
GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 3038; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 3039; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3040; GFX1132-NEXT: s_mul_i32 s8, s1, s3 3041; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3 3042; GFX1132-NEXT: s_mul_i32 s3, s0, s3 3043; GFX1132-NEXT: s_add_i32 s9, s9, s8 3044; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3045; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9 3046; GFX1132-NEXT: s_mov_b32 s10, -1 3047; GFX1132-NEXT: s_mov_b32 s8, s6 3048; GFX1132-NEXT: s_mov_b32 s9, s7 3049; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3050; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3051; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc 3052; GFX1132-NEXT: s_waitcnt vmcnt(0) 3053; GFX1132-NEXT: buffer_gl0_inv 3054; GFX1132-NEXT: buffer_gl1_inv 3055; GFX1132-NEXT: .LBB10_2: 3056; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 3057; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3058; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0 3059; GFX1132-NEXT: v_readfirstlane_b32 s0, v0 3060; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 3061; GFX1132-NEXT: s_mov_b32 s6, -1 3062; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) 3063; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5] 3064; GFX1132-NEXT: v_readfirstlane_b32 s1, v1 3065; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 3066; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 3067; GFX1132-NEXT: v_mov_b32_e32 v1, v5 3068; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 3069; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 3070; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3071; GFX1132-NEXT: s_endpgm 3072entry: 3073 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel 3074 store i64 %old, i64 addrspace(1)* %out 3075 ret void 3076} 3077 3078define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 3079; GFX7LESS-LABEL: sub_i64_varying: 
; About the sub_i64_varying kernel: its IR body calls
; @llvm.amdgcn.workitem.id.x, zero-extends the result to i64, uses it as the
; operand of an acq_rel "atomicrmw sub" on %inout, and stores the returned old
; value to %out.
; In the expected assembly for every RUN configuration the atomic is a single
; 64-bit buffer atomic sub on v[0:1] (v1 is set to 0 for the high half), with
; no exec-mask save/restore or v_readfirstlane sequence around it.
; NOTE(review): presumably this pins that a per-lane (divergent) 64-bit operand
; is left untouched by -amdgpu-atomic-optimizations; confirm against the pass.
; The assertions are autogenerated by utils/update_llc_test_checks.py (see the
; file header); regenerate them with that script instead of editing by hand.
3080; GFX7LESS: ; %bb.0: ; %entry 3081; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 3082; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 3083; GFX7LESS-NEXT: s_mov_b32 s6, -1 3084; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3085; GFX7LESS-NEXT: s_mov_b32 s10, s6 3086; GFX7LESS-NEXT: s_mov_b32 s11, s7 3087; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3088; GFX7LESS-NEXT: s_mov_b32 s8, s2 3089; GFX7LESS-NEXT: s_mov_b32 s9, s3 3090; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3091; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 3092; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 3093; GFX7LESS-NEXT: buffer_wbinvl1 3094; GFX7LESS-NEXT: s_mov_b32 s4, s0 3095; GFX7LESS-NEXT: s_mov_b32 s5, s1 3096; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3097; GFX7LESS-NEXT: s_endpgm 3098; 3099; GFX89-LABEL: sub_i64_varying: 3100; GFX89: ; %bb.0: ; %entry 3101; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3102; GFX89-NEXT: s_mov_b32 s7, 0xf000 3103; GFX89-NEXT: s_mov_b32 s6, -1 3104; GFX89-NEXT: s_mov_b32 s10, s6 3105; GFX89-NEXT: s_mov_b32 s11, s7 3106; GFX89-NEXT: s_waitcnt lgkmcnt(0) 3107; GFX89-NEXT: s_mov_b32 s8, s2 3108; GFX89-NEXT: s_mov_b32 s9, s3 3109; GFX89-NEXT: v_mov_b32_e32 v1, 0 3110; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3111; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 3112; GFX89-NEXT: s_waitcnt vmcnt(0) 3113; GFX89-NEXT: buffer_wbinvl1_vol 3114; GFX89-NEXT: s_mov_b32 s4, s0 3115; GFX89-NEXT: s_mov_b32 s5, s1 3116; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3117; GFX89-NEXT: s_endpgm 3118; 3119; GFX10-LABEL: sub_i64_varying: 3120; GFX10: ; %bb.0: ; %entry 3121; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3122; GFX10-NEXT: v_mov_b32_e32 v1, 0 3123; GFX10-NEXT: s_mov_b32 s7, 0x31016000 3124; GFX10-NEXT: s_mov_b32 s6, -1 3125; GFX10-NEXT: s_mov_b32 s11, s7 3126; GFX10-NEXT: s_mov_b32 s10, s6 3127; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3128; GFX10-NEXT: s_mov_b32 s8, s2 3129; GFX10-NEXT: s_mov_b32 s9, s3 3130; GFX10-NEXT: s_mov_b32 s4, 
s0 3131; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3132; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3133; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 3134; GFX10-NEXT: s_waitcnt vmcnt(0) 3135; GFX10-NEXT: buffer_gl0_inv 3136; GFX10-NEXT: buffer_gl1_inv 3137; GFX10-NEXT: s_mov_b32 s5, s1 3138; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3139; GFX10-NEXT: s_endpgm 3140; 3141; GFX11-LABEL: sub_i64_varying: 3142; GFX11: ; %bb.0: ; %entry 3143; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 3144; GFX11-NEXT: v_mov_b32_e32 v1, 0 3145; GFX11-NEXT: s_mov_b32 s7, 0x31016000 3146; GFX11-NEXT: s_mov_b32 s6, -1 3147; GFX11-NEXT: s_mov_b32 s11, s7 3148; GFX11-NEXT: s_mov_b32 s10, s6 3149; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3150; GFX11-NEXT: s_mov_b32 s8, s2 3151; GFX11-NEXT: s_mov_b32 s9, s3 3152; GFX11-NEXT: s_mov_b32 s4, s0 3153; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3154; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3155; GFX11-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc 3156; GFX11-NEXT: s_waitcnt vmcnt(0) 3157; GFX11-NEXT: buffer_gl0_inv 3158; GFX11-NEXT: buffer_gl1_inv 3159; GFX11-NEXT: s_mov_b32 s5, s1 3160; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 3161; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3162; GFX11-NEXT: s_endpgm 3163entry: 3164 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3165 %zext = zext i32 %lane to i64 3166 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel 3167 store i64 %old, i64 addrspace(1)* %out 3168 ret void 3169} 3170