1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s 3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9 10; Show what the atomic optimization pass will do for global pointers. 
11 12define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 13; GFX7LESS-LABEL: add_i32_constant: 14; GFX7LESS: ; %bb.0: ; %entry 15; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 16; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 17; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 18; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 19; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 20; GFX7LESS-NEXT: ; implicit-def: $vgpr1 21; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 22; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 23; GFX7LESS-NEXT: ; %bb.1: 24; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 25; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 26; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 27; GFX7LESS-NEXT: s_mov_b32 s10, -1 28; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 29; GFX7LESS-NEXT: s_mov_b32 s8, s2 30; GFX7LESS-NEXT: s_mov_b32 s9, s3 31; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 32; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 33; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 34; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 35; GFX7LESS-NEXT: buffer_wbinvl1 36; GFX7LESS-NEXT: .LBB0_2: 37; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 38; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 39; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 40; GFX7LESS-NEXT: s_mov_b32 s2, -1 41; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 42; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 43; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 44; GFX7LESS-NEXT: s_endpgm 45; 46; GFX89-LABEL: add_i32_constant: 47; GFX89: ; %bb.0: ; %entry 48; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 49; GFX89-NEXT: s_mov_b64 s[6:7], exec 50; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 51; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 52; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 53; GFX89-NEXT: ; implicit-def: $vgpr1 54; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc 55; GFX89-NEXT: s_cbranch_execz .LBB0_2 56; GFX89-NEXT: ; %bb.1: 57; GFX89-NEXT: s_waitcnt lgkmcnt(0) 58; GFX89-NEXT: s_mov_b32 s8, s2 59; GFX89-NEXT: s_bcnt1_i32_b64 s2, 
s[6:7] 60; GFX89-NEXT: s_mul_i32 s2, s2, 5 61; GFX89-NEXT: s_mov_b32 s11, 0xf000 62; GFX89-NEXT: s_mov_b32 s10, -1 63; GFX89-NEXT: s_mov_b32 s9, s3 64; GFX89-NEXT: v_mov_b32_e32 v1, s2 65; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 66; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 67; GFX89-NEXT: s_waitcnt vmcnt(0) 68; GFX89-NEXT: buffer_wbinvl1_vol 69; GFX89-NEXT: .LBB0_2: 70; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] 71; GFX89-NEXT: v_readfirstlane_b32 s4, v1 72; GFX89-NEXT: s_waitcnt lgkmcnt(0) 73; GFX89-NEXT: s_mov_b32 s3, 0xf000 74; GFX89-NEXT: s_mov_b32 s2, -1 75; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 76; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 77; GFX89-NEXT: s_endpgm 78; 79; GFX1064-LABEL: add_i32_constant: 80; GFX1064: ; %bb.0: ; %entry 81; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 82; GFX1064-NEXT: s_mov_b64 s[6:7], exec 83; GFX1064-NEXT: ; implicit-def: $vgpr1 84; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 85; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 86; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 87; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 88; GFX1064-NEXT: s_cbranch_execz .LBB0_2 89; GFX1064-NEXT: ; %bb.1: 90; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 91; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 92; GFX1064-NEXT: s_mul_i32 s6, s6, 5 93; GFX1064-NEXT: s_mov_b32 s10, -1 94; GFX1064-NEXT: v_mov_b32_e32 v1, s6 95; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 96; GFX1064-NEXT: s_mov_b32 s8, s2 97; GFX1064-NEXT: s_mov_b32 s9, s3 98; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 99; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 100; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 101; GFX1064-NEXT: s_waitcnt vmcnt(0) 102; GFX1064-NEXT: buffer_gl0_inv 103; GFX1064-NEXT: buffer_gl1_inv 104; GFX1064-NEXT: .LBB0_2: 105; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 106; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 107; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 108; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 109; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 110; 
GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 111; GFX1064-NEXT: s_mov_b32 s2, -1 112; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 113; GFX1064-NEXT: s_endpgm 114; 115; GFX1032-LABEL: add_i32_constant: 116; GFX1032: ; %bb.0: ; %entry 117; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 118; GFX1032-NEXT: s_mov_b32 s5, exec_lo 119; GFX1032-NEXT: ; implicit-def: $vgpr1 120; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 121; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 122; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 123; GFX1032-NEXT: s_cbranch_execz .LBB0_2 124; GFX1032-NEXT: ; %bb.1: 125; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 126; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 127; GFX1032-NEXT: s_mul_i32 s5, s5, 5 128; GFX1032-NEXT: s_mov_b32 s10, -1 129; GFX1032-NEXT: v_mov_b32_e32 v1, s5 130; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 131; GFX1032-NEXT: s_mov_b32 s8, s2 132; GFX1032-NEXT: s_mov_b32 s9, s3 133; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 134; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 135; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 136; GFX1032-NEXT: s_waitcnt vmcnt(0) 137; GFX1032-NEXT: buffer_gl0_inv 138; GFX1032-NEXT: buffer_gl1_inv 139; GFX1032-NEXT: .LBB0_2: 140; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 141; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 142; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 143; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 144; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 145; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 146; GFX1032-NEXT: s_mov_b32 s2, -1 147; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 148; GFX1032-NEXT: s_endpgm 149entry: 150 %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel 151 store i32 %old, i32 addrspace(1)* %out 152 ret void 153} 154 155define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) { 156; GFX7LESS-LABEL: add_i32_uniform: 157; GFX7LESS: ; %bb.0: ; %entry 158; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 159; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x9 160; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd 161; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 162; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 163; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 164; GFX7LESS-NEXT: ; implicit-def: $vgpr1 165; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 166; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 167; GFX7LESS-NEXT: ; %bb.1: 168; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 169; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 170; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 171; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 172; GFX7LESS-NEXT: s_mov_b32 s14, -1 173; GFX7LESS-NEXT: s_mov_b32 s12, s6 174; GFX7LESS-NEXT: s_mov_b32 s13, s7 175; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 176; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 177; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 178; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 179; GFX7LESS-NEXT: buffer_wbinvl1 180; GFX7LESS-NEXT: .LBB1_2: 181; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 182; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 183; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 184; GFX7LESS-NEXT: s_mov_b32 s6, -1 185; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 186; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 187; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 188; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 189; GFX7LESS-NEXT: s_endpgm 190; 191; GFX8-LABEL: add_i32_uniform: 192; GFX8: ; %bb.0: ; %entry 193; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 194; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 195; GFX8-NEXT: s_mov_b64 s[2:3], exec 196; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 197; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 198; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 199; GFX8-NEXT: ; implicit-def: $vgpr1 200; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 201; GFX8-NEXT: s_cbranch_execz .LBB1_2 202; GFX8-NEXT: ; %bb.1: 203; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 204; GFX8-NEXT: s_waitcnt lgkmcnt(0) 205; GFX8-NEXT: s_mul_i32 s2, s8, s2 206; GFX8-NEXT: s_mov_b32 s15, 0xf000 207; GFX8-NEXT: s_mov_b32 s14, -1 
208; GFX8-NEXT: s_mov_b32 s12, s6 209; GFX8-NEXT: s_mov_b32 s13, s7 210; GFX8-NEXT: v_mov_b32_e32 v1, s2 211; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 212; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 213; GFX8-NEXT: s_waitcnt vmcnt(0) 214; GFX8-NEXT: buffer_wbinvl1_vol 215; GFX8-NEXT: .LBB1_2: 216; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 217; GFX8-NEXT: s_waitcnt lgkmcnt(0) 218; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 219; GFX8-NEXT: v_readfirstlane_b32 s0, v1 220; GFX8-NEXT: s_mov_b32 s7, 0xf000 221; GFX8-NEXT: s_mov_b32 s6, -1 222; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 223; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 224; GFX8-NEXT: s_endpgm 225; 226; GFX9-LABEL: add_i32_uniform: 227; GFX9: ; %bb.0: ; %entry 228; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 229; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 230; GFX9-NEXT: s_mov_b64 s[2:3], exec 231; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 232; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 233; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 234; GFX9-NEXT: ; implicit-def: $vgpr1 235; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 236; GFX9-NEXT: s_cbranch_execz .LBB1_2 237; GFX9-NEXT: ; %bb.1: 238; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 239; GFX9-NEXT: s_waitcnt lgkmcnt(0) 240; GFX9-NEXT: s_mul_i32 s2, s8, s2 241; GFX9-NEXT: s_mov_b32 s15, 0xf000 242; GFX9-NEXT: s_mov_b32 s14, -1 243; GFX9-NEXT: s_mov_b32 s12, s6 244; GFX9-NEXT: s_mov_b32 s13, s7 245; GFX9-NEXT: v_mov_b32_e32 v1, s2 246; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 247; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 248; GFX9-NEXT: s_waitcnt vmcnt(0) 249; GFX9-NEXT: buffer_wbinvl1_vol 250; GFX9-NEXT: .LBB1_2: 251; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 252; GFX9-NEXT: s_waitcnt lgkmcnt(0) 253; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 254; GFX9-NEXT: v_readfirstlane_b32 s0, v1 255; GFX9-NEXT: s_mov_b32 s7, 0xf000 256; GFX9-NEXT: s_mov_b32 s6, -1 257; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 258; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 259; GFX9-NEXT: 
s_endpgm 260; 261; GFX1064-LABEL: add_i32_uniform: 262; GFX1064: ; %bb.0: ; %entry 263; GFX1064-NEXT: s_clause 0x1 264; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 265; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 266; GFX1064-NEXT: s_mov_b64 s[2:3], exec 267; GFX1064-NEXT: ; implicit-def: $vgpr1 268; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 269; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 270; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 271; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 272; GFX1064-NEXT: s_cbranch_execz .LBB1_2 273; GFX1064-NEXT: ; %bb.1: 274; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 275; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 276; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 277; GFX1064-NEXT: s_mul_i32 s2, s8, s2 278; GFX1064-NEXT: s_mov_b32 s14, -1 279; GFX1064-NEXT: v_mov_b32_e32 v1, s2 280; GFX1064-NEXT: s_mov_b32 s12, s6 281; GFX1064-NEXT: s_mov_b32 s13, s7 282; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 283; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 284; GFX1064-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 285; GFX1064-NEXT: s_waitcnt vmcnt(0) 286; GFX1064-NEXT: buffer_gl0_inv 287; GFX1064-NEXT: buffer_gl1_inv 288; GFX1064-NEXT: .LBB1_2: 289; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 290; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 291; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 292; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 293; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 294; GFX1064-NEXT: s_mov_b32 s6, -1 295; GFX1064-NEXT: v_mad_u64_u32 v[0:1], null, s8, v0, s[0:1] 296; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 297; GFX1064-NEXT: s_endpgm 298; 299; GFX1032-LABEL: add_i32_uniform: 300; GFX1032: ; %bb.0: ; %entry 301; GFX1032-NEXT: s_clause 0x1 302; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 303; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 304; GFX1032-NEXT: s_mov_b32 s3, exec_lo 305; GFX1032-NEXT: ; implicit-def: $vgpr1 306; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 307; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 308; GFX1032-NEXT: 
s_and_saveexec_b32 s0, vcc_lo 309; GFX1032-NEXT: s_cbranch_execz .LBB1_2 310; GFX1032-NEXT: ; %bb.1: 311; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 312; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 313; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 314; GFX1032-NEXT: s_mul_i32 s1, s2, s1 315; GFX1032-NEXT: s_mov_b32 s10, -1 316; GFX1032-NEXT: v_mov_b32_e32 v1, s1 317; GFX1032-NEXT: s_mov_b32 s8, s6 318; GFX1032-NEXT: s_mov_b32 s9, s7 319; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 320; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 321; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 322; GFX1032-NEXT: s_waitcnt vmcnt(0) 323; GFX1032-NEXT: buffer_gl0_inv 324; GFX1032-NEXT: buffer_gl1_inv 325; GFX1032-NEXT: .LBB1_2: 326; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 327; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 328; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 329; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 330; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 331; GFX1032-NEXT: s_mov_b32 s6, -1 332; GFX1032-NEXT: v_mad_u64_u32 v[0:1], null, s2, v0, s[0:1] 333; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 334; GFX1032-NEXT: s_endpgm 335entry: 336 %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel 337 store i32 %old, i32 addrspace(1)* %out 338 ret void 339} 340 341define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 342; GFX7LESS-LABEL: add_i32_varying: 343; GFX7LESS: ; %bb.0: ; %entry 344; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 345; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 346; GFX7LESS-NEXT: s_mov_b32 s6, -1 347; GFX7LESS-NEXT: s_mov_b32 s10, s6 348; GFX7LESS-NEXT: s_mov_b32 s11, s7 349; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 350; GFX7LESS-NEXT: s_mov_b32 s8, s2 351; GFX7LESS-NEXT: s_mov_b32 s9, s3 352; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 353; GFX7LESS-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 354; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 355; GFX7LESS-NEXT: buffer_wbinvl1 356; GFX7LESS-NEXT: s_mov_b32 s4, s0 357; GFX7LESS-NEXT: 
s_mov_b32 s5, s1 358; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 359; GFX7LESS-NEXT: s_endpgm 360; 361; GFX8-LABEL: add_i32_varying: 362; GFX8: ; %bb.0: ; %entry 363; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 364; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 365; GFX8-NEXT: v_mov_b32_e32 v1, 0 366; GFX8-NEXT: s_mov_b64 exec, s[4:5] 367; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 368; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 369; GFX8-NEXT: v_mov_b32_e32 v2, v0 370; GFX8-NEXT: s_not_b64 exec, exec 371; GFX8-NEXT: v_mov_b32_e32 v2, 0 372; GFX8-NEXT: s_not_b64 exec, exec 373; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 374; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 375; GFX8-NEXT: s_nop 1 376; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 377; GFX8-NEXT: s_nop 1 378; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 379; GFX8-NEXT: s_nop 1 380; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 381; GFX8-NEXT: s_nop 1 382; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 383; GFX8-NEXT: s_nop 1 384; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 385; GFX8-NEXT: v_readlane_b32 s6, v2, 63 386; GFX8-NEXT: s_nop 0 387; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 388; GFX8-NEXT: s_mov_b64 exec, s[4:5] 389; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 390; GFX8-NEXT: ; implicit-def: $vgpr0 391; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 392; GFX8-NEXT: s_cbranch_execz .LBB2_2 393; GFX8-NEXT: ; %bb.1: 394; GFX8-NEXT: s_mov_b32 s11, 0xf000 395; GFX8-NEXT: s_mov_b32 s10, -1 396; GFX8-NEXT: s_waitcnt lgkmcnt(0) 397; GFX8-NEXT: s_mov_b32 s8, s2 398; GFX8-NEXT: s_mov_b32 s9, s3 399; GFX8-NEXT: v_mov_b32_e32 v0, s6 400; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 401; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 
glc 402; GFX8-NEXT: s_waitcnt vmcnt(0) 403; GFX8-NEXT: buffer_wbinvl1_vol 404; GFX8-NEXT: .LBB2_2: 405; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 406; GFX8-NEXT: v_readfirstlane_b32 s4, v0 407; GFX8-NEXT: v_mov_b32_e32 v0, v1 408; GFX8-NEXT: s_waitcnt lgkmcnt(0) 409; GFX8-NEXT: s_mov_b32 s3, 0xf000 410; GFX8-NEXT: s_mov_b32 s2, -1 411; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 412; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 413; GFX8-NEXT: s_endpgm 414; 415; GFX9-LABEL: add_i32_varying: 416; GFX9: ; %bb.0: ; %entry 417; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 418; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 419; GFX9-NEXT: v_mov_b32_e32 v1, 0 420; GFX9-NEXT: s_mov_b64 exec, s[4:5] 421; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 422; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 423; GFX9-NEXT: v_mov_b32_e32 v2, v0 424; GFX9-NEXT: s_not_b64 exec, exec 425; GFX9-NEXT: v_mov_b32_e32 v2, 0 426; GFX9-NEXT: s_not_b64 exec, exec 427; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 428; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 429; GFX9-NEXT: s_nop 1 430; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 431; GFX9-NEXT: s_nop 1 432; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 433; GFX9-NEXT: s_nop 1 434; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 435; GFX9-NEXT: s_nop 1 436; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 437; GFX9-NEXT: s_nop 1 438; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 439; GFX9-NEXT: v_readlane_b32 s6, v2, 63 440; GFX9-NEXT: s_nop 0 441; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 442; GFX9-NEXT: s_mov_b64 exec, s[4:5] 443; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 444; GFX9-NEXT: ; implicit-def: $vgpr0 445; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 446; GFX9-NEXT: s_cbranch_execz .LBB2_2 447; GFX9-NEXT: 
; %bb.1: 448; GFX9-NEXT: s_mov_b32 s11, 0xf000 449; GFX9-NEXT: s_mov_b32 s10, -1 450; GFX9-NEXT: s_waitcnt lgkmcnt(0) 451; GFX9-NEXT: s_mov_b32 s8, s2 452; GFX9-NEXT: s_mov_b32 s9, s3 453; GFX9-NEXT: v_mov_b32_e32 v0, s6 454; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 455; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 456; GFX9-NEXT: s_waitcnt vmcnt(0) 457; GFX9-NEXT: buffer_wbinvl1_vol 458; GFX9-NEXT: .LBB2_2: 459; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 460; GFX9-NEXT: v_readfirstlane_b32 s4, v0 461; GFX9-NEXT: v_mov_b32_e32 v0, v1 462; GFX9-NEXT: s_waitcnt lgkmcnt(0) 463; GFX9-NEXT: s_mov_b32 s3, 0xf000 464; GFX9-NEXT: s_mov_b32 s2, -1 465; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 466; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 467; GFX9-NEXT: s_endpgm 468; 469; GFX1064-LABEL: add_i32_varying: 470; GFX1064: ; %bb.0: ; %entry 471; GFX1064-NEXT: v_mov_b32_e32 v1, v0 472; GFX1064-NEXT: s_not_b64 exec, exec 473; GFX1064-NEXT: v_mov_b32_e32 v1, 0 474; GFX1064-NEXT: s_not_b64 exec, exec 475; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 476; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 477; GFX1064-NEXT: v_mov_b32_e32 v3, 0 478; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 479; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 480; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 481; GFX1064-NEXT: v_mov_b32_e32 v2, v1 482; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 483; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 484; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 485; GFX1064-NEXT: v_mov_b32_e32 v2, s4 486; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 487; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 488; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 489; GFX1064-NEXT: s_mov_b64 
exec, s[2:3] 490; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 491; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 492; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 493; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 494; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 495; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 496; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 497; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 498; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 499; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 500; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 501; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 502; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 503; GFX1064-NEXT: s_mov_b32 s4, s9 504; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 505; GFX1064-NEXT: s_mov_b64 exec, s[6:7] 506; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 507; GFX1064-NEXT: s_mov_b32 s6, -1 508; GFX1064-NEXT: ; implicit-def: $vgpr0 509; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc 510; GFX1064-NEXT: s_cbranch_execz .LBB2_2 511; GFX1064-NEXT: ; %bb.1: 512; GFX1064-NEXT: v_mov_b32_e32 v0, s4 513; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 514; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 515; GFX1064-NEXT: s_mov_b32 s4, s2 516; GFX1064-NEXT: s_mov_b32 s5, s3 517; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 518; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 519; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 520; GFX1064-NEXT: s_waitcnt vmcnt(0) 521; GFX1064-NEXT: buffer_gl0_inv 522; GFX1064-NEXT: buffer_gl1_inv 523; GFX1064-NEXT: .LBB2_2: 524; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 525; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] 526; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 527; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 528; GFX1064-NEXT: v_mov_b32_e32 v0, v3 529; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 530; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v0 531; GFX1064-NEXT: s_mov_b32 s2, s6 532; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 533; GFX1064-NEXT: s_endpgm 534; 535; GFX1032-LABEL: add_i32_varying: 536; GFX1032: ; %bb.0: ; %entry 537; 
GFX1032-NEXT: v_mov_b32_e32 v1, v0 538; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 539; GFX1032-NEXT: v_mov_b32_e32 v1, 0 540; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 541; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 542; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 543; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 544; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 545; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 546; GFX1032-NEXT: v_mov_b32_e32 v2, v1 547; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 548; GFX1032-NEXT: s_mov_b32 exec_lo, s2 549; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 550; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 551; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 552; GFX1032-NEXT: v_mov_b32_e32 v3, 0 553; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 554; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 555; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 556; GFX1032-NEXT: s_mov_b32 exec_lo, s4 557; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 558; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 559; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 560; GFX1032-NEXT: s_mov_b32 exec_lo, s4 561; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 562; GFX1032-NEXT: s_mov_b32 s4, s6 563; GFX1032-NEXT: s_mov_b32 s6, -1 564; GFX1032-NEXT: ; implicit-def: $vgpr0 565; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo 566; GFX1032-NEXT: s_cbranch_execz .LBB2_2 567; GFX1032-NEXT: ; %bb.1: 568; GFX1032-NEXT: v_mov_b32_e32 v0, s4 569; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 570; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 571; GFX1032-NEXT: s_mov_b32 s4, s2 572; GFX1032-NEXT: s_mov_b32 s5, s3 573; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 574; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 575; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 576; 
GFX1032-NEXT: s_waitcnt vmcnt(0) 577; GFX1032-NEXT: buffer_gl0_inv 578; GFX1032-NEXT: buffer_gl1_inv 579; GFX1032-NEXT: .LBB2_2: 580; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 581; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 582; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 583; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 584; GFX1032-NEXT: v_mov_b32_e32 v0, v3 585; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 586; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v0 587; GFX1032-NEXT: s_mov_b32 s2, s6 588; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 589; GFX1032-NEXT: s_endpgm 590entry: 591 %lane = call i32 @llvm.amdgcn.workitem.id.x() 592 %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel 593 store i32 %old, i32 addrspace(1)* %out 594 ret void 595} 596 597define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 598; GFX7LESS-LABEL: add_i64_constant: 599; GFX7LESS: ; %bb.0: ; %entry 600; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 601; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 602; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 603; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 604; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 605; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 606; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 607; GFX7LESS-NEXT: s_cbranch_execz .LBB3_2 608; GFX7LESS-NEXT: ; %bb.1: 609; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 610; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 611; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 612; GFX7LESS-NEXT: s_mov_b32 s10, -1 613; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 614; GFX7LESS-NEXT: s_mov_b32 s8, s2 615; GFX7LESS-NEXT: s_mov_b32 s9, s3 616; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 617; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 618; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 619; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 620; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 621; GFX7LESS-NEXT: buffer_wbinvl1 622; GFX7LESS-NEXT: .LBB3_2: 623; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 624; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 625; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 626; GFX7LESS-NEXT: s_mov_b32 s2, -1 627; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 628; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 629; GFX7LESS-NEXT: s_waitcnt expcnt(0) 630; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 631; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 632; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 633; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 634; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 635; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 636; GFX7LESS-NEXT: s_endpgm 637; 638; GFX89-LABEL: add_i64_constant: 639; GFX89: ; %bb.0: ; %entry 640; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 641; GFX89-NEXT: s_mov_b64 s[6:7], exec 642; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 643; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 644; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 645; GFX89-NEXT: ; implicit-def: $vgpr0_vgpr1 646; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc 647; GFX89-NEXT: s_cbranch_execz .LBB3_2 648; GFX89-NEXT: ; %bb.1: 649; GFX89-NEXT: s_waitcnt lgkmcnt(0) 650; GFX89-NEXT: s_mov_b32 s8, s2 651; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 652; GFX89-NEXT: s_mul_i32 s2, s2, 5 653; GFX89-NEXT: s_mov_b32 s11, 0xf000 654; GFX89-NEXT: s_mov_b32 s10, -1 655; GFX89-NEXT: s_mov_b32 s9, s3 656; GFX89-NEXT: v_mov_b32_e32 v0, s2 657; GFX89-NEXT: v_mov_b32_e32 v1, 0 658; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 659; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 660; GFX89-NEXT: s_waitcnt vmcnt(0) 661; GFX89-NEXT: buffer_wbinvl1_vol 662; GFX89-NEXT: .LBB3_2: 663; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] 664; GFX89-NEXT: s_waitcnt lgkmcnt(0) 665; GFX89-NEXT: v_readfirstlane_b32 s2, v0 666; GFX89-NEXT: v_readfirstlane_b32 s3, v1 667; GFX89-NEXT: v_mov_b32_e32 v0, s2 668; GFX89-NEXT: v_mov_b32_e32 v1, s3 669; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 670; GFX89-NEXT: s_mov_b32 s3, 0xf000 671; GFX89-NEXT: s_mov_b32 s2, -1 672; GFX89-NEXT: s_nop 
2 673; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 674; GFX89-NEXT: s_endpgm 675; 676; GFX1064-LABEL: add_i64_constant: 677; GFX1064: ; %bb.0: ; %entry 678; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 679; GFX1064-NEXT: s_mov_b64 s[6:7], exec 680; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 681; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 682; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 683; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 684; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 685; GFX1064-NEXT: s_cbranch_execz .LBB3_2 686; GFX1064-NEXT: ; %bb.1: 687; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 688; GFX1064-NEXT: v_mov_b32_e32 v1, 0 689; GFX1064-NEXT: s_mul_i32 s6, s6, 5 690; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 691; GFX1064-NEXT: v_mov_b32_e32 v0, s6 692; GFX1064-NEXT: s_mov_b32 s10, -1 693; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 694; GFX1064-NEXT: s_mov_b32 s8, s2 695; GFX1064-NEXT: s_mov_b32 s9, s3 696; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 697; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 698; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 699; GFX1064-NEXT: s_waitcnt vmcnt(0) 700; GFX1064-NEXT: buffer_gl0_inv 701; GFX1064-NEXT: buffer_gl1_inv 702; GFX1064-NEXT: .LBB3_2: 703; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 704; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 705; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 706; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 707; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 708; GFX1064-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 709; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 710; GFX1064-NEXT: s_mov_b32 s2, -1 711; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 712; GFX1064-NEXT: s_endpgm 713; 714; GFX1032-LABEL: add_i64_constant: 715; GFX1032: ; %bb.0: ; %entry 716; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 717; GFX1032-NEXT: s_mov_b32 s5, exec_lo 718; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 719; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 720; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 
721; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 722; GFX1032-NEXT: s_cbranch_execz .LBB3_2 723; GFX1032-NEXT: ; %bb.1: 724; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 725; GFX1032-NEXT: v_mov_b32_e32 v1, 0 726; GFX1032-NEXT: s_mul_i32 s5, s5, 5 727; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 728; GFX1032-NEXT: v_mov_b32_e32 v0, s5 729; GFX1032-NEXT: s_mov_b32 s10, -1 730; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 731; GFX1032-NEXT: s_mov_b32 s8, s2 732; GFX1032-NEXT: s_mov_b32 s9, s3 733; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 734; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 735; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 736; GFX1032-NEXT: s_waitcnt vmcnt(0) 737; GFX1032-NEXT: buffer_gl0_inv 738; GFX1032-NEXT: buffer_gl1_inv 739; GFX1032-NEXT: .LBB3_2: 740; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 741; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 742; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 743; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 744; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 745; GFX1032-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 746; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 747; GFX1032-NEXT: s_mov_b32 s2, -1 748; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 749; GFX1032-NEXT: s_endpgm 750entry: 751 %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel 752 store i64 %old, i64 addrspace(1)* %out 753 ret void 754} 755 756define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) { 757; GFX7LESS-LABEL: add_i64_uniform: 758; GFX7LESS: ; %bb.0: ; %entry 759; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec 760; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 761; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 762; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 763; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 764; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 765; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 766; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 767; GFX7LESS-NEXT: s_cbranch_execz 
.LBB4_2 768; GFX7LESS-NEXT: ; %bb.1: 769; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 770; GFX7LESS-NEXT: s_mov_b32 s14, -1 771; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 772; GFX7LESS-NEXT: s_mov_b32 s12, s6 773; GFX7LESS-NEXT: s_mov_b32 s13, s7 774; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 775; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 776; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 777; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 778; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 779; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 780; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 781; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 782; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 783; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 784; GFX7LESS-NEXT: buffer_wbinvl1 785; GFX7LESS-NEXT: .LBB4_2: 786; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 787; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 788; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 789; GFX7LESS-NEXT: s_mov_b32 s6, -1 790; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 791; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 792; GFX7LESS-NEXT: s_waitcnt expcnt(0) 793; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 794; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 795; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 796; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 797; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 798; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v2 799; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 800; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 801; GFX7LESS-NEXT: s_endpgm 802; 803; GFX8-LABEL: add_i64_uniform: 804; GFX8: ; %bb.0: ; %entry 805; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 806; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 807; GFX8-NEXT: s_mov_b64 s[8:9], exec 808; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 809; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 810; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 811; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 812; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 813; GFX8-NEXT: s_cbranch_execz .LBB4_2 814; GFX8-NEXT: ; %bb.1: 815; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) 816; GFX8-NEXT: s_mov_b32 s12, s6 817; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 818; GFX8-NEXT: v_mov_b32_e32 v0, s6 819; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0 820; GFX8-NEXT: s_mul_i32 s6, s1, s6 821; GFX8-NEXT: s_mov_b32 s15, 0xf000 822; GFX8-NEXT: s_mov_b32 s14, -1 823; GFX8-NEXT: s_mov_b32 s13, s7 824; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 825; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 826; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 827; GFX8-NEXT: s_waitcnt vmcnt(0) 828; GFX8-NEXT: buffer_wbinvl1_vol 829; GFX8-NEXT: .LBB4_2: 830; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 831; GFX8-NEXT: v_readfirstlane_b32 s2, v0 832; GFX8-NEXT: v_readfirstlane_b32 s3, v1 833; GFX8-NEXT: v_mov_b32_e32 v0, s2 834; GFX8-NEXT: v_mov_b32_e32 v1, s3 835; GFX8-NEXT: s_waitcnt lgkmcnt(0) 836; GFX8-NEXT: v_mul_lo_u32 v3, s1, v2 837; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1] 838; GFX8-NEXT: s_mov_b32 s7, 0xf000 839; GFX8-NEXT: s_mov_b32 s6, -1 840; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 841; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 842; GFX8-NEXT: s_endpgm 843; 844; GFX9-LABEL: add_i64_uniform: 845; GFX9: ; %bb.0: ; %entry 846; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 847; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 848; GFX9-NEXT: s_mov_b64 s[8:9], exec 849; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 850; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 851; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 852; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 853; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 854; GFX9-NEXT: s_cbranch_execz .LBB4_2 855; GFX9-NEXT: ; %bb.1: 856; GFX9-NEXT: s_waitcnt lgkmcnt(0) 857; GFX9-NEXT: s_mov_b32 s12, s6 858; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 859; GFX9-NEXT: s_mov_b32 s13, s7 860; GFX9-NEXT: s_mul_i32 s7, s3, s6 861; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 862; GFX9-NEXT: s_add_i32 s8, s8, s7 863; GFX9-NEXT: s_mul_i32 s6, s2, s6 864; GFX9-NEXT: s_mov_b32 s15, 0xf000 865; GFX9-NEXT: 
s_mov_b32 s14, -1 866; GFX9-NEXT: v_mov_b32_e32 v0, s6 867; GFX9-NEXT: v_mov_b32_e32 v1, s8 868; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 869; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 870; GFX9-NEXT: s_waitcnt vmcnt(0) 871; GFX9-NEXT: buffer_wbinvl1_vol 872; GFX9-NEXT: .LBB4_2: 873; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 874; GFX9-NEXT: v_readfirstlane_b32 s0, v0 875; GFX9-NEXT: v_readfirstlane_b32 s1, v1 876; GFX9-NEXT: v_mov_b32_e32 v0, s0 877; GFX9-NEXT: v_mov_b32_e32 v1, s1 878; GFX9-NEXT: s_waitcnt lgkmcnt(0) 879; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1] 880; GFX9-NEXT: s_mov_b32 s7, 0xf000 881; GFX9-NEXT: s_mov_b32 s6, -1 882; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2] 883; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 884; GFX9-NEXT: s_endpgm 885; 886; GFX1064-LABEL: add_i64_uniform: 887; GFX1064: ; %bb.0: ; %entry 888; GFX1064-NEXT: s_clause 0x1 889; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 890; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 891; GFX1064-NEXT: s_mov_b64 s[8:9], exec 892; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 893; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 894; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 895; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 896; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 897; GFX1064-NEXT: s_cbranch_execz .LBB4_2 898; GFX1064-NEXT: ; %bb.1: 899; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] 900; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 901; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 902; GFX1064-NEXT: s_mul_i32 s9, s3, s8 903; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 904; GFX1064-NEXT: s_mul_i32 s8, s2, s8 905; GFX1064-NEXT: s_add_i32 s10, s10, s9 906; GFX1064-NEXT: v_mov_b32_e32 v0, s8 907; GFX1064-NEXT: v_mov_b32_e32 v1, s10 908; GFX1064-NEXT: s_mov_b32 s10, -1 909; GFX1064-NEXT: s_mov_b32 s8, s6 910; GFX1064-NEXT: s_mov_b32 s9, s7 911; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 912; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 913; GFX1064-NEXT: 
buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 914; GFX1064-NEXT: s_waitcnt vmcnt(0) 915; GFX1064-NEXT: buffer_gl0_inv 916; GFX1064-NEXT: buffer_gl1_inv 917; GFX1064-NEXT: .LBB4_2: 918; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 919; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 920; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 921; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 922; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 923; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 924; GFX1064-NEXT: s_mov_b32 s6, -1 925; GFX1064-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[0:1] 926; GFX1064-NEXT: v_mad_u64_u32 v[1:2], null, s3, v2, v[1:2] 927; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 928; GFX1064-NEXT: s_endpgm 929; 930; GFX1032-LABEL: add_i64_uniform: 931; GFX1032: ; %bb.0: ; %entry 932; GFX1032-NEXT: s_clause 0x1 933; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 934; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 935; GFX1032-NEXT: s_mov_b32 s8, exec_lo 936; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 937; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 938; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 939; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 940; GFX1032-NEXT: s_cbranch_execz .LBB4_2 941; GFX1032-NEXT: ; %bb.1: 942; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 943; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 944; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 945; GFX1032-NEXT: s_mul_i32 s8, s3, s1 946; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 947; GFX1032-NEXT: s_mul_i32 s1, s2, s1 948; GFX1032-NEXT: s_add_i32 s9, s9, s8 949; GFX1032-NEXT: v_mov_b32_e32 v0, s1 950; GFX1032-NEXT: v_mov_b32_e32 v1, s9 951; GFX1032-NEXT: s_mov_b32 s10, -1 952; GFX1032-NEXT: s_mov_b32 s8, s6 953; GFX1032-NEXT: s_mov_b32 s9, s7 954; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 955; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 956; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 957; GFX1032-NEXT: s_waitcnt vmcnt(0) 958; GFX1032-NEXT: buffer_gl0_inv 959; GFX1032-NEXT: buffer_gl1_inv 960; GFX1032-NEXT: .LBB4_2: 
961; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 962; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 963; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 964; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 965; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 966; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 967; GFX1032-NEXT: s_mov_b32 s6, -1 968; GFX1032-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[0:1] 969; GFX1032-NEXT: v_mad_u64_u32 v[1:2], null, s3, v2, v[1:2] 970; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 971; GFX1032-NEXT: s_endpgm 972entry: 973 %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel 974 store i64 %old, i64 addrspace(1)* %out 975 ret void 976} 977; 64-bit add whose addend is the zero-extended llvm.amdgcn.workitem.id.x value. For every check prefix the expected code is a plain buffer_atomic_add_x2 on v[0:1] with no v_mbcnt / s_and_saveexec guard around it, unlike the constant and uniform i64 cases in this file. 978define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 979; GFX7LESS-LABEL: add_i64_varying: 980; GFX7LESS: ; %bb.0: ; %entry 981; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 982; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 983; GFX7LESS-NEXT: s_mov_b32 s6, -1 984; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 985; GFX7LESS-NEXT: s_mov_b32 s10, s6 986; GFX7LESS-NEXT: s_mov_b32 s11, s7 987; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 988; GFX7LESS-NEXT: s_mov_b32 s8, s2 989; GFX7LESS-NEXT: s_mov_b32 s9, s3 990; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 991; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 992; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 993; GFX7LESS-NEXT: buffer_wbinvl1 994; GFX7LESS-NEXT: s_mov_b32 s4, s0 995; GFX7LESS-NEXT: s_mov_b32 s5, s1 996; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 997; GFX7LESS-NEXT: s_endpgm 998; 999; GFX89-LABEL: add_i64_varying: 1000; GFX89: ; %bb.0: ; %entry 1001; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1002; GFX89-NEXT: s_mov_b32 s7, 0xf000 1003; GFX89-NEXT: s_mov_b32 s6, -1 1004; GFX89-NEXT: s_mov_b32 s10, s6 1005; GFX89-NEXT: s_mov_b32 s11, s7 1006; GFX89-NEXT: s_waitcnt lgkmcnt(0) 1007; GFX89-NEXT: s_mov_b32 s8, s2 1008; GFX89-NEXT: s_mov_b32 s9, s3 1009; GFX89-NEXT: v_mov_b32_e32 v1, 0 1010; GFX89-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 1011; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1012; GFX89-NEXT: s_waitcnt vmcnt(0) 1013; GFX89-NEXT: buffer_wbinvl1_vol 1014; GFX89-NEXT: s_mov_b32 s4, s0 1015; GFX89-NEXT: s_mov_b32 s5, s1 1016; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1017; GFX89-NEXT: s_endpgm 1018; 1019; GFX10-LABEL: add_i64_varying: 1020; GFX10: ; %bb.0: ; %entry 1021; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1022; GFX10-NEXT: v_mov_b32_e32 v1, 0 1023; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1024; GFX10-NEXT: s_mov_b32 s6, -1 1025; GFX10-NEXT: s_mov_b32 s11, s7 1026; GFX10-NEXT: s_mov_b32 s10, s6 1027; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1028; GFX10-NEXT: s_mov_b32 s8, s2 1029; GFX10-NEXT: s_mov_b32 s9, s3 1030; GFX10-NEXT: s_mov_b32 s4, s0 1031; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1032; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1033; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1034; GFX10-NEXT: s_waitcnt vmcnt(0) 1035; GFX10-NEXT: buffer_gl0_inv 1036; GFX10-NEXT: buffer_gl1_inv 1037; GFX10-NEXT: s_mov_b32 s5, s1 1038; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1039; GFX10-NEXT: s_endpgm 1040entry: 1041 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1042 %zext = zext i32 %lane to i64 1043 %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel 1044 store i64 %old, i64 addrspace(1)* %out 1045 ret void 1046} 1047; 32-bit sub of the constant 5. The expected code lets only the lane whose v_mbcnt count is 0 issue one buffer_atomic_sub of 5 times the s_bcnt1 count of the saved exec mask, then every lane computes the v_readfirstlane value minus 5 times its own v_mbcnt count. 1048define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 1049; GFX7LESS-LABEL: sub_i32_constant: 1050; GFX7LESS: ; %bb.0: ; %entry 1051; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1052; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1053; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1054; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1055; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1056; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1057; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1058; GFX7LESS-NEXT: s_cbranch_execz .LBB6_2 1059; 
GFX7LESS-NEXT: ; %bb.1: 1060; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 1061; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1062; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 1063; GFX7LESS-NEXT: s_mov_b32 s10, -1 1064; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1065; GFX7LESS-NEXT: s_mov_b32 s8, s2 1066; GFX7LESS-NEXT: s_mov_b32 s9, s3 1067; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1068; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1069; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1070; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1071; GFX7LESS-NEXT: buffer_wbinvl1 1072; GFX7LESS-NEXT: .LBB6_2: 1073; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1074; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1075; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1076; GFX7LESS-NEXT: s_mov_b32 s2, -1 1077; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1078; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1079; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1080; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1081; GFX7LESS-NEXT: s_endpgm 1082; 1083; GFX8-LABEL: sub_i32_constant: 1084; GFX8: ; %bb.0: ; %entry 1085; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1086; GFX8-NEXT: s_mov_b64 s[6:7], exec 1087; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1088; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1089; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1090; GFX8-NEXT: ; implicit-def: $vgpr1 1091; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1092; GFX8-NEXT: s_cbranch_execz .LBB6_2 1093; GFX8-NEXT: ; %bb.1: 1094; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1095; GFX8-NEXT: s_mov_b32 s8, s2 1096; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1097; GFX8-NEXT: s_mul_i32 s2, s2, 5 1098; GFX8-NEXT: s_mov_b32 s11, 0xf000 1099; GFX8-NEXT: s_mov_b32 s10, -1 1100; GFX8-NEXT: s_mov_b32 s9, s3 1101; GFX8-NEXT: v_mov_b32_e32 v1, s2 1102; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1103; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1104; GFX8-NEXT: s_waitcnt vmcnt(0) 1105; GFX8-NEXT: buffer_wbinvl1_vol 1106; GFX8-NEXT: .LBB6_2: 1107; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1108; 
GFX8-NEXT: v_readfirstlane_b32 s4, v1 1109; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1110; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1111; GFX8-NEXT: s_mov_b32 s3, 0xf000 1112; GFX8-NEXT: s_mov_b32 s2, -1 1113; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 1114; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1115; GFX8-NEXT: s_endpgm 1116; 1117; GFX9-LABEL: sub_i32_constant: 1118; GFX9: ; %bb.0: ; %entry 1119; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1120; GFX9-NEXT: s_mov_b64 s[6:7], exec 1121; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1122; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1123; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1124; GFX9-NEXT: ; implicit-def: $vgpr1 1125; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1126; GFX9-NEXT: s_cbranch_execz .LBB6_2 1127; GFX9-NEXT: ; %bb.1: 1128; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1129; GFX9-NEXT: s_mov_b32 s8, s2 1130; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1131; GFX9-NEXT: s_mul_i32 s2, s2, 5 1132; GFX9-NEXT: s_mov_b32 s11, 0xf000 1133; GFX9-NEXT: s_mov_b32 s10, -1 1134; GFX9-NEXT: s_mov_b32 s9, s3 1135; GFX9-NEXT: v_mov_b32_e32 v1, s2 1136; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1137; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1138; GFX9-NEXT: s_waitcnt vmcnt(0) 1139; GFX9-NEXT: buffer_wbinvl1_vol 1140; GFX9-NEXT: .LBB6_2: 1141; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1142; GFX9-NEXT: v_readfirstlane_b32 s4, v1 1143; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1144; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1145; GFX9-NEXT: s_mov_b32 s3, 0xf000 1146; GFX9-NEXT: s_mov_b32 s2, -1 1147; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1148; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1149; GFX9-NEXT: s_endpgm 1150; 1151; GFX1064-LABEL: sub_i32_constant: 1152; GFX1064: ; %bb.0: ; %entry 1153; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1154; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1155; GFX1064-NEXT: ; implicit-def: $vgpr1 1156; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1157; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1158; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 1159; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1160; GFX1064-NEXT: s_cbranch_execz .LBB6_2 1161; GFX1064-NEXT: ; %bb.1: 1162; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1163; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 1164; GFX1064-NEXT: s_mul_i32 s6, s6, 5 1165; GFX1064-NEXT: s_mov_b32 s10, -1 1166; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1167; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1168; GFX1064-NEXT: s_mov_b32 s8, s2 1169; GFX1064-NEXT: s_mov_b32 s9, s3 1170; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1171; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1172; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1173; GFX1064-NEXT: s_waitcnt vmcnt(0) 1174; GFX1064-NEXT: buffer_gl0_inv 1175; GFX1064-NEXT: buffer_gl1_inv 1176; GFX1064-NEXT: .LBB6_2: 1177; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1178; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1179; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1180; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1181; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1182; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1183; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1184; GFX1064-NEXT: s_mov_b32 s2, -1 1185; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1186; GFX1064-NEXT: s_endpgm 1187; 1188; GFX1032-LABEL: sub_i32_constant: 1189; GFX1032: ; %bb.0: ; %entry 1190; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1191; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1192; GFX1032-NEXT: ; implicit-def: $vgpr1 1193; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 1194; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1195; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1196; GFX1032-NEXT: s_cbranch_execz .LBB6_2 1197; GFX1032-NEXT: ; %bb.1: 1198; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1199; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 1200; GFX1032-NEXT: s_mul_i32 s5, s5, 5 1201; GFX1032-NEXT: s_mov_b32 s10, -1 1202; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1203; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1204; GFX1032-NEXT: s_mov_b32 s8, s2 1205; GFX1032-NEXT: s_mov_b32 s9, s3 1206; 
GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1207; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1208; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1209; GFX1032-NEXT: s_waitcnt vmcnt(0) 1210; GFX1032-NEXT: buffer_gl0_inv 1211; GFX1032-NEXT: buffer_gl1_inv 1212; GFX1032-NEXT: .LBB6_2: 1213; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1214; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1215; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1216; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1217; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1218; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1219; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1220; GFX1032-NEXT: s_mov_b32 s2, -1 1221; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1222; GFX1032-NEXT: s_endpgm 1223entry: 1224 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel 1225 store i32 %old, i32 addrspace(1)* %out 1226 ret void 1227} 1228; 32-bit sub whose operand is the kernel argument %subitive. The expected code lets only the lane whose v_mbcnt count is 0 issue one buffer_atomic_sub of %subitive times the s_bcnt1 count of the saved exec mask, then every lane computes the v_readfirstlane value minus %subitive times its own v_mbcnt count. 1229define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) { 1230; GFX7LESS-LABEL: sub_i32_uniform: 1231; GFX7LESS: ; %bb.0: ; %entry 1232; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1233; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1234; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd 1235; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1236; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1237; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1238; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1239; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1240; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 1241; GFX7LESS-NEXT: ; %bb.1: 1242; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 1243; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1244; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1245; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 1246; GFX7LESS-NEXT: s_mov_b32 s14, -1 1247; GFX7LESS-NEXT: s_mov_b32 s12, s6 1248; GFX7LESS-NEXT: s_mov_b32 s13, s7 1249; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 1250; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1251; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 
1252; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1253; GFX7LESS-NEXT: buffer_wbinvl1 1254; GFX7LESS-NEXT: .LBB7_2: 1255; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1256; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1257; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1258; GFX7LESS-NEXT: s_mov_b32 s6, -1 1259; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1260; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 1261; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1262; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1263; GFX7LESS-NEXT: s_endpgm 1264; 1265; GFX8-LABEL: sub_i32_uniform: 1266; GFX8: ; %bb.0: ; %entry 1267; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1268; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 1269; GFX8-NEXT: s_mov_b64 s[2:3], exec 1270; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1271; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1272; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1273; GFX8-NEXT: ; implicit-def: $vgpr1 1274; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1275; GFX8-NEXT: s_cbranch_execz .LBB7_2 1276; GFX8-NEXT: ; %bb.1: 1277; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1278; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1279; GFX8-NEXT: s_mul_i32 s2, s8, s2 1280; GFX8-NEXT: s_mov_b32 s15, 0xf000 1281; GFX8-NEXT: s_mov_b32 s14, -1 1282; GFX8-NEXT: s_mov_b32 s12, s6 1283; GFX8-NEXT: s_mov_b32 s13, s7 1284; GFX8-NEXT: v_mov_b32_e32 v1, s2 1285; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1286; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1287; GFX8-NEXT: s_waitcnt vmcnt(0) 1288; GFX8-NEXT: buffer_wbinvl1_vol 1289; GFX8-NEXT: .LBB7_2: 1290; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1291; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1292; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 1293; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1294; GFX8-NEXT: s_mov_b32 s7, 0xf000 1295; GFX8-NEXT: s_mov_b32 s6, -1 1296; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1297; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1298; GFX8-NEXT: s_endpgm 1299; 1300; GFX9-LABEL: sub_i32_uniform: 1301; GFX9: ; %bb.0: ; %entry 1302; GFX9-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 1303; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 1304; GFX9-NEXT: s_mov_b64 s[2:3], exec 1305; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1306; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1307; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1308; GFX9-NEXT: ; implicit-def: $vgpr1 1309; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1310; GFX9-NEXT: s_cbranch_execz .LBB7_2 1311; GFX9-NEXT: ; %bb.1: 1312; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1313; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1314; GFX9-NEXT: s_mul_i32 s2, s8, s2 1315; GFX9-NEXT: s_mov_b32 s15, 0xf000 1316; GFX9-NEXT: s_mov_b32 s14, -1 1317; GFX9-NEXT: s_mov_b32 s12, s6 1318; GFX9-NEXT: s_mov_b32 s13, s7 1319; GFX9-NEXT: v_mov_b32_e32 v1, s2 1320; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1321; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1322; GFX9-NEXT: s_waitcnt vmcnt(0) 1323; GFX9-NEXT: buffer_wbinvl1_vol 1324; GFX9-NEXT: .LBB7_2: 1325; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1326; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1327; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 1328; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1329; GFX9-NEXT: s_mov_b32 s7, 0xf000 1330; GFX9-NEXT: s_mov_b32 s6, -1 1331; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1332; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1333; GFX9-NEXT: s_endpgm 1334; 1335; GFX1064-LABEL: sub_i32_uniform: 1336; GFX1064: ; %bb.0: ; %entry 1337; GFX1064-NEXT: s_clause 0x1 1338; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1339; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 1340; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1341; GFX1064-NEXT: ; implicit-def: $vgpr1 1342; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1343; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1344; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1345; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1346; GFX1064-NEXT: s_cbranch_execz .LBB7_2 1347; GFX1064-NEXT: ; %bb.1: 1348; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1349; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 1350; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1351; GFX1064-NEXT: s_mul_i32 
s2, s8, s2 1352; GFX1064-NEXT: s_mov_b32 s14, -1 1353; GFX1064-NEXT: v_mov_b32_e32 v1, s2 1354; GFX1064-NEXT: s_mov_b32 s12, s6 1355; GFX1064-NEXT: s_mov_b32 s13, s7 1356; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1357; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1358; GFX1064-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1359; GFX1064-NEXT: s_waitcnt vmcnt(0) 1360; GFX1064-NEXT: buffer_gl0_inv 1361; GFX1064-NEXT: buffer_gl1_inv 1362; GFX1064-NEXT: .LBB7_2: 1363; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1364; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1365; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1366; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 1367; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1368; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1369; GFX1064-NEXT: s_mov_b32 s6, -1 1370; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1371; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1372; GFX1064-NEXT: s_endpgm 1373; 1374; GFX1032-LABEL: sub_i32_uniform: 1375; GFX1032: ; %bb.0: ; %entry 1376; GFX1032-NEXT: s_clause 0x1 1377; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1378; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 1379; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1380; GFX1032-NEXT: ; implicit-def: $vgpr1 1381; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1382; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1383; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1384; GFX1032-NEXT: s_cbranch_execz .LBB7_2 1385; GFX1032-NEXT: ; %bb.1: 1386; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1387; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 1388; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1389; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1390; GFX1032-NEXT: s_mov_b32 s10, -1 1391; GFX1032-NEXT: v_mov_b32_e32 v1, s1 1392; GFX1032-NEXT: s_mov_b32 s8, s6 1393; GFX1032-NEXT: s_mov_b32 s9, s7 1394; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1395; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1396; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1397; GFX1032-NEXT: s_waitcnt vmcnt(0) 1398; GFX1032-NEXT: buffer_gl0_inv 1399; 
GFX1032-NEXT: buffer_gl1_inv 1400; GFX1032-NEXT: .LBB7_2: 1401; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1402; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1403; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1404; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1405; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1406; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1407; GFX1032-NEXT: s_mov_b32 s6, -1 1408; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1409; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1410; GFX1032-NEXT: s_endpgm 1411entry: 1412 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel 1413 store i32 %old, i32 addrspace(1)* %out 1414 ret void 1415} 1416 1417define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 1418; GFX7LESS-LABEL: sub_i32_varying: 1419; GFX7LESS: ; %bb.0: ; %entry 1420; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1421; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1422; GFX7LESS-NEXT: s_mov_b32 s6, -1 1423; GFX7LESS-NEXT: s_mov_b32 s10, s6 1424; GFX7LESS-NEXT: s_mov_b32 s11, s7 1425; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1426; GFX7LESS-NEXT: s_mov_b32 s8, s2 1427; GFX7LESS-NEXT: s_mov_b32 s9, s3 1428; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1429; GFX7LESS-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1430; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1431; GFX7LESS-NEXT: buffer_wbinvl1 1432; GFX7LESS-NEXT: s_mov_b32 s4, s0 1433; GFX7LESS-NEXT: s_mov_b32 s5, s1 1434; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1435; GFX7LESS-NEXT: s_endpgm 1436; 1437; GFX8-LABEL: sub_i32_varying: 1438; GFX8: ; %bb.0: ; %entry 1439; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1440; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1441; GFX8-NEXT: v_mov_b32_e32 v1, 0 1442; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1443; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1444; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1445; GFX8-NEXT: v_mov_b32_e32 v2, v0 1446; GFX8-NEXT: s_not_b64 exec, exec 1447; GFX8-NEXT: v_mov_b32_e32 v2, 0 1448; GFX8-NEXT: 
s_not_b64 exec, exec 1449; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1450; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1451; GFX8-NEXT: s_nop 1 1452; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1453; GFX8-NEXT: s_nop 1 1454; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1455; GFX8-NEXT: s_nop 1 1456; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1457; GFX8-NEXT: s_nop 1 1458; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1459; GFX8-NEXT: s_nop 1 1460; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1461; GFX8-NEXT: v_readlane_b32 s6, v2, 63 1462; GFX8-NEXT: s_nop 0 1463; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1464; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1465; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1466; GFX8-NEXT: ; implicit-def: $vgpr0 1467; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1468; GFX8-NEXT: s_cbranch_execz .LBB8_2 1469; GFX8-NEXT: ; %bb.1: 1470; GFX8-NEXT: s_mov_b32 s11, 0xf000 1471; GFX8-NEXT: s_mov_b32 s10, -1 1472; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1473; GFX8-NEXT: s_mov_b32 s8, s2 1474; GFX8-NEXT: s_mov_b32 s9, s3 1475; GFX8-NEXT: v_mov_b32_e32 v0, s6 1476; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1477; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1478; GFX8-NEXT: s_waitcnt vmcnt(0) 1479; GFX8-NEXT: buffer_wbinvl1_vol 1480; GFX8-NEXT: .LBB8_2: 1481; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1482; GFX8-NEXT: v_readfirstlane_b32 s4, v0 1483; GFX8-NEXT: v_mov_b32_e32 v0, v1 1484; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1485; GFX8-NEXT: s_mov_b32 s3, 0xf000 1486; GFX8-NEXT: s_mov_b32 s2, -1 1487; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 1488; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1489; GFX8-NEXT: s_endpgm 1490; 1491; GFX9-LABEL: sub_i32_varying: 1492; GFX9: ; %bb.0: ; %entry 
1493; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1494; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1495; GFX9-NEXT: v_mov_b32_e32 v1, 0 1496; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1497; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1498; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1499; GFX9-NEXT: v_mov_b32_e32 v2, v0 1500; GFX9-NEXT: s_not_b64 exec, exec 1501; GFX9-NEXT: v_mov_b32_e32 v2, 0 1502; GFX9-NEXT: s_not_b64 exec, exec 1503; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1504; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1505; GFX9-NEXT: s_nop 1 1506; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1507; GFX9-NEXT: s_nop 1 1508; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1509; GFX9-NEXT: s_nop 1 1510; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1511; GFX9-NEXT: s_nop 1 1512; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1513; GFX9-NEXT: s_nop 1 1514; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1515; GFX9-NEXT: v_readlane_b32 s6, v2, 63 1516; GFX9-NEXT: s_nop 0 1517; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1518; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1519; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1520; GFX9-NEXT: ; implicit-def: $vgpr0 1521; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1522; GFX9-NEXT: s_cbranch_execz .LBB8_2 1523; GFX9-NEXT: ; %bb.1: 1524; GFX9-NEXT: s_mov_b32 s11, 0xf000 1525; GFX9-NEXT: s_mov_b32 s10, -1 1526; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1527; GFX9-NEXT: s_mov_b32 s8, s2 1528; GFX9-NEXT: s_mov_b32 s9, s3 1529; GFX9-NEXT: v_mov_b32_e32 v0, s6 1530; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1531; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1532; GFX9-NEXT: s_waitcnt vmcnt(0) 1533; GFX9-NEXT: buffer_wbinvl1_vol 1534; GFX9-NEXT: .LBB8_2: 1535; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1536; 
GFX9-NEXT: v_readfirstlane_b32 s4, v0 1537; GFX9-NEXT: v_mov_b32_e32 v0, v1 1538; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1539; GFX9-NEXT: s_mov_b32 s3, 0xf000 1540; GFX9-NEXT: s_mov_b32 s2, -1 1541; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1542; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1543; GFX9-NEXT: s_endpgm 1544; 1545; GFX1064-LABEL: sub_i32_varying: 1546; GFX1064: ; %bb.0: ; %entry 1547; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1548; GFX1064-NEXT: s_not_b64 exec, exec 1549; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1550; GFX1064-NEXT: s_not_b64 exec, exec 1551; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1552; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1553; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1554; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1555; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1556; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1557; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1558; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1559; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1560; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 1561; GFX1064-NEXT: v_mov_b32_e32 v2, s4 1562; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1563; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 1564; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1565; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1566; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1567; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1568; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 1569; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 1570; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1571; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1572; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1573; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 1574; GFX1064-NEXT: v_readlane_b32 s9, 
v1, 63 1575; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 1576; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1577; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1578; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 1579; GFX1064-NEXT: s_mov_b32 s4, s9 1580; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 1581; GFX1064-NEXT: s_mov_b64 exec, s[6:7] 1582; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1583; GFX1064-NEXT: s_mov_b32 s6, -1 1584; GFX1064-NEXT: ; implicit-def: $vgpr0 1585; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc 1586; GFX1064-NEXT: s_cbranch_execz .LBB8_2 1587; GFX1064-NEXT: ; %bb.1: 1588; GFX1064-NEXT: v_mov_b32_e32 v0, s4 1589; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1590; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1591; GFX1064-NEXT: s_mov_b32 s4, s2 1592; GFX1064-NEXT: s_mov_b32 s5, s3 1593; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1594; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1595; GFX1064-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 1596; GFX1064-NEXT: s_waitcnt vmcnt(0) 1597; GFX1064-NEXT: buffer_gl0_inv 1598; GFX1064-NEXT: buffer_gl1_inv 1599; GFX1064-NEXT: .LBB8_2: 1600; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1601; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] 1602; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1603; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1604; GFX1064-NEXT: v_mov_b32_e32 v0, v3 1605; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1606; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1607; GFX1064-NEXT: s_mov_b32 s2, s6 1608; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1609; GFX1064-NEXT: s_endpgm 1610; 1611; GFX1032-LABEL: sub_i32_varying: 1612; GFX1032: ; %bb.0: ; %entry 1613; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1614; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1615; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1616; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1617; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1618; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1619; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 1620; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1621; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1622; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1623; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1624; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1625; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1626; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 1627; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1628; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1629; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 1630; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 1631; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1632; GFX1032-NEXT: s_mov_b32 exec_lo, s4 1633; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1634; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 1635; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 1636; GFX1032-NEXT: s_mov_b32 exec_lo, s4 1637; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1638; GFX1032-NEXT: s_mov_b32 s4, s6 1639; GFX1032-NEXT: s_mov_b32 s6, -1 1640; GFX1032-NEXT: ; implicit-def: $vgpr0 1641; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo 1642; GFX1032-NEXT: s_cbranch_execz .LBB8_2 1643; GFX1032-NEXT: ; %bb.1: 1644; GFX1032-NEXT: v_mov_b32_e32 v0, s4 1645; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1646; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1647; GFX1032-NEXT: s_mov_b32 s4, s2 1648; GFX1032-NEXT: s_mov_b32 s5, s3 1649; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1650; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1651; GFX1032-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 1652; GFX1032-NEXT: s_waitcnt vmcnt(0) 1653; GFX1032-NEXT: buffer_gl0_inv 1654; GFX1032-NEXT: buffer_gl1_inv 1655; GFX1032-NEXT: .LBB8_2: 1656; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1657; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 1658; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1659; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 1660; GFX1032-NEXT: v_mov_b32_e32 v0, v3 1661; 
GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1662; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1663; GFX1032-NEXT: s_mov_b32 s2, s6 1664; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1665; GFX1032-NEXT: s_endpgm 1666entry: 1667 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1668 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel 1669 store i32 %old, i32 addrspace(1)* %out 1670 ret void 1671} 1672 1673define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 1674; GFX7LESS-LABEL: sub_i64_constant: 1675; GFX7LESS: ; %bb.0: ; %entry 1676; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1677; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1678; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1679; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 1680; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1681; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1682; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1683; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2 1684; GFX7LESS-NEXT: ; %bb.1: 1685; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 1686; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1687; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 1688; GFX7LESS-NEXT: s_mov_b32 s10, -1 1689; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1690; GFX7LESS-NEXT: s_mov_b32 s8, s2 1691; GFX7LESS-NEXT: s_mov_b32 s9, s3 1692; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1693; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1694; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1695; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 1696; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1697; GFX7LESS-NEXT: buffer_wbinvl1 1698; GFX7LESS-NEXT: .LBB9_2: 1699; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1700; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1701; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1702; GFX7LESS-NEXT: s_mov_b32 s2, -1 1703; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 1704; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 1705; GFX7LESS-NEXT: s_waitcnt expcnt(0) 1706; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1707; 
GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1708; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 1709; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1710; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1711; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1712; GFX7LESS-NEXT: s_endpgm 1713; 1714; GFX8-LABEL: sub_i64_constant: 1715; GFX8: ; %bb.0: ; %entry 1716; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1717; GFX8-NEXT: s_mov_b64 s[6:7], exec 1718; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1719; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1720; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1721; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1722; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1723; GFX8-NEXT: s_cbranch_execz .LBB9_2 1724; GFX8-NEXT: ; %bb.1: 1725; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1726; GFX8-NEXT: s_mov_b32 s8, s2 1727; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1728; GFX8-NEXT: s_mul_i32 s2, s2, 5 1729; GFX8-NEXT: s_mov_b32 s11, 0xf000 1730; GFX8-NEXT: s_mov_b32 s10, -1 1731; GFX8-NEXT: s_mov_b32 s9, s3 1732; GFX8-NEXT: v_mov_b32_e32 v0, s2 1733; GFX8-NEXT: v_mov_b32_e32 v1, 0 1734; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1735; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 1736; GFX8-NEXT: s_waitcnt vmcnt(0) 1737; GFX8-NEXT: buffer_wbinvl1_vol 1738; GFX8-NEXT: .LBB9_2: 1739; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1740; GFX8-NEXT: v_readfirstlane_b32 s4, v0 1741; GFX8-NEXT: v_readfirstlane_b32 s5, v1 1742; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1743; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1744; GFX8-NEXT: v_mov_b32_e32 v2, s5 1745; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 1746; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1747; GFX8-NEXT: s_mov_b32 s3, 0xf000 1748; GFX8-NEXT: s_mov_b32 s2, -1 1749; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1750; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1751; GFX8-NEXT: s_endpgm 1752; 1753; GFX9-LABEL: sub_i64_constant: 1754; GFX9: ; %bb.0: ; %entry 1755; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1756; 
GFX9-NEXT: s_mov_b64 s[6:7], exec 1757; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1758; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1759; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1760; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1761; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1762; GFX9-NEXT: s_cbranch_execz .LBB9_2 1763; GFX9-NEXT: ; %bb.1: 1764; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1765; GFX9-NEXT: s_mov_b32 s8, s2 1766; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1767; GFX9-NEXT: s_mul_i32 s2, s2, 5 1768; GFX9-NEXT: s_mov_b32 s11, 0xf000 1769; GFX9-NEXT: s_mov_b32 s10, -1 1770; GFX9-NEXT: s_mov_b32 s9, s3 1771; GFX9-NEXT: v_mov_b32_e32 v0, s2 1772; GFX9-NEXT: v_mov_b32_e32 v1, 0 1773; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1774; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 1775; GFX9-NEXT: s_waitcnt vmcnt(0) 1776; GFX9-NEXT: buffer_wbinvl1_vol 1777; GFX9-NEXT: .LBB9_2: 1778; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1779; GFX9-NEXT: v_readfirstlane_b32 s4, v0 1780; GFX9-NEXT: v_readfirstlane_b32 s5, v1 1781; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1782; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1783; GFX9-NEXT: v_mov_b32_e32 v2, s5 1784; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 1785; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1786; GFX9-NEXT: s_mov_b32 s3, 0xf000 1787; GFX9-NEXT: s_mov_b32 s2, -1 1788; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 1789; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1790; GFX9-NEXT: s_endpgm 1791; 1792; GFX1064-LABEL: sub_i64_constant: 1793; GFX1064: ; %bb.0: ; %entry 1794; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1795; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1796; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1797; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1798; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1799; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1800; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1801; GFX1064-NEXT: s_cbranch_execz .LBB9_2 1802; GFX1064-NEXT: ; %bb.1: 1803; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1804; 
GFX1064-NEXT: v_mov_b32_e32 v1, 0 1805; GFX1064-NEXT: s_mul_i32 s6, s6, 5 1806; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 1807; GFX1064-NEXT: v_mov_b32_e32 v0, s6 1808; GFX1064-NEXT: s_mov_b32 s10, -1 1809; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1810; GFX1064-NEXT: s_mov_b32 s8, s2 1811; GFX1064-NEXT: s_mov_b32 s9, s3 1812; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1813; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1814; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 1815; GFX1064-NEXT: s_waitcnt vmcnt(0) 1816; GFX1064-NEXT: buffer_gl0_inv 1817; GFX1064-NEXT: buffer_gl1_inv 1818; GFX1064-NEXT: .LBB9_2: 1819; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1820; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1821; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1822; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1823; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1824; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 1825; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1826; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 1827; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 1828; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1829; GFX1064-NEXT: s_mov_b32 s2, -1 1830; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1831; GFX1064-NEXT: s_endpgm 1832; 1833; GFX1032-LABEL: sub_i64_constant: 1834; GFX1032: ; %bb.0: ; %entry 1835; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1836; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1837; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1838; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1839; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1840; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1841; GFX1032-NEXT: s_cbranch_execz .LBB9_2 1842; GFX1032-NEXT: ; %bb.1: 1843; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1844; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1845; GFX1032-NEXT: s_mul_i32 s5, s5, 5 1846; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 1847; GFX1032-NEXT: v_mov_b32_e32 v0, s5 1848; GFX1032-NEXT: s_mov_b32 s10, -1 1849; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1850; GFX1032-NEXT: s_mov_b32 s8, 
s2 1851; GFX1032-NEXT: s_mov_b32 s9, s3 1852; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1853; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1854; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 1855; GFX1032-NEXT: s_waitcnt vmcnt(0) 1856; GFX1032-NEXT: buffer_gl0_inv 1857; GFX1032-NEXT: buffer_gl1_inv 1858; GFX1032-NEXT: .LBB9_2: 1859; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1860; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1861; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1862; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 1863; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1864; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 1865; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1866; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 1867; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 1868; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1869; GFX1032-NEXT: s_mov_b32 s2, -1 1870; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1871; GFX1032-NEXT: s_endpgm 1872entry: 1873 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel 1874 store i64 %old, i64 addrspace(1)* %out 1875 ret void 1876} 1877 1878define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) { 1879; GFX7LESS-LABEL: sub_i64_uniform: 1880; GFX7LESS: ; %bb.0: ; %entry 1881; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec 1882; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1883; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1884; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 1885; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 1886; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1887; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1888; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1889; GFX7LESS-NEXT: s_cbranch_execz .LBB10_2 1890; GFX7LESS-NEXT: ; %bb.1: 1891; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 1892; GFX7LESS-NEXT: s_mov_b32 s14, -1 1893; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1894; GFX7LESS-NEXT: s_mov_b32 s12, s6 1895; GFX7LESS-NEXT: s_mov_b32 s13, s7 1896; 
GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 1897; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 1898; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1899; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 1900; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 1901; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 1902; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1903; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1904; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc 1905; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1906; GFX7LESS-NEXT: buffer_wbinvl1 1907; GFX7LESS-NEXT: .LBB10_2: 1908; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1909; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1910; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1911; GFX7LESS-NEXT: s_mov_b32 s6, -1 1912; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 1913; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 1914; GFX7LESS-NEXT: s_waitcnt expcnt(0) 1915; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 1916; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 1917; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 1918; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 1919; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 1920; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v2 1921; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 1922; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1923; GFX7LESS-NEXT: s_endpgm 1924; 1925; GFX8-LABEL: sub_i64_uniform: 1926; GFX8: ; %bb.0: ; %entry 1927; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1928; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1929; GFX8-NEXT: s_mov_b64 s[8:9], exec 1930; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 1931; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 1932; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1933; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1934; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1935; GFX8-NEXT: s_cbranch_execz .LBB10_2 1936; GFX8-NEXT: ; %bb.1: 1937; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1938; GFX8-NEXT: s_mov_b32 s12, s6 1939; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 1940; GFX8-NEXT: v_mov_b32_e32 v0, s6 1941; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, 
v0, 0 1942; GFX8-NEXT: s_mul_i32 s6, s1, s6 1943; GFX8-NEXT: s_mov_b32 s15, 0xf000 1944; GFX8-NEXT: s_mov_b32 s14, -1 1945; GFX8-NEXT: s_mov_b32 s13, s7 1946; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 1947; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1948; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc 1949; GFX8-NEXT: s_waitcnt vmcnt(0) 1950; GFX8-NEXT: buffer_wbinvl1_vol 1951; GFX8-NEXT: .LBB10_2: 1952; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1953; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1954; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2 1955; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0 1956; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1957; GFX8-NEXT: v_readfirstlane_b32 s1, v1 1958; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 1959; GFX8-NEXT: v_mov_b32_e32 v3, s1 1960; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 1961; GFX8-NEXT: s_mov_b32 s7, 0xf000 1962; GFX8-NEXT: s_mov_b32 s6, -1 1963; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 1964; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1965; GFX8-NEXT: s_endpgm 1966; 1967; GFX9-LABEL: sub_i64_uniform: 1968; GFX9: ; %bb.0: ; %entry 1969; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1970; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1971; GFX9-NEXT: s_mov_b64 s[8:9], exec 1972; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 1973; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 1974; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1975; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1976; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1977; GFX9-NEXT: s_cbranch_execz .LBB10_2 1978; GFX9-NEXT: ; %bb.1: 1979; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1980; GFX9-NEXT: s_mov_b32 s12, s6 1981; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 1982; GFX9-NEXT: s_mov_b32 s13, s7 1983; GFX9-NEXT: s_mul_i32 s7, s3, s6 1984; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1985; GFX9-NEXT: s_add_i32 s8, s8, s7 1986; GFX9-NEXT: s_mul_i32 s6, s2, s6 1987; GFX9-NEXT: s_mov_b32 s15, 0xf000 1988; GFX9-NEXT: s_mov_b32 s14, -1 1989; GFX9-NEXT: v_mov_b32_e32 v0, s6 1990; GFX9-NEXT: v_mov_b32_e32 v1, s8 
1991; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1992; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc 1993; GFX9-NEXT: s_waitcnt vmcnt(0) 1994; GFX9-NEXT: buffer_wbinvl1_vol 1995; GFX9-NEXT: .LBB10_2: 1996; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1997; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1998; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 1999; GFX9-NEXT: s_mov_b32 s7, 0xf000 2000; GFX9-NEXT: s_mov_b32 s6, -1 2001; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5] 2002; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2003; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2004; GFX9-NEXT: v_mov_b32_e32 v1, v4 2005; GFX9-NEXT: v_mov_b32_e32 v2, s1 2006; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 2007; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2008; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2009; GFX9-NEXT: s_endpgm 2010; 2011; GFX1064-LABEL: sub_i64_uniform: 2012; GFX1064: ; %bb.0: ; %entry 2013; GFX1064-NEXT: s_clause 0x1 2014; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2015; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2016; GFX1064-NEXT: s_mov_b64 s[8:9], exec 2017; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 2018; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 2019; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2020; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2021; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 2022; GFX1064-NEXT: s_cbranch_execz .LBB10_2 2023; GFX1064-NEXT: ; %bb.1: 2024; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] 2025; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 2026; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2027; GFX1064-NEXT: s_mul_i32 s9, s3, s8 2028; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 2029; GFX1064-NEXT: s_mul_i32 s8, s2, s8 2030; GFX1064-NEXT: s_add_i32 s10, s10, s9 2031; GFX1064-NEXT: v_mov_b32_e32 v0, s8 2032; GFX1064-NEXT: v_mov_b32_e32 v1, s10 2033; GFX1064-NEXT: s_mov_b32 s10, -1 2034; GFX1064-NEXT: s_mov_b32 s8, s6 2035; GFX1064-NEXT: s_mov_b32 s9, s7 2036; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2037; GFX1064-NEXT: 
s_waitcnt_vscnt null, 0x0 2038; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2039; GFX1064-NEXT: s_waitcnt vmcnt(0) 2040; GFX1064-NEXT: buffer_gl0_inv 2041; GFX1064-NEXT: buffer_gl1_inv 2042; GFX1064-NEXT: .LBB10_2: 2043; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2044; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 2045; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2046; GFX1064-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 2047; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 2048; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 2049; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 2050; GFX1064-NEXT: s_mov_b32 s6, -1 2051; GFX1064-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[4:5] 2052; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3 2053; GFX1064-NEXT: v_mov_b32_e32 v1, v4 2054; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc 2055; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2056; GFX1064-NEXT: s_endpgm 2057; 2058; GFX1032-LABEL: sub_i64_uniform: 2059; GFX1032: ; %bb.0: ; %entry 2060; GFX1032-NEXT: s_clause 0x1 2061; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2062; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2063; GFX1032-NEXT: s_mov_b32 s8, exec_lo 2064; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2065; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 2066; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2067; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 2068; GFX1032-NEXT: s_cbranch_execz .LBB10_2 2069; GFX1032-NEXT: ; %bb.1: 2070; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 2071; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 2072; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2073; GFX1032-NEXT: s_mul_i32 s8, s3, s1 2074; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 2075; GFX1032-NEXT: s_mul_i32 s1, s2, s1 2076; GFX1032-NEXT: s_add_i32 s9, s9, s8 2077; GFX1032-NEXT: v_mov_b32_e32 v0, s1 2078; GFX1032-NEXT: v_mov_b32_e32 v1, s9 2079; GFX1032-NEXT: s_mov_b32 s10, -1 2080; GFX1032-NEXT: s_mov_b32 s8, s6 2081; GFX1032-NEXT: s_mov_b32 s9, s7 2082; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2083; 
GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2084; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2085; GFX1032-NEXT: s_waitcnt vmcnt(0) 2086; GFX1032-NEXT: buffer_gl0_inv 2087; GFX1032-NEXT: buffer_gl1_inv 2088; GFX1032-NEXT: .LBB10_2: 2089; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2090; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 2091; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2092; GFX1032-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 2093; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 2094; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 2095; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 2096; GFX1032-NEXT: s_mov_b32 s6, -1 2097; GFX1032-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[4:5] 2098; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 2099; GFX1032-NEXT: v_mov_b32_e32 v1, v4 2100; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 2101; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2102; GFX1032-NEXT: s_endpgm 2103entry: 2104 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel 2105 store i64 %old, i64 addrspace(1)* %out 2106 ret void 2107} 2108 2109define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 2110; GFX7LESS-LABEL: sub_i64_varying: 2111; GFX7LESS: ; %bb.0: ; %entry 2112; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2113; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2114; GFX7LESS-NEXT: s_mov_b32 s6, -1 2115; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2116; GFX7LESS-NEXT: s_mov_b32 s10, s6 2117; GFX7LESS-NEXT: s_mov_b32 s11, s7 2118; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2119; GFX7LESS-NEXT: s_mov_b32 s8, s2 2120; GFX7LESS-NEXT: s_mov_b32 s9, s3 2121; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2122; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2123; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 2124; GFX7LESS-NEXT: buffer_wbinvl1 2125; GFX7LESS-NEXT: s_mov_b32 s4, s0 2126; GFX7LESS-NEXT: s_mov_b32 s5, s1 2127; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2128; GFX7LESS-NEXT: s_endpgm 
2129; 2130; GFX89-LABEL: sub_i64_varying: 2131; GFX89: ; %bb.0: ; %entry 2132; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2133; GFX89-NEXT: s_mov_b32 s7, 0xf000 2134; GFX89-NEXT: s_mov_b32 s6, -1 2135; GFX89-NEXT: s_mov_b32 s10, s6 2136; GFX89-NEXT: s_mov_b32 s11, s7 2137; GFX89-NEXT: s_waitcnt lgkmcnt(0) 2138; GFX89-NEXT: s_mov_b32 s8, s2 2139; GFX89-NEXT: s_mov_b32 s9, s3 2140; GFX89-NEXT: v_mov_b32_e32 v1, 0 2141; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2142; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2143; GFX89-NEXT: s_waitcnt vmcnt(0) 2144; GFX89-NEXT: buffer_wbinvl1_vol 2145; GFX89-NEXT: s_mov_b32 s4, s0 2146; GFX89-NEXT: s_mov_b32 s5, s1 2147; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2148; GFX89-NEXT: s_endpgm 2149; 2150; GFX10-LABEL: sub_i64_varying: 2151; GFX10: ; %bb.0: ; %entry 2152; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2153; GFX10-NEXT: v_mov_b32_e32 v1, 0 2154; GFX10-NEXT: s_mov_b32 s7, 0x31016000 2155; GFX10-NEXT: s_mov_b32 s6, -1 2156; GFX10-NEXT: s_mov_b32 s11, s7 2157; GFX10-NEXT: s_mov_b32 s10, s6 2158; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2159; GFX10-NEXT: s_mov_b32 s8, s2 2160; GFX10-NEXT: s_mov_b32 s9, s3 2161; GFX10-NEXT: s_mov_b32 s4, s0 2162; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2163; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2164; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2165; GFX10-NEXT: s_waitcnt vmcnt(0) 2166; GFX10-NEXT: buffer_gl0_inv 2167; GFX10-NEXT: buffer_gl1_inv 2168; GFX10-NEXT: s_mov_b32 s5, s1 2169; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2170; GFX10-NEXT: s_endpgm 2171entry: 2172 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2173 %zext = zext i32 %lane to i64 2174 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel 2175 store i64 %old, i64 addrspace(1)* %out 2176 ret void 2177} 2178