1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s 3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s 4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s 5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GCN64 %s 6; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GCN32 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9 10; Show what the atomic optimization pass will do for global pointers. 
11 12define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 13; GFX7LESS-LABEL: add_i32_constant: 14; GFX7LESS: ; %bb.0: ; %entry 15; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 16; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 17; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 18; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 19; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 20; GFX7LESS-NEXT: ; implicit-def: $vgpr1 21; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 22; GFX7LESS-NEXT: s_cbranch_execz BB0_2 23; GFX7LESS-NEXT: ; %bb.1: 24; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 25; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 26; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 27; GFX7LESS-NEXT: s_mov_b32 s10, -1 28; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 29; GFX7LESS-NEXT: s_mov_b32 s8, s2 30; GFX7LESS-NEXT: s_mov_b32 s9, s3 31; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 32; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 33; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 34; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 35; GFX7LESS-NEXT: buffer_wbinvl1 36; GFX7LESS-NEXT: BB0_2: 37; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 38; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 39; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 40; GFX7LESS-NEXT: s_mov_b32 s2, -1 41; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 42; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 43; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 44; GFX7LESS-NEXT: s_endpgm 45; 46; GFX89-LABEL: add_i32_constant: 47; GFX89: ; %bb.0: ; %entry 48; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 49; GFX89-NEXT: s_mov_b64 s[6:7], exec 50; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 51; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 52; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 53; GFX89-NEXT: ; implicit-def: $vgpr1 54; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc 55; GFX89-NEXT: s_cbranch_execz BB0_2 56; GFX89-NEXT: ; %bb.1: 57; GFX89-NEXT: s_waitcnt lgkmcnt(0) 58; GFX89-NEXT: s_mov_b32 s8, s2 59; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 
60; GFX89-NEXT: s_mul_i32 s2, s2, 5 61; GFX89-NEXT: s_mov_b32 s11, 0xf000 62; GFX89-NEXT: s_mov_b32 s10, -1 63; GFX89-NEXT: s_mov_b32 s9, s3 64; GFX89-NEXT: v_mov_b32_e32 v1, s2 65; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 66; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 67; GFX89-NEXT: s_waitcnt vmcnt(0) 68; GFX89-NEXT: buffer_wbinvl1_vol 69; GFX89-NEXT: BB0_2: 70; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] 71; GFX89-NEXT: v_readfirstlane_b32 s4, v1 72; GFX89-NEXT: s_waitcnt lgkmcnt(0) 73; GFX89-NEXT: s_mov_b32 s3, 0xf000 74; GFX89-NEXT: s_mov_b32 s2, -1 75; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 76; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 77; GFX89-NEXT: s_endpgm 78; 79; GCN64-LABEL: add_i32_constant: 80; GCN64: ; %bb.0: ; %entry 81; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 82; GCN64-NEXT: s_mov_b64 s[6:7], exec 83; GCN64-NEXT: ; implicit-def: $vgpr1 84; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 85; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 86; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 87; GCN64-NEXT: s_and_saveexec_b64 s[4:5], vcc 88; GCN64-NEXT: s_cbranch_execz BB0_2 89; GCN64-NEXT: ; %bb.1: 90; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 91; GCN64-NEXT: s_mov_b32 s11, 0x31016000 92; GCN64-NEXT: s_mul_i32 s6, s6, 5 93; GCN64-NEXT: s_mov_b32 s10, -1 94; GCN64-NEXT: v_mov_b32_e32 v1, s6 95; GCN64-NEXT: s_waitcnt lgkmcnt(0) 96; GCN64-NEXT: s_mov_b32 s8, s2 97; GCN64-NEXT: s_mov_b32 s9, s3 98; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 99; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 100; GCN64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 101; GCN64-NEXT: s_waitcnt vmcnt(0) 102; GCN64-NEXT: buffer_gl0_inv 103; GCN64-NEXT: buffer_gl1_inv 104; GCN64-NEXT: BB0_2: 105; GCN64-NEXT: s_waitcnt_depctr 0xffe3 106; GCN64-NEXT: s_or_b64 exec, exec, s[4:5] 107; GCN64-NEXT: s_waitcnt lgkmcnt(0) 108; GCN64-NEXT: v_readfirstlane_b32 s2, v1 109; GCN64-NEXT: s_mov_b32 s3, 0x31016000 110; GCN64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 111; GCN64-NEXT: s_mov_b32 s2, -1 
112; GCN64-NEXT: buffer_store_dword v0, off, s[0:3], 0 113; GCN64-NEXT: s_endpgm 114; 115; GCN32-LABEL: add_i32_constant: 116; GCN32: ; %bb.0: ; %entry 117; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 118; GCN32-NEXT: s_mov_b32 s5, exec_lo 119; GCN32-NEXT: ; implicit-def: $vgpr1 120; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 121; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 122; GCN32-NEXT: s_and_saveexec_b32 s4, vcc_lo 123; GCN32-NEXT: s_cbranch_execz BB0_2 124; GCN32-NEXT: ; %bb.1: 125; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5 126; GCN32-NEXT: s_mov_b32 s11, 0x31016000 127; GCN32-NEXT: s_mul_i32 s5, s5, 5 128; GCN32-NEXT: s_mov_b32 s10, -1 129; GCN32-NEXT: v_mov_b32_e32 v1, s5 130; GCN32-NEXT: s_waitcnt lgkmcnt(0) 131; GCN32-NEXT: s_mov_b32 s8, s2 132; GCN32-NEXT: s_mov_b32 s9, s3 133; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 134; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 135; GCN32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 136; GCN32-NEXT: s_waitcnt vmcnt(0) 137; GCN32-NEXT: buffer_gl0_inv 138; GCN32-NEXT: buffer_gl1_inv 139; GCN32-NEXT: BB0_2: 140; GCN32-NEXT: s_waitcnt_depctr 0xffe3 141; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s4 142; GCN32-NEXT: s_waitcnt lgkmcnt(0) 143; GCN32-NEXT: v_readfirstlane_b32 s2, v1 144; GCN32-NEXT: s_mov_b32 s3, 0x31016000 145; GCN32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 146; GCN32-NEXT: s_mov_b32 s2, -1 147; GCN32-NEXT: buffer_store_dword v0, off, s[0:3], 0 148; GCN32-NEXT: s_endpgm 149entry: 150 %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel 151 store i32 %old, i32 addrspace(1)* %out 152 ret void 153} 154 155define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) { 156; GFX7LESS-LABEL: add_i32_uniform: 157; GFX7LESS: ; %bb.0: ; %entry 158; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 159; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 160; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xd 161; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 162; GFX7LESS-NEXT: 
v_mbcnt_hi_u32_b32_e32 v0, s3, v0 163; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 164; GFX7LESS-NEXT: ; implicit-def: $vgpr1 165; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc 166; GFX7LESS-NEXT: s_cbranch_execz BB1_2 167; GFX7LESS-NEXT: ; %bb.1: 168; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 169; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 170; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 171; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 172; GFX7LESS-NEXT: s_mov_b32 s14, -1 173; GFX7LESS-NEXT: s_mov_b32 s12, s6 174; GFX7LESS-NEXT: s_mov_b32 s13, s7 175; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 176; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 177; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 178; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 179; GFX7LESS-NEXT: buffer_wbinvl1 180; GFX7LESS-NEXT: BB1_2: 181; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] 182; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 183; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 184; GFX7LESS-NEXT: s_mov_b32 s6, -1 185; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 186; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 187; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0 188; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 189; GFX7LESS-NEXT: s_endpgm 190; 191; GFX8-LABEL: add_i32_uniform: 192; GFX8: ; %bb.0: ; %entry 193; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 194; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 195; GFX8-NEXT: s_mov_b64 s[2:3], exec 196; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 197; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 198; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 199; GFX8-NEXT: ; implicit-def: $vgpr1 200; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc 201; GFX8-NEXT: s_cbranch_execz BB1_2 202; GFX8-NEXT: ; %bb.1: 203; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 204; GFX8-NEXT: s_waitcnt lgkmcnt(0) 205; GFX8-NEXT: s_mul_i32 s1, s0, s1 206; GFX8-NEXT: s_mov_b32 s15, 0xf000 207; GFX8-NEXT: s_mov_b32 s14, -1 208; GFX8-NEXT: s_mov_b32 s12, s6 209; GFX8-NEXT: s_mov_b32 s13, s7 210; GFX8-NEXT: v_mov_b32_e32 v1, s1 211; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 212; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 213; GFX8-NEXT: s_waitcnt vmcnt(0) 214; GFX8-NEXT: buffer_wbinvl1_vol 215; GFX8-NEXT: BB1_2: 216; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] 217; GFX8-NEXT: s_waitcnt lgkmcnt(0) 218; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 219; GFX8-NEXT: v_readfirstlane_b32 s0, v1 220; GFX8-NEXT: s_mov_b32 s7, 0xf000 221; GFX8-NEXT: s_mov_b32 s6, -1 222; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 223; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 224; GFX8-NEXT: s_endpgm 225; 226; GFX9-LABEL: add_i32_uniform: 227; GFX9: ; %bb.0: ; %entry 228; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 229; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 230; GFX9-NEXT: s_mov_b64 s[8:9], exec 231; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 232; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 233; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 234; GFX9-NEXT: ; implicit-def: $vgpr1 235; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 236; GFX9-NEXT: s_cbranch_execz BB1_2 237; GFX9-NEXT: ; %bb.1: 238; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[8:9] 239; GFX9-NEXT: s_waitcnt lgkmcnt(0) 240; GFX9-NEXT: s_mul_i32 s3, s2, s3 241; GFX9-NEXT: s_mov_b32 s15, 0xf000 242; GFX9-NEXT: s_mov_b32 s14, -1 243; GFX9-NEXT: s_mov_b32 s12, s6 244; GFX9-NEXT: s_mov_b32 s13, s7 245; GFX9-NEXT: v_mov_b32_e32 v1, s3 246; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 247; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 248; GFX9-NEXT: s_waitcnt vmcnt(0) 249; GFX9-NEXT: buffer_wbinvl1_vol 250; GFX9-NEXT: BB1_2: 251; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 252; GFX9-NEXT: s_waitcnt lgkmcnt(0) 253; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 254; GFX9-NEXT: v_readfirstlane_b32 s0, v1 255; GFX9-NEXT: s_mov_b32 s7, 0xf000 256; GFX9-NEXT: s_mov_b32 s6, -1 257; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 258; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 259; GFX9-NEXT: s_endpgm 260; 261; GCN64-LABEL: add_i32_uniform: 262; GCN64: ; %bb.0: ; %entry 263; GCN64-NEXT: s_clause 0x1 264; GCN64-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 265; GCN64-NEXT: s_load_dword s2, s[0:1], 0x34 266; GCN64-NEXT: s_mov_b64 s[8:9], exec 267; GCN64-NEXT: ; implicit-def: $vgpr1 268; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 269; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 270; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 271; GCN64-NEXT: s_and_saveexec_b64 s[0:1], vcc 272; GCN64-NEXT: s_cbranch_execz BB1_2 273; GCN64-NEXT: ; %bb.1: 274; GCN64-NEXT: s_bcnt1_i32_b64 s3, s[8:9] 275; GCN64-NEXT: s_mov_b32 s11, 0x31016000 276; GCN64-NEXT: s_waitcnt lgkmcnt(0) 277; GCN64-NEXT: s_mul_i32 s3, s2, s3 278; GCN64-NEXT: s_mov_b32 s10, -1 279; GCN64-NEXT: v_mov_b32_e32 v1, s3 280; GCN64-NEXT: s_mov_b32 s8, s6 281; GCN64-NEXT: s_mov_b32 s9, s7 282; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 283; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 284; GCN64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 285; GCN64-NEXT: s_waitcnt vmcnt(0) 286; GCN64-NEXT: buffer_gl0_inv 287; GCN64-NEXT: buffer_gl1_inv 288; GCN64-NEXT: BB1_2: 289; GCN64-NEXT: s_waitcnt_depctr 0xffe3 290; GCN64-NEXT: s_or_b64 exec, exec, s[0:1] 291; GCN64-NEXT: s_waitcnt lgkmcnt(0) 292; GCN64-NEXT: v_mul_lo_u32 v0, s2, v0 293; GCN64-NEXT: v_readfirstlane_b32 s0, v1 294; GCN64-NEXT: s_mov_b32 s7, 0x31016000 295; GCN64-NEXT: s_mov_b32 s6, -1 296; GCN64-NEXT: v_add_nc_u32_e32 v0, s0, v0 297; GCN64-NEXT: buffer_store_dword v0, off, s[4:7], 0 298; GCN64-NEXT: s_endpgm 299; 300; GCN32-LABEL: add_i32_uniform: 301; GCN32: ; %bb.0: ; %entry 302; GCN32-NEXT: s_clause 0x1 303; GCN32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 304; GCN32-NEXT: s_load_dword s2, s[0:1], 0x34 305; GCN32-NEXT: s_mov_b32 s3, exec_lo 306; GCN32-NEXT: ; implicit-def: $vgpr1 307; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 308; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 309; GCN32-NEXT: s_and_saveexec_b32 s0, vcc_lo 310; GCN32-NEXT: s_cbranch_execz BB1_2 311; GCN32-NEXT: ; %bb.1: 312; GCN32-NEXT: s_bcnt1_i32_b32 s1, s3 313; GCN32-NEXT: s_mov_b32 s11, 0x31016000 314; GCN32-NEXT: s_waitcnt lgkmcnt(0) 
315; GCN32-NEXT: s_mul_i32 s1, s2, s1 316; GCN32-NEXT: s_mov_b32 s10, -1 317; GCN32-NEXT: v_mov_b32_e32 v1, s1 318; GCN32-NEXT: s_mov_b32 s8, s6 319; GCN32-NEXT: s_mov_b32 s9, s7 320; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 321; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 322; GCN32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 323; GCN32-NEXT: s_waitcnt vmcnt(0) 324; GCN32-NEXT: buffer_gl0_inv 325; GCN32-NEXT: buffer_gl1_inv 326; GCN32-NEXT: BB1_2: 327; GCN32-NEXT: s_waitcnt_depctr 0xffe3 328; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s0 329; GCN32-NEXT: s_waitcnt lgkmcnt(0) 330; GCN32-NEXT: v_mul_lo_u32 v0, s2, v0 331; GCN32-NEXT: v_readfirstlane_b32 s0, v1 332; GCN32-NEXT: s_mov_b32 s7, 0x31016000 333; GCN32-NEXT: s_mov_b32 s6, -1 334; GCN32-NEXT: v_add_nc_u32_e32 v0, s0, v0 335; GCN32-NEXT: buffer_store_dword v0, off, s[4:7], 0 336; GCN32-NEXT: s_endpgm 337entry: 338 %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel 339 store i32 %old, i32 addrspace(1)* %out 340 ret void 341} 342 343define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 344; GFX7LESS-LABEL: add_i32_varying: 345; GFX7LESS: ; %bb.0: ; %entry 346; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 347; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 348; GFX7LESS-NEXT: s_mov_b32 s6, -1 349; GFX7LESS-NEXT: s_mov_b32 s10, s6 350; GFX7LESS-NEXT: s_mov_b32 s11, s7 351; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 352; GFX7LESS-NEXT: s_mov_b32 s8, s2 353; GFX7LESS-NEXT: s_mov_b32 s9, s3 354; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 355; GFX7LESS-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 356; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 357; GFX7LESS-NEXT: buffer_wbinvl1 358; GFX7LESS-NEXT: s_mov_b32 s4, s0 359; GFX7LESS-NEXT: s_mov_b32 s5, s1 360; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 361; GFX7LESS-NEXT: s_endpgm 362; 363; GFX8-LABEL: add_i32_varying: 364; GFX8: ; %bb.0: ; %entry 365; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 366; GFX8-NEXT: 
v_mov_b32_e32 v2, v0 367; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 368; GFX8-NEXT: v_mov_b32_e32 v1, 0 369; GFX8-NEXT: s_mov_b64 exec, s[4:5] 370; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 371; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 372; GFX8-NEXT: s_not_b64 exec, exec 373; GFX8-NEXT: v_mov_b32_e32 v2, 0 374; GFX8-NEXT: s_not_b64 exec, exec 375; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 376; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 377; GFX8-NEXT: s_nop 1 378; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 379; GFX8-NEXT: s_nop 1 380; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 381; GFX8-NEXT: s_nop 1 382; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 383; GFX8-NEXT: s_nop 1 384; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 385; GFX8-NEXT: s_nop 1 386; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 387; GFX8-NEXT: v_readlane_b32 s6, v2, 63 388; GFX8-NEXT: s_nop 0 389; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 390; GFX8-NEXT: s_mov_b64 exec, s[4:5] 391; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 392; GFX8-NEXT: ; implicit-def: $vgpr0 393; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 394; GFX8-NEXT: s_cbranch_execz BB2_2 395; GFX8-NEXT: ; %bb.1: 396; GFX8-NEXT: s_mov_b32 s11, 0xf000 397; GFX8-NEXT: s_mov_b32 s10, -1 398; GFX8-NEXT: s_waitcnt lgkmcnt(0) 399; GFX8-NEXT: s_mov_b32 s8, s2 400; GFX8-NEXT: s_mov_b32 s9, s3 401; GFX8-NEXT: v_mov_b32_e32 v0, s6 402; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 403; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 404; GFX8-NEXT: s_waitcnt vmcnt(0) 405; GFX8-NEXT: buffer_wbinvl1_vol 406; GFX8-NEXT: BB2_2: 407; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 408; GFX8-NEXT: v_readfirstlane_b32 s4, v0 409; GFX8-NEXT: v_mov_b32_e32 v0, v1 410; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 411; GFX8-NEXT: s_mov_b32 s3, 0xf000 412; GFX8-NEXT: s_mov_b32 s2, -1 413; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 414; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 415; GFX8-NEXT: s_endpgm 416; 417; GFX9-LABEL: add_i32_varying: 418; GFX9: ; %bb.0: ; %entry 419; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 420; GFX9-NEXT: v_mov_b32_e32 v2, v0 421; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 422; GFX9-NEXT: v_mov_b32_e32 v1, 0 423; GFX9-NEXT: s_mov_b64 exec, s[4:5] 424; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 425; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 426; GFX9-NEXT: s_not_b64 exec, exec 427; GFX9-NEXT: v_mov_b32_e32 v2, 0 428; GFX9-NEXT: s_not_b64 exec, exec 429; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 430; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 431; GFX9-NEXT: s_nop 1 432; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 433; GFX9-NEXT: s_nop 1 434; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 435; GFX9-NEXT: s_nop 1 436; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 437; GFX9-NEXT: s_nop 1 438; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 439; GFX9-NEXT: s_nop 1 440; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 441; GFX9-NEXT: v_readlane_b32 s6, v2, 63 442; GFX9-NEXT: s_nop 0 443; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 444; GFX9-NEXT: s_mov_b64 exec, s[4:5] 445; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 446; GFX9-NEXT: ; implicit-def: $vgpr0 447; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 448; GFX9-NEXT: s_cbranch_execz BB2_2 449; GFX9-NEXT: ; %bb.1: 450; GFX9-NEXT: s_mov_b32 s11, 0xf000 451; GFX9-NEXT: s_mov_b32 s10, -1 452; GFX9-NEXT: s_waitcnt lgkmcnt(0) 453; GFX9-NEXT: s_mov_b32 s8, s2 454; GFX9-NEXT: s_mov_b32 s9, s3 455; GFX9-NEXT: v_mov_b32_e32 v0, s6 456; GFX9-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 457; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 458; GFX9-NEXT: s_waitcnt vmcnt(0) 459; GFX9-NEXT: buffer_wbinvl1_vol 460; GFX9-NEXT: BB2_2: 461; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 462; GFX9-NEXT: v_readfirstlane_b32 s4, v0 463; GFX9-NEXT: v_mov_b32_e32 v0, v1 464; GFX9-NEXT: s_waitcnt lgkmcnt(0) 465; GFX9-NEXT: s_mov_b32 s3, 0xf000 466; GFX9-NEXT: s_mov_b32 s2, -1 467; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 468; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 469; GFX9-NEXT: s_endpgm 470; 471; GCN64-LABEL: add_i32_varying: 472; GCN64: ; %bb.0: ; %entry 473; GCN64-NEXT: v_mov_b32_e32 v1, v0 474; GCN64-NEXT: s_not_b64 exec, exec 475; GCN64-NEXT: v_mov_b32_e32 v1, 0 476; GCN64-NEXT: s_not_b64 exec, exec 477; GCN64-NEXT: s_or_saveexec_b64 s[2:3], -1 478; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 479; GCN64-NEXT: v_mov_b32_e32 v3, 0 480; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 481; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 482; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 483; GCN64-NEXT: v_mov_b32_e32 v2, v1 484; GCN64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 485; GCN64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 486; GCN64-NEXT: v_readlane_b32 s4, v1, 31 487; GCN64-NEXT: v_mov_b32_e32 v2, s4 488; GCN64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 489; GCN64-NEXT: v_readlane_b32 s6, v1, 15 490; GCN64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 491; GCN64-NEXT: s_mov_b64 exec, s[2:3] 492; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 493; GCN64-NEXT: s_or_saveexec_b64 s[4:5], -1 494; GCN64-NEXT: v_readlane_b32 s7, v1, 31 495; GCN64-NEXT: v_writelane_b32 v3, s6, 16 496; GCN64-NEXT: s_mov_b64 exec, s[4:5] 497; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, 
exec_lo, 0 498; GCN64-NEXT: s_or_saveexec_b64 s[4:5], -1 499; GCN64-NEXT: v_readlane_b32 s8, v1, 47 500; GCN64-NEXT: v_readlane_b32 s9, v1, 63 501; GCN64-NEXT: v_writelane_b32 v3, s7, 32 502; GCN64-NEXT: s_mov_b64 exec, s[4:5] 503; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 504; GCN64-NEXT: s_or_saveexec_b64 s[6:7], -1 505; GCN64-NEXT: s_mov_b32 s4, s9 506; GCN64-NEXT: v_writelane_b32 v3, s8, 48 507; GCN64-NEXT: s_mov_b64 exec, s[6:7] 508; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 509; GCN64-NEXT: s_mov_b32 s6, -1 510; GCN64-NEXT: ; implicit-def: $vgpr0 511; GCN64-NEXT: s_and_saveexec_b64 s[8:9], vcc 512; GCN64-NEXT: s_cbranch_execz BB2_2 513; GCN64-NEXT: ; %bb.1: 514; GCN64-NEXT: v_mov_b32_e32 v0, s4 515; GCN64-NEXT: s_mov_b32 s7, 0x31016000 516; GCN64-NEXT: s_waitcnt lgkmcnt(0) 517; GCN64-NEXT: s_mov_b32 s4, s2 518; GCN64-NEXT: s_mov_b32 s5, s3 519; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 520; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 521; GCN64-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 522; GCN64-NEXT: s_waitcnt vmcnt(0) 523; GCN64-NEXT: buffer_gl0_inv 524; GCN64-NEXT: buffer_gl1_inv 525; GCN64-NEXT: BB2_2: 526; GCN64-NEXT: s_waitcnt_depctr 0xffe3 527; GCN64-NEXT: s_or_b64 exec, exec, s[8:9] 528; GCN64-NEXT: s_waitcnt lgkmcnt(0) 529; GCN64-NEXT: v_readfirstlane_b32 s2, v0 530; GCN64-NEXT: v_mov_b32_e32 v0, v3 531; GCN64-NEXT: s_mov_b32 s3, 0x31016000 532; GCN64-NEXT: v_add_nc_u32_e32 v0, s2, v0 533; GCN64-NEXT: s_mov_b32 s2, s6 534; GCN64-NEXT: buffer_store_dword v0, off, s[0:3], 0 535; GCN64-NEXT: s_endpgm 536; 537; GCN32-LABEL: add_i32_varying: 538; GCN32: ; %bb.0: ; %entry 539; GCN32-NEXT: v_mov_b32_e32 v1, v0 540; GCN32-NEXT: s_not_b32 exec_lo, exec_lo 541; GCN32-NEXT: v_mov_b32_e32 v1, 0 542; GCN32-NEXT: s_not_b32 exec_lo, exec_lo 543; GCN32-NEXT: s_or_saveexec_b32 s2, -1 544; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 545; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 546; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 547; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 548; GCN32-NEXT: v_mov_b32_e32 v2, v1 549; GCN32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 550; GCN32-NEXT: s_mov_b32 exec_lo, s2 551; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 552; GCN32-NEXT: s_or_saveexec_b32 s4, -1 553; GCN32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 554; GCN32-NEXT: v_mov_b32_e32 v3, 0 555; GCN32-NEXT: v_readlane_b32 s5, v1, 15 556; GCN32-NEXT: v_readlane_b32 s6, v1, 31 557; GCN32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 558; GCN32-NEXT: s_mov_b32 exec_lo, s4 559; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 560; GCN32-NEXT: s_or_saveexec_b32 s4, -1 561; GCN32-NEXT: v_writelane_b32 v3, s5, 16 562; GCN32-NEXT: s_mov_b32 exec_lo, s4 563; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 564; GCN32-NEXT: s_mov_b32 s4, s6 565; GCN32-NEXT: s_mov_b32 s6, -1 566; GCN32-NEXT: ; implicit-def: $vgpr0 567; GCN32-NEXT: s_and_saveexec_b32 s8, vcc_lo 568; GCN32-NEXT: s_cbranch_execz BB2_2 569; GCN32-NEXT: ; %bb.1: 570; GCN32-NEXT: v_mov_b32_e32 v0, s4 571; GCN32-NEXT: s_mov_b32 s7, 0x31016000 572; GCN32-NEXT: s_waitcnt lgkmcnt(0) 573; GCN32-NEXT: s_mov_b32 s4, s2 574; GCN32-NEXT: s_mov_b32 s5, s3 575; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 576; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 577; GCN32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 578; GCN32-NEXT: s_waitcnt vmcnt(0) 579; GCN32-NEXT: buffer_gl0_inv 580; GCN32-NEXT: buffer_gl1_inv 581; GCN32-NEXT: BB2_2: 582; GCN32-NEXT: s_waitcnt_depctr 0xffe3 583; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s8 584; GCN32-NEXT: s_waitcnt lgkmcnt(0) 585; GCN32-NEXT: v_readfirstlane_b32 s2, v0 586; GCN32-NEXT: v_mov_b32_e32 v0, v3 587; GCN32-NEXT: s_mov_b32 s3, 0x31016000 588; GCN32-NEXT: v_add_nc_u32_e32 v0, s2, v0 589; GCN32-NEXT: s_mov_b32 s2, s6 590; 
GCN32-NEXT: buffer_store_dword v0, off, s[0:3], 0 591; GCN32-NEXT: s_endpgm 592entry: 593 %lane = call i32 @llvm.amdgcn.workitem.id.x() 594 %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel 595 store i32 %old, i32 addrspace(1)* %out 596 ret void 597} 598 599define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 600; GFX7LESS-LABEL: add_i64_constant: 601; GFX7LESS: ; %bb.0: ; %entry 602; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 603; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 604; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 605; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 606; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 607; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 608; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 609; GFX7LESS-NEXT: s_cbranch_execz BB3_2 610; GFX7LESS-NEXT: ; %bb.1: 611; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 612; GFX7LESS-NEXT: s_mov_b32 s10, -1 613; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 614; GFX7LESS-NEXT: s_mov_b32 s8, s2 615; GFX7LESS-NEXT: s_mov_b32 s9, s3 616; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 617; GFX7LESS-NEXT: s_mul_i32 s3, s2, 5 618; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 619; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 620; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 621; GFX7LESS-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc 622; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 623; GFX7LESS-NEXT: buffer_wbinvl1 624; GFX7LESS-NEXT: BB3_2: 625; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 626; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 627; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 628; GFX7LESS-NEXT: s_mov_b32 s2, -1 629; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 630; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v2 631; GFX7LESS-NEXT: s_waitcnt expcnt(0) 632; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 633; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 634; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 635; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 636; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, 
v1, vcc 637; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 638; GFX7LESS-NEXT: s_endpgm 639; 640; GFX89-LABEL: add_i64_constant: 641; GFX89: ; %bb.0: ; %entry 642; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 643; GFX89-NEXT: s_mov_b64 s[6:7], exec 644; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 645; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 646; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 647; GFX89-NEXT: ; implicit-def: $vgpr1_vgpr2 648; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc 649; GFX89-NEXT: s_cbranch_execz BB3_2 650; GFX89-NEXT: ; %bb.1: 651; GFX89-NEXT: s_waitcnt lgkmcnt(0) 652; GFX89-NEXT: s_mov_b32 s8, s2 653; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 654; GFX89-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 655; GFX89-NEXT: s_mul_i32 s2, s2, 5 656; GFX89-NEXT: s_mov_b32 s11, 0xf000 657; GFX89-NEXT: s_mov_b32 s10, -1 658; GFX89-NEXT: s_mov_b32 s9, s3 659; GFX89-NEXT: v_mov_b32_e32 v1, s2 660; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 661; GFX89-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc 662; GFX89-NEXT: s_waitcnt vmcnt(0) 663; GFX89-NEXT: buffer_wbinvl1_vol 664; GFX89-NEXT: BB3_2: 665; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] 666; GFX89-NEXT: s_waitcnt lgkmcnt(0) 667; GFX89-NEXT: v_readfirstlane_b32 s2, v1 668; GFX89-NEXT: v_readfirstlane_b32 s3, v2 669; GFX89-NEXT: v_mov_b32_e32 v1, s2 670; GFX89-NEXT: v_mov_b32_e32 v2, s3 671; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 672; GFX89-NEXT: s_mov_b32 s3, 0xf000 673; GFX89-NEXT: s_mov_b32 s2, -1 674; GFX89-NEXT: s_nop 2 675; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 676; GFX89-NEXT: s_endpgm 677; 678; GCN64-LABEL: add_i64_constant: 679; GCN64: ; %bb.0: ; %entry 680; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 681; GCN64-NEXT: s_mov_b64 s[6:7], exec 682; GCN64-NEXT: ; implicit-def: $vgpr1_vgpr2 683; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 684; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 685; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 686; GCN64-NEXT: 
s_and_saveexec_b64 s[4:5], vcc 687; GCN64-NEXT: s_cbranch_execz BB3_2 688; GCN64-NEXT: ; %bb.1: 689; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 690; GCN64-NEXT: s_mov_b32 s11, 0x31016000 691; GCN64-NEXT: s_mul_i32 s7, s6, 5 692; GCN64-NEXT: v_mul_hi_u32_u24_e64 v2, s6, 5 693; GCN64-NEXT: v_mov_b32_e32 v1, s7 694; GCN64-NEXT: s_mov_b32 s10, -1 695; GCN64-NEXT: s_waitcnt lgkmcnt(0) 696; GCN64-NEXT: s_mov_b32 s8, s2 697; GCN64-NEXT: s_mov_b32 s9, s3 698; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 699; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 700; GCN64-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc 701; GCN64-NEXT: s_waitcnt vmcnt(0) 702; GCN64-NEXT: buffer_gl0_inv 703; GCN64-NEXT: buffer_gl1_inv 704; GCN64-NEXT: BB3_2: 705; GCN64-NEXT: s_waitcnt_depctr 0xffe3 706; GCN64-NEXT: s_or_b64 exec, exec, s[4:5] 707; GCN64-NEXT: s_waitcnt lgkmcnt(0) 708; GCN64-NEXT: v_readfirstlane_b32 s2, v1 709; GCN64-NEXT: v_readfirstlane_b32 s3, v2 710; GCN64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] 711; GCN64-NEXT: s_mov_b32 s3, 0x31016000 712; GCN64-NEXT: s_mov_b32 s2, -1 713; GCN64-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 714; GCN64-NEXT: s_endpgm 715; 716; GCN32-LABEL: add_i64_constant: 717; GCN32: ; %bb.0: ; %entry 718; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 719; GCN32-NEXT: s_mov_b32 s5, exec_lo 720; GCN32-NEXT: ; implicit-def: $vgpr1_vgpr2 721; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 722; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 723; GCN32-NEXT: s_and_saveexec_b32 s4, vcc_lo 724; GCN32-NEXT: s_cbranch_execz BB3_2 725; GCN32-NEXT: ; %bb.1: 726; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5 727; GCN32-NEXT: s_mov_b32 s11, 0x31016000 728; GCN32-NEXT: s_mul_i32 s6, s5, 5 729; GCN32-NEXT: v_mul_hi_u32_u24_e64 v2, s5, 5 730; GCN32-NEXT: v_mov_b32_e32 v1, s6 731; GCN32-NEXT: s_mov_b32 s10, -1 732; GCN32-NEXT: s_waitcnt lgkmcnt(0) 733; GCN32-NEXT: s_mov_b32 s8, s2 734; GCN32-NEXT: s_mov_b32 s9, s3 735; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 736; GCN32-NEXT: 
s_waitcnt_vscnt null, 0x0 737; GCN32-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc 738; GCN32-NEXT: s_waitcnt vmcnt(0) 739; GCN32-NEXT: buffer_gl0_inv 740; GCN32-NEXT: buffer_gl1_inv 741; GCN32-NEXT: BB3_2: 742; GCN32-NEXT: s_waitcnt_depctr 0xffe3 743; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s4 744; GCN32-NEXT: s_waitcnt lgkmcnt(0) 745; GCN32-NEXT: v_readfirstlane_b32 s2, v1 746; GCN32-NEXT: v_readfirstlane_b32 s3, v2 747; GCN32-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] 748; GCN32-NEXT: s_mov_b32 s3, 0x31016000 749; GCN32-NEXT: s_mov_b32 s2, -1 750; GCN32-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 751; GCN32-NEXT: s_endpgm 752entry: 753 %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel 754 store i64 %old, i64 addrspace(1)* %out 755 ret void 756} 757 758define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) { 759; GFX7LESS-LABEL: add_i64_uniform: 760; GFX7LESS: ; %bb.0: ; %entry 761; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec 762; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 763; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 764; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 765; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 766; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 767; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 768; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 769; GFX7LESS-NEXT: s_cbranch_execz BB4_2 770; GFX7LESS-NEXT: ; %bb.1: 771; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 772; GFX7LESS-NEXT: s_mov_b32 s14, -1 773; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 774; GFX7LESS-NEXT: s_mov_b32 s12, s6 775; GFX7LESS-NEXT: s_mov_b32 s13, s7 776; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 777; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 778; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 779; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v1 780; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 781; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 782; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 783; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 784; GFX7LESS-NEXT: buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc 785; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 786; GFX7LESS-NEXT: buffer_wbinvl1 787; GFX7LESS-NEXT: BB4_2: 788; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 789; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 790; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 791; GFX7LESS-NEXT: s_mov_b32 s6, -1 792; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 793; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v2 794; GFX7LESS-NEXT: s_waitcnt expcnt(0) 795; GFX7LESS-NEXT: v_mul_lo_u32 v1, s1, v0 796; GFX7LESS-NEXT: v_mul_hi_u32 v2, s0, v0 797; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 798; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 799; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 800; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 801; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 802; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 803; GFX7LESS-NEXT: s_endpgm 804; 805; GFX8-LABEL: add_i64_uniform: 806; GFX8: ; %bb.0: ; %entry 807; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 808; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 809; GFX8-NEXT: s_mov_b64 s[8:9], exec 810; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 811; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 812; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 813; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 814; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 815; GFX8-NEXT: s_cbranch_execz BB4_2 816; GFX8-NEXT: ; %bb.1: 817; GFX8-NEXT: s_waitcnt lgkmcnt(0) 818; GFX8-NEXT: s_mov_b32 s12, s6 819; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 820; GFX8-NEXT: v_mov_b32_e32 v1, s6 821; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1 822; GFX8-NEXT: s_mov_b32 s13, s7 823; GFX8-NEXT: s_mul_i32 s7, s1, s6 824; GFX8-NEXT: s_mul_i32 s6, s0, s6 825; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 826; GFX8-NEXT: s_mov_b32 s15, 0xf000 827; GFX8-NEXT: s_mov_b32 s14, -1 828; GFX8-NEXT: v_mov_b32_e32 v1, s6 829; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 830; GFX8-NEXT: buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc 831; GFX8-NEXT: s_waitcnt 
vmcnt(0) 832; GFX8-NEXT: buffer_wbinvl1_vol 833; GFX8-NEXT: BB4_2: 834; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 835; GFX8-NEXT: v_readfirstlane_b32 s2, v1 836; GFX8-NEXT: s_waitcnt lgkmcnt(0) 837; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 838; GFX8-NEXT: v_mul_hi_u32 v3, s0, v0 839; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 840; GFX8-NEXT: v_readfirstlane_b32 s1, v2 841; GFX8-NEXT: v_mov_b32_e32 v2, s1 842; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 843; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 844; GFX8-NEXT: s_mov_b32 s7, 0xf000 845; GFX8-NEXT: s_mov_b32 s6, -1 846; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 847; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 848; GFX8-NEXT: s_endpgm 849; 850; GFX9-LABEL: add_i64_uniform: 851; GFX9: ; %bb.0: ; %entry 852; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 853; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 854; GFX9-NEXT: s_mov_b64 s[8:9], exec 855; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 856; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 857; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 858; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 859; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 860; GFX9-NEXT: s_cbranch_execz BB4_2 861; GFX9-NEXT: ; %bb.1: 862; GFX9-NEXT: s_waitcnt lgkmcnt(0) 863; GFX9-NEXT: s_mov_b32 s12, s6 864; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 865; GFX9-NEXT: s_mov_b32 s13, s7 866; GFX9-NEXT: s_mul_i32 s7, s3, s6 867; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 868; GFX9-NEXT: s_add_i32 s8, s8, s7 869; GFX9-NEXT: s_mul_i32 s6, s2, s6 870; GFX9-NEXT: s_mov_b32 s15, 0xf000 871; GFX9-NEXT: s_mov_b32 s14, -1 872; GFX9-NEXT: v_mov_b32_e32 v1, s6 873; GFX9-NEXT: v_mov_b32_e32 v2, s8 874; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 875; GFX9-NEXT: buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc 876; GFX9-NEXT: s_waitcnt vmcnt(0) 877; GFX9-NEXT: buffer_wbinvl1_vol 878; GFX9-NEXT: BB4_2: 879; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 880; GFX9-NEXT: s_waitcnt lgkmcnt(0) 881; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 882; GFX9-NEXT: v_mul_hi_u32 v4, s2, 
v0 883; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 884; GFX9-NEXT: v_readfirstlane_b32 s0, v1 885; GFX9-NEXT: v_readfirstlane_b32 s1, v2 886; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 887; GFX9-NEXT: v_mov_b32_e32 v2, s1 888; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 889; GFX9-NEXT: s_mov_b32 s7, 0xf000 890; GFX9-NEXT: s_mov_b32 s6, -1 891; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 892; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 893; GFX9-NEXT: s_endpgm 894; 895; GCN64-LABEL: add_i64_uniform: 896; GCN64: ; %bb.0: ; %entry 897; GCN64-NEXT: s_clause 0x1 898; GCN64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 899; GCN64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 900; GCN64-NEXT: s_mov_b64 s[8:9], exec 901; GCN64-NEXT: ; implicit-def: $vgpr1_vgpr2 902; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 903; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 904; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 905; GCN64-NEXT: s_and_saveexec_b64 s[0:1], vcc 906; GCN64-NEXT: s_cbranch_execz BB4_2 907; GCN64-NEXT: ; %bb.1: 908; GCN64-NEXT: s_bcnt1_i32_b64 s8, s[8:9] 909; GCN64-NEXT: s_mov_b32 s11, 0x31016000 910; GCN64-NEXT: s_waitcnt lgkmcnt(0) 911; GCN64-NEXT: s_mul_i32 s9, s3, s8 912; GCN64-NEXT: s_mul_hi_u32 s10, s2, s8 913; GCN64-NEXT: s_mul_i32 s8, s2, s8 914; GCN64-NEXT: s_add_i32 s10, s10, s9 915; GCN64-NEXT: v_mov_b32_e32 v1, s8 916; GCN64-NEXT: v_mov_b32_e32 v2, s10 917; GCN64-NEXT: s_mov_b32 s10, -1 918; GCN64-NEXT: s_mov_b32 s8, s6 919; GCN64-NEXT: s_mov_b32 s9, s7 920; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 921; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 922; GCN64-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc 923; GCN64-NEXT: s_waitcnt vmcnt(0) 924; GCN64-NEXT: buffer_gl0_inv 925; GCN64-NEXT: buffer_gl1_inv 926; GCN64-NEXT: BB4_2: 927; GCN64-NEXT: s_waitcnt_depctr 0xffe3 928; GCN64-NEXT: s_or_b64 exec, exec, s[0:1] 929; GCN64-NEXT: s_waitcnt lgkmcnt(0) 930; GCN64-NEXT: v_mul_lo_u32 v3, s3, v0 931; GCN64-NEXT: v_mul_hi_u32 v4, s2, v0 932; GCN64-NEXT: v_mul_lo_u32 v0, 
s2, v0 933; GCN64-NEXT: v_readfirstlane_b32 s0, v1 934; GCN64-NEXT: v_readfirstlane_b32 s1, v2 935; GCN64-NEXT: s_mov_b32 s7, 0x31016000 936; GCN64-NEXT: s_mov_b32 s6, -1 937; GCN64-NEXT: v_add_nc_u32_e32 v1, v4, v3 938; GCN64-NEXT: v_add_co_u32_e64 v0, vcc, s0, v0 939; GCN64-NEXT: v_add_co_ci_u32_e32 v1, vcc, s1, v1, vcc 940; GCN64-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 941; GCN64-NEXT: s_endpgm 942; 943; GCN32-LABEL: add_i64_uniform: 944; GCN32: ; %bb.0: ; %entry 945; GCN32-NEXT: s_clause 0x1 946; GCN32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 947; GCN32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 948; GCN32-NEXT: s_mov_b32 s8, exec_lo 949; GCN32-NEXT: ; implicit-def: $vgpr1_vgpr2 950; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 951; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 952; GCN32-NEXT: s_and_saveexec_b32 s0, vcc_lo 953; GCN32-NEXT: s_cbranch_execz BB4_2 954; GCN32-NEXT: ; %bb.1: 955; GCN32-NEXT: s_bcnt1_i32_b32 s1, s8 956; GCN32-NEXT: s_mov_b32 s11, 0x31016000 957; GCN32-NEXT: s_waitcnt lgkmcnt(0) 958; GCN32-NEXT: s_mul_i32 s8, s3, s1 959; GCN32-NEXT: s_mul_hi_u32 s9, s2, s1 960; GCN32-NEXT: s_mul_i32 s1, s2, s1 961; GCN32-NEXT: s_add_i32 s9, s9, s8 962; GCN32-NEXT: v_mov_b32_e32 v1, s1 963; GCN32-NEXT: v_mov_b32_e32 v2, s9 964; GCN32-NEXT: s_mov_b32 s10, -1 965; GCN32-NEXT: s_mov_b32 s8, s6 966; GCN32-NEXT: s_mov_b32 s9, s7 967; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 968; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 969; GCN32-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc 970; GCN32-NEXT: s_waitcnt vmcnt(0) 971; GCN32-NEXT: buffer_gl0_inv 972; GCN32-NEXT: buffer_gl1_inv 973; GCN32-NEXT: BB4_2: 974; GCN32-NEXT: s_waitcnt_depctr 0xffe3 975; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s0 976; GCN32-NEXT: s_waitcnt lgkmcnt(0) 977; GCN32-NEXT: v_mul_lo_u32 v3, s3, v0 978; GCN32-NEXT: v_mul_hi_u32 v4, s2, v0 979; GCN32-NEXT: v_mul_lo_u32 v0, s2, v0 980; GCN32-NEXT: v_readfirstlane_b32 s0, v1 981; GCN32-NEXT: v_readfirstlane_b32 s1, v2 982; 
GCN32-NEXT: s_mov_b32 s7, 0x31016000 983; GCN32-NEXT: s_mov_b32 s6, -1 984; GCN32-NEXT: v_add_nc_u32_e32 v1, v4, v3 985; GCN32-NEXT: v_add_co_u32_e64 v0, vcc_lo, s0, v0 986; GCN32-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 987; GCN32-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 988; GCN32-NEXT: s_endpgm 989entry: 990 %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel 991 store i64 %old, i64 addrspace(1)* %out 992 ret void 993} 994 995define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 996; GFX7LESS-LABEL: add_i64_varying: 997; GFX7LESS: ; %bb.0: ; %entry 998; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 999; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1000; GFX7LESS-NEXT: s_mov_b32 s6, -1 1001; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1002; GFX7LESS-NEXT: s_mov_b32 s10, s6 1003; GFX7LESS-NEXT: s_mov_b32 s11, s7 1004; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1005; GFX7LESS-NEXT: s_mov_b32 s8, s2 1006; GFX7LESS-NEXT: s_mov_b32 s9, s3 1007; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1008; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1009; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1010; GFX7LESS-NEXT: buffer_wbinvl1 1011; GFX7LESS-NEXT: s_mov_b32 s4, s0 1012; GFX7LESS-NEXT: s_mov_b32 s5, s1 1013; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1014; GFX7LESS-NEXT: s_endpgm 1015; 1016; GFX89-LABEL: add_i64_varying: 1017; GFX89: ; %bb.0: ; %entry 1018; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1019; GFX89-NEXT: s_mov_b32 s3, 0xf000 1020; GFX89-NEXT: s_mov_b32 s2, -1 1021; GFX89-NEXT: v_mov_b32_e32 v1, 0 1022; GFX89-NEXT: s_waitcnt lgkmcnt(0) 1023; GFX89-NEXT: s_mov_b32 s0, s4 1024; GFX89-NEXT: s_mov_b32 s1, s5 1025; GFX89-NEXT: s_mov_b32 s4, s6 1026; GFX89-NEXT: s_mov_b32 s5, s7 1027; GFX89-NEXT: s_mov_b32 s6, s2 1028; GFX89-NEXT: s_mov_b32 s7, s3 1029; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1030; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 glc 1031; GFX89-NEXT: 
s_waitcnt vmcnt(0) 1032; GFX89-NEXT: buffer_wbinvl1_vol 1033; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1034; GFX89-NEXT: s_endpgm 1035; 1036; GFX10-LABEL: add_i64_varying: 1037; GFX10: ; %bb.0: ; %entry 1038; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1039; GFX10-NEXT: v_mov_b32_e32 v1, 0 1040; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1041; GFX10-NEXT: s_mov_b32 s6, -1 1042; GFX10-NEXT: s_mov_b32 s11, s7 1043; GFX10-NEXT: s_mov_b32 s10, s6 1044; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1045; GFX10-NEXT: s_mov_b32 s8, s2 1046; GFX10-NEXT: s_mov_b32 s9, s3 1047; GFX10-NEXT: s_mov_b32 s4, s0 1048; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1049; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1050; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1051; GFX10-NEXT: s_waitcnt vmcnt(0) 1052; GFX10-NEXT: buffer_gl0_inv 1053; GFX10-NEXT: buffer_gl1_inv 1054; GFX10-NEXT: s_mov_b32 s5, s1 1055; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1056; GFX10-NEXT: s_endpgm 1057entry: 1058 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1059 %zext = zext i32 %lane to i64 1060 %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel 1061 store i64 %old, i64 addrspace(1)* %out 1062 ret void 1063} 1064 1065define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 1066; GFX7LESS-LABEL: sub_i32_constant: 1067; GFX7LESS: ; %bb.0: ; %entry 1068; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1069; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1070; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1071; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1072; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1073; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1074; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1075; GFX7LESS-NEXT: s_cbranch_execz BB6_2 1076; GFX7LESS-NEXT: ; %bb.1: 1077; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 1078; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1079; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 1080; GFX7LESS-NEXT: s_mov_b32 s10, -1 
1081; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1082; GFX7LESS-NEXT: s_mov_b32 s8, s2 1083; GFX7LESS-NEXT: s_mov_b32 s9, s3 1084; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1085; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1086; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1087; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1088; GFX7LESS-NEXT: buffer_wbinvl1 1089; GFX7LESS-NEXT: BB6_2: 1090; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1091; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1092; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1093; GFX7LESS-NEXT: s_mov_b32 s2, -1 1094; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1095; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1096; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1097; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1098; GFX7LESS-NEXT: s_endpgm 1099; 1100; GFX8-LABEL: sub_i32_constant: 1101; GFX8: ; %bb.0: ; %entry 1102; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1103; GFX8-NEXT: s_mov_b64 s[6:7], exec 1104; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1105; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1106; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1107; GFX8-NEXT: ; implicit-def: $vgpr1 1108; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1109; GFX8-NEXT: s_cbranch_execz BB6_2 1110; GFX8-NEXT: ; %bb.1: 1111; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1112; GFX8-NEXT: s_mov_b32 s8, s2 1113; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1114; GFX8-NEXT: s_mul_i32 s2, s2, 5 1115; GFX8-NEXT: s_mov_b32 s11, 0xf000 1116; GFX8-NEXT: s_mov_b32 s10, -1 1117; GFX8-NEXT: s_mov_b32 s9, s3 1118; GFX8-NEXT: v_mov_b32_e32 v1, s2 1119; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1120; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1121; GFX8-NEXT: s_waitcnt vmcnt(0) 1122; GFX8-NEXT: buffer_wbinvl1_vol 1123; GFX8-NEXT: BB6_2: 1124; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1125; GFX8-NEXT: v_readfirstlane_b32 s4, v1 1126; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1127; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1128; GFX8-NEXT: s_mov_b32 s3, 0xf000 1129; GFX8-NEXT: s_mov_b32 s2, -1 1130; 
GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 1131; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1132; GFX8-NEXT: s_endpgm 1133; 1134; GFX9-LABEL: sub_i32_constant: 1135; GFX9: ; %bb.0: ; %entry 1136; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1137; GFX9-NEXT: s_mov_b64 s[6:7], exec 1138; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1139; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1140; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1141; GFX9-NEXT: ; implicit-def: $vgpr1 1142; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1143; GFX9-NEXT: s_cbranch_execz BB6_2 1144; GFX9-NEXT: ; %bb.1: 1145; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1146; GFX9-NEXT: s_mov_b32 s8, s2 1147; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1148; GFX9-NEXT: s_mul_i32 s2, s2, 5 1149; GFX9-NEXT: s_mov_b32 s11, 0xf000 1150; GFX9-NEXT: s_mov_b32 s10, -1 1151; GFX9-NEXT: s_mov_b32 s9, s3 1152; GFX9-NEXT: v_mov_b32_e32 v1, s2 1153; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1154; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1155; GFX9-NEXT: s_waitcnt vmcnt(0) 1156; GFX9-NEXT: buffer_wbinvl1_vol 1157; GFX9-NEXT: BB6_2: 1158; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1159; GFX9-NEXT: v_readfirstlane_b32 s4, v1 1160; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1161; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1162; GFX9-NEXT: s_mov_b32 s3, 0xf000 1163; GFX9-NEXT: s_mov_b32 s2, -1 1164; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1165; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1166; GFX9-NEXT: s_endpgm 1167; 1168; GCN64-LABEL: sub_i32_constant: 1169; GCN64: ; %bb.0: ; %entry 1170; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1171; GCN64-NEXT: s_mov_b64 s[6:7], exec 1172; GCN64-NEXT: ; implicit-def: $vgpr1 1173; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1174; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1175; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1176; GCN64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1177; GCN64-NEXT: s_cbranch_execz BB6_2 1178; GCN64-NEXT: ; %bb.1: 1179; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1180; GCN64-NEXT: s_mov_b32 s11, 
0x31016000 1181; GCN64-NEXT: s_mul_i32 s6, s6, 5 1182; GCN64-NEXT: s_mov_b32 s10, -1 1183; GCN64-NEXT: v_mov_b32_e32 v1, s6 1184; GCN64-NEXT: s_waitcnt lgkmcnt(0) 1185; GCN64-NEXT: s_mov_b32 s8, s2 1186; GCN64-NEXT: s_mov_b32 s9, s3 1187; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1188; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 1189; GCN64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1190; GCN64-NEXT: s_waitcnt vmcnt(0) 1191; GCN64-NEXT: buffer_gl0_inv 1192; GCN64-NEXT: buffer_gl1_inv 1193; GCN64-NEXT: BB6_2: 1194; GCN64-NEXT: s_waitcnt_depctr 0xffe3 1195; GCN64-NEXT: s_or_b64 exec, exec, s[4:5] 1196; GCN64-NEXT: s_waitcnt lgkmcnt(0) 1197; GCN64-NEXT: v_readfirstlane_b32 s2, v1 1198; GCN64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1199; GCN64-NEXT: s_mov_b32 s3, 0x31016000 1200; GCN64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1201; GCN64-NEXT: s_mov_b32 s2, -1 1202; GCN64-NEXT: buffer_store_dword v0, off, s[0:3], 0 1203; GCN64-NEXT: s_endpgm 1204; 1205; GCN32-LABEL: sub_i32_constant: 1206; GCN32: ; %bb.0: ; %entry 1207; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1208; GCN32-NEXT: s_mov_b32 s5, exec_lo 1209; GCN32-NEXT: ; implicit-def: $vgpr1 1210; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 1211; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1212; GCN32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1213; GCN32-NEXT: s_cbranch_execz BB6_2 1214; GCN32-NEXT: ; %bb.1: 1215; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5 1216; GCN32-NEXT: s_mov_b32 s11, 0x31016000 1217; GCN32-NEXT: s_mul_i32 s5, s5, 5 1218; GCN32-NEXT: s_mov_b32 s10, -1 1219; GCN32-NEXT: v_mov_b32_e32 v1, s5 1220; GCN32-NEXT: s_waitcnt lgkmcnt(0) 1221; GCN32-NEXT: s_mov_b32 s8, s2 1222; GCN32-NEXT: s_mov_b32 s9, s3 1223; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1224; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 1225; GCN32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1226; GCN32-NEXT: s_waitcnt vmcnt(0) 1227; GCN32-NEXT: buffer_gl0_inv 1228; GCN32-NEXT: buffer_gl1_inv 1229; GCN32-NEXT: BB6_2: 1230; GCN32-NEXT: s_waitcnt_depctr 0xffe3 1231; 
GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1232; GCN32-NEXT: s_waitcnt lgkmcnt(0) 1233; GCN32-NEXT: v_readfirstlane_b32 s2, v1 1234; GCN32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1235; GCN32-NEXT: s_mov_b32 s3, 0x31016000 1236; GCN32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1237; GCN32-NEXT: s_mov_b32 s2, -1 1238; GCN32-NEXT: buffer_store_dword v0, off, s[0:3], 0 1239; GCN32-NEXT: s_endpgm 1240entry: 1241 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel 1242 store i32 %old, i32 addrspace(1)* %out 1243 ret void 1244} 1245 1246define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) { 1247; GFX7LESS-LABEL: sub_i32_uniform: 1248; GFX7LESS: ; %bb.0: ; %entry 1249; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1250; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1251; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xd 1252; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1253; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1254; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1255; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1256; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc 1257; GFX7LESS-NEXT: s_cbranch_execz BB7_2 1258; GFX7LESS-NEXT: ; %bb.1: 1259; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 1260; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1261; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1262; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 1263; GFX7LESS-NEXT: s_mov_b32 s14, -1 1264; GFX7LESS-NEXT: s_mov_b32 s12, s6 1265; GFX7LESS-NEXT: s_mov_b32 s13, s7 1266; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 1267; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1268; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1269; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1270; GFX7LESS-NEXT: buffer_wbinvl1 1271; GFX7LESS-NEXT: BB7_2: 1272; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] 1273; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1274; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1275; GFX7LESS-NEXT: s_mov_b32 s6, -1 1276; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 1277; GFX7LESS-NEXT: v_mul_lo_u32 v0, 
s0, v0 1278; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 1279; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1280; GFX7LESS-NEXT: s_endpgm 1281; 1282; GFX8-LABEL: sub_i32_uniform: 1283; GFX8: ; %bb.0: ; %entry 1284; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1285; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 1286; GFX8-NEXT: s_mov_b64 s[2:3], exec 1287; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1288; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1289; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1290; GFX8-NEXT: ; implicit-def: $vgpr1 1291; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc 1292; GFX8-NEXT: s_cbranch_execz BB7_2 1293; GFX8-NEXT: ; %bb.1: 1294; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1295; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1296; GFX8-NEXT: s_mul_i32 s1, s0, s1 1297; GFX8-NEXT: s_mov_b32 s15, 0xf000 1298; GFX8-NEXT: s_mov_b32 s14, -1 1299; GFX8-NEXT: s_mov_b32 s12, s6 1300; GFX8-NEXT: s_mov_b32 s13, s7 1301; GFX8-NEXT: v_mov_b32_e32 v1, s1 1302; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1303; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1304; GFX8-NEXT: s_waitcnt vmcnt(0) 1305; GFX8-NEXT: buffer_wbinvl1_vol 1306; GFX8-NEXT: BB7_2: 1307; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] 1308; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1309; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 1310; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1311; GFX8-NEXT: s_mov_b32 s7, 0xf000 1312; GFX8-NEXT: s_mov_b32 s6, -1 1313; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1314; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1315; GFX8-NEXT: s_endpgm 1316; 1317; GFX9-LABEL: sub_i32_uniform: 1318; GFX9: ; %bb.0: ; %entry 1319; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1320; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 1321; GFX9-NEXT: s_mov_b64 s[8:9], exec 1322; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 1323; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 1324; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1325; GFX9-NEXT: ; implicit-def: $vgpr1 1326; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1327; GFX9-NEXT: s_cbranch_execz BB7_2 
1328; GFX9-NEXT: ; %bb.1: 1329; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[8:9] 1330; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1331; GFX9-NEXT: s_mul_i32 s3, s2, s3 1332; GFX9-NEXT: s_mov_b32 s15, 0xf000 1333; GFX9-NEXT: s_mov_b32 s14, -1 1334; GFX9-NEXT: s_mov_b32 s12, s6 1335; GFX9-NEXT: s_mov_b32 s13, s7 1336; GFX9-NEXT: v_mov_b32_e32 v1, s3 1337; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1338; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1339; GFX9-NEXT: s_waitcnt vmcnt(0) 1340; GFX9-NEXT: buffer_wbinvl1_vol 1341; GFX9-NEXT: BB7_2: 1342; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1343; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1344; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1345; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1346; GFX9-NEXT: s_mov_b32 s7, 0xf000 1347; GFX9-NEXT: s_mov_b32 s6, -1 1348; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1349; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1350; GFX9-NEXT: s_endpgm 1351; 1352; GCN64-LABEL: sub_i32_uniform: 1353; GCN64: ; %bb.0: ; %entry 1354; GCN64-NEXT: s_clause 0x1 1355; GCN64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1356; GCN64-NEXT: s_load_dword s2, s[0:1], 0x34 1357; GCN64-NEXT: s_mov_b64 s[8:9], exec 1358; GCN64-NEXT: ; implicit-def: $vgpr1 1359; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 1360; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 1361; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1362; GCN64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1363; GCN64-NEXT: s_cbranch_execz BB7_2 1364; GCN64-NEXT: ; %bb.1: 1365; GCN64-NEXT: s_bcnt1_i32_b64 s3, s[8:9] 1366; GCN64-NEXT: s_mov_b32 s11, 0x31016000 1367; GCN64-NEXT: s_waitcnt lgkmcnt(0) 1368; GCN64-NEXT: s_mul_i32 s3, s2, s3 1369; GCN64-NEXT: s_mov_b32 s10, -1 1370; GCN64-NEXT: v_mov_b32_e32 v1, s3 1371; GCN64-NEXT: s_mov_b32 s8, s6 1372; GCN64-NEXT: s_mov_b32 s9, s7 1373; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1374; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 1375; GCN64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1376; GCN64-NEXT: s_waitcnt vmcnt(0) 1377; GCN64-NEXT: buffer_gl0_inv 1378; GCN64-NEXT: 
buffer_gl1_inv 1379; GCN64-NEXT: BB7_2: 1380; GCN64-NEXT: s_waitcnt_depctr 0xffe3 1381; GCN64-NEXT: s_or_b64 exec, exec, s[0:1] 1382; GCN64-NEXT: s_waitcnt lgkmcnt(0) 1383; GCN64-NEXT: v_mul_lo_u32 v0, s2, v0 1384; GCN64-NEXT: v_readfirstlane_b32 s0, v1 1385; GCN64-NEXT: s_mov_b32 s7, 0x31016000 1386; GCN64-NEXT: s_mov_b32 s6, -1 1387; GCN64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1388; GCN64-NEXT: buffer_store_dword v0, off, s[4:7], 0 1389; GCN64-NEXT: s_endpgm 1390; 1391; GCN32-LABEL: sub_i32_uniform: 1392; GCN32: ; %bb.0: ; %entry 1393; GCN32-NEXT: s_clause 0x1 1394; GCN32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1395; GCN32-NEXT: s_load_dword s2, s[0:1], 0x34 1396; GCN32-NEXT: s_mov_b32 s3, exec_lo 1397; GCN32-NEXT: ; implicit-def: $vgpr1 1398; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1399; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1400; GCN32-NEXT: s_and_saveexec_b32 s0, vcc_lo 1401; GCN32-NEXT: s_cbranch_execz BB7_2 1402; GCN32-NEXT: ; %bb.1: 1403; GCN32-NEXT: s_bcnt1_i32_b32 s1, s3 1404; GCN32-NEXT: s_mov_b32 s11, 0x31016000 1405; GCN32-NEXT: s_waitcnt lgkmcnt(0) 1406; GCN32-NEXT: s_mul_i32 s1, s2, s1 1407; GCN32-NEXT: s_mov_b32 s10, -1 1408; GCN32-NEXT: v_mov_b32_e32 v1, s1 1409; GCN32-NEXT: s_mov_b32 s8, s6 1410; GCN32-NEXT: s_mov_b32 s9, s7 1411; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1412; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 1413; GCN32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1414; GCN32-NEXT: s_waitcnt vmcnt(0) 1415; GCN32-NEXT: buffer_gl0_inv 1416; GCN32-NEXT: buffer_gl1_inv 1417; GCN32-NEXT: BB7_2: 1418; GCN32-NEXT: s_waitcnt_depctr 0xffe3 1419; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s0 1420; GCN32-NEXT: s_waitcnt lgkmcnt(0) 1421; GCN32-NEXT: v_mul_lo_u32 v0, s2, v0 1422; GCN32-NEXT: v_readfirstlane_b32 s0, v1 1423; GCN32-NEXT: s_mov_b32 s7, 0x31016000 1424; GCN32-NEXT: s_mov_b32 s6, -1 1425; GCN32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1426; GCN32-NEXT: buffer_store_dword v0, off, s[4:7], 0 1427; GCN32-NEXT: s_endpgm 1428entry: 1429 %old = 
atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel 1430 store i32 %old, i32 addrspace(1)* %out 1431 ret void 1432} 1433 1434define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 1435; GFX7LESS-LABEL: sub_i32_varying: 1436; GFX7LESS: ; %bb.0: ; %entry 1437; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1438; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1439; GFX7LESS-NEXT: s_mov_b32 s6, -1 1440; GFX7LESS-NEXT: s_mov_b32 s10, s6 1441; GFX7LESS-NEXT: s_mov_b32 s11, s7 1442; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1443; GFX7LESS-NEXT: s_mov_b32 s8, s2 1444; GFX7LESS-NEXT: s_mov_b32 s9, s3 1445; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1446; GFX7LESS-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1447; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1448; GFX7LESS-NEXT: buffer_wbinvl1 1449; GFX7LESS-NEXT: s_mov_b32 s4, s0 1450; GFX7LESS-NEXT: s_mov_b32 s5, s1 1451; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1452; GFX7LESS-NEXT: s_endpgm 1453; 1454; GFX8-LABEL: sub_i32_varying: 1455; GFX8: ; %bb.0: ; %entry 1456; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1457; GFX8-NEXT: v_mov_b32_e32 v2, v0 1458; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1459; GFX8-NEXT: v_mov_b32_e32 v1, 0 1460; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1461; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1462; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1463; GFX8-NEXT: s_not_b64 exec, exec 1464; GFX8-NEXT: v_mov_b32_e32 v2, 0 1465; GFX8-NEXT: s_not_b64 exec, exec 1466; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1467; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1468; GFX8-NEXT: s_nop 1 1469; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1470; GFX8-NEXT: s_nop 1 1471; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1472; GFX8-NEXT: s_nop 1 1473; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 
1474; GFX8-NEXT: s_nop 1 1475; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1476; GFX8-NEXT: s_nop 1 1477; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1478; GFX8-NEXT: v_readlane_b32 s6, v2, 63 1479; GFX8-NEXT: s_nop 0 1480; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1481; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1482; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1483; GFX8-NEXT: ; implicit-def: $vgpr0 1484; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1485; GFX8-NEXT: s_cbranch_execz BB8_2 1486; GFX8-NEXT: ; %bb.1: 1487; GFX8-NEXT: s_mov_b32 s11, 0xf000 1488; GFX8-NEXT: s_mov_b32 s10, -1 1489; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1490; GFX8-NEXT: s_mov_b32 s8, s2 1491; GFX8-NEXT: s_mov_b32 s9, s3 1492; GFX8-NEXT: v_mov_b32_e32 v0, s6 1493; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1494; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1495; GFX8-NEXT: s_waitcnt vmcnt(0) 1496; GFX8-NEXT: buffer_wbinvl1_vol 1497; GFX8-NEXT: BB8_2: 1498; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1499; GFX8-NEXT: v_readfirstlane_b32 s4, v0 1500; GFX8-NEXT: v_mov_b32_e32 v0, v1 1501; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1502; GFX8-NEXT: s_mov_b32 s3, 0xf000 1503; GFX8-NEXT: s_mov_b32 s2, -1 1504; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 1505; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1506; GFX8-NEXT: s_endpgm 1507; 1508; GFX9-LABEL: sub_i32_varying: 1509; GFX9: ; %bb.0: ; %entry 1510; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1511; GFX9-NEXT: v_mov_b32_e32 v2, v0 1512; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1513; GFX9-NEXT: v_mov_b32_e32 v1, 0 1514; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1515; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1516; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1517; GFX9-NEXT: s_not_b64 exec, exec 1518; GFX9-NEXT: v_mov_b32_e32 v2, 0 1519; GFX9-NEXT: s_not_b64 exec, exec 1520; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1521; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 1522; GFX9-NEXT: s_nop 1 1523; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1524; GFX9-NEXT: s_nop 1 1525; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1526; GFX9-NEXT: s_nop 1 1527; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1528; GFX9-NEXT: s_nop 1 1529; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1530; GFX9-NEXT: s_nop 1 1531; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1532; GFX9-NEXT: v_readlane_b32 s6, v2, 63 1533; GFX9-NEXT: s_nop 0 1534; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1535; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1536; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1537; GFX9-NEXT: ; implicit-def: $vgpr0 1538; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1539; GFX9-NEXT: s_cbranch_execz BB8_2 1540; GFX9-NEXT: ; %bb.1: 1541; GFX9-NEXT: s_mov_b32 s11, 0xf000 1542; GFX9-NEXT: s_mov_b32 s10, -1 1543; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1544; GFX9-NEXT: s_mov_b32 s8, s2 1545; GFX9-NEXT: s_mov_b32 s9, s3 1546; GFX9-NEXT: v_mov_b32_e32 v0, s6 1547; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1548; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1549; GFX9-NEXT: s_waitcnt vmcnt(0) 1550; GFX9-NEXT: buffer_wbinvl1_vol 1551; GFX9-NEXT: BB8_2: 1552; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1553; GFX9-NEXT: v_readfirstlane_b32 s4, v0 1554; GFX9-NEXT: v_mov_b32_e32 v0, v1 1555; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1556; GFX9-NEXT: s_mov_b32 s3, 0xf000 1557; GFX9-NEXT: s_mov_b32 s2, -1 1558; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1559; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1560; GFX9-NEXT: s_endpgm 1561; 1562; GCN64-LABEL: sub_i32_varying: 1563; GCN64: ; %bb.0: ; %entry 1564; GCN64-NEXT: v_mov_b32_e32 v1, v0 1565; GCN64-NEXT: s_not_b64 exec, exec 1566; GCN64-NEXT: v_mov_b32_e32 v1, 0 1567; GCN64-NEXT: s_not_b64 exec, exec 
1568; GCN64-NEXT: s_or_saveexec_b64 s[2:3], -1 1569; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1570; GCN64-NEXT: v_mov_b32_e32 v3, 0 1571; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1572; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1573; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1574; GCN64-NEXT: v_mov_b32_e32 v2, v1 1575; GCN64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1576; GCN64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1577; GCN64-NEXT: v_readlane_b32 s4, v1, 31 1578; GCN64-NEXT: v_mov_b32_e32 v2, s4 1579; GCN64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1580; GCN64-NEXT: v_readlane_b32 s6, v1, 15 1581; GCN64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1582; GCN64-NEXT: s_mov_b64 exec, s[2:3] 1583; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1584; GCN64-NEXT: s_or_saveexec_b64 s[4:5], -1 1585; GCN64-NEXT: v_readlane_b32 s7, v1, 31 1586; GCN64-NEXT: v_writelane_b32 v3, s6, 16 1587; GCN64-NEXT: s_mov_b64 exec, s[4:5] 1588; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1589; GCN64-NEXT: s_or_saveexec_b64 s[4:5], -1 1590; GCN64-NEXT: v_readlane_b32 s8, v1, 47 1591; GCN64-NEXT: v_readlane_b32 s9, v1, 63 1592; GCN64-NEXT: v_writelane_b32 v3, s7, 32 1593; GCN64-NEXT: s_mov_b64 exec, s[4:5] 1594; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 1595; GCN64-NEXT: s_or_saveexec_b64 s[6:7], -1 1596; GCN64-NEXT: s_mov_b32 s4, s9 1597; GCN64-NEXT: v_writelane_b32 v3, s8, 48 1598; GCN64-NEXT: s_mov_b64 exec, s[6:7] 1599; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1600; GCN64-NEXT: s_mov_b32 s6, -1 1601; GCN64-NEXT: ; implicit-def: $vgpr0 1602; GCN64-NEXT: s_and_saveexec_b64 s[8:9], vcc 1603; GCN64-NEXT: s_cbranch_execz BB8_2 1604; GCN64-NEXT: ; %bb.1: 1605; GCN64-NEXT: v_mov_b32_e32 v0, s4 
1606; GCN64-NEXT: s_mov_b32 s7, 0x31016000 1607; GCN64-NEXT: s_waitcnt lgkmcnt(0) 1608; GCN64-NEXT: s_mov_b32 s4, s2 1609; GCN64-NEXT: s_mov_b32 s5, s3 1610; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1611; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 1612; GCN64-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 1613; GCN64-NEXT: s_waitcnt vmcnt(0) 1614; GCN64-NEXT: buffer_gl0_inv 1615; GCN64-NEXT: buffer_gl1_inv 1616; GCN64-NEXT: BB8_2: 1617; GCN64-NEXT: s_waitcnt_depctr 0xffe3 1618; GCN64-NEXT: s_or_b64 exec, exec, s[8:9] 1619; GCN64-NEXT: s_waitcnt lgkmcnt(0) 1620; GCN64-NEXT: v_readfirstlane_b32 s2, v0 1621; GCN64-NEXT: v_mov_b32_e32 v0, v3 1622; GCN64-NEXT: s_mov_b32 s3, 0x31016000 1623; GCN64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1624; GCN64-NEXT: s_mov_b32 s2, s6 1625; GCN64-NEXT: buffer_store_dword v0, off, s[0:3], 0 1626; GCN64-NEXT: s_endpgm 1627; 1628; GCN32-LABEL: sub_i32_varying: 1629; GCN32: ; %bb.0: ; %entry 1630; GCN32-NEXT: v_mov_b32_e32 v1, v0 1631; GCN32-NEXT: s_not_b32 exec_lo, exec_lo 1632; GCN32-NEXT: v_mov_b32_e32 v1, 0 1633; GCN32-NEXT: s_not_b32 exec_lo, exec_lo 1634; GCN32-NEXT: s_or_saveexec_b32 s2, -1 1635; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1636; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1637; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1638; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1639; GCN32-NEXT: v_mov_b32_e32 v2, v1 1640; GCN32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1641; GCN32-NEXT: s_mov_b32 exec_lo, s2 1642; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1643; GCN32-NEXT: s_or_saveexec_b32 s4, -1 1644; GCN32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1645; GCN32-NEXT: v_mov_b32_e32 v3, 0 1646; GCN32-NEXT: v_readlane_b32 s5, v1, 15 1647; GCN32-NEXT: v_readlane_b32 s6, v1, 31 1648; GCN32-NEXT: v_mov_b32_dpp v3, 
v1 row_shr:1 row_mask:0xf bank_mask:0xf 1649; GCN32-NEXT: s_mov_b32 exec_lo, s4 1650; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1651; GCN32-NEXT: s_or_saveexec_b32 s4, -1 1652; GCN32-NEXT: v_writelane_b32 v3, s5, 16 1653; GCN32-NEXT: s_mov_b32 exec_lo, s4 1654; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1655; GCN32-NEXT: s_mov_b32 s4, s6 1656; GCN32-NEXT: s_mov_b32 s6, -1 1657; GCN32-NEXT: ; implicit-def: $vgpr0 1658; GCN32-NEXT: s_and_saveexec_b32 s8, vcc_lo 1659; GCN32-NEXT: s_cbranch_execz BB8_2 1660; GCN32-NEXT: ; %bb.1: 1661; GCN32-NEXT: v_mov_b32_e32 v0, s4 1662; GCN32-NEXT: s_mov_b32 s7, 0x31016000 1663; GCN32-NEXT: s_waitcnt lgkmcnt(0) 1664; GCN32-NEXT: s_mov_b32 s4, s2 1665; GCN32-NEXT: s_mov_b32 s5, s3 1666; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1667; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 1668; GCN32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 1669; GCN32-NEXT: s_waitcnt vmcnt(0) 1670; GCN32-NEXT: buffer_gl0_inv 1671; GCN32-NEXT: buffer_gl1_inv 1672; GCN32-NEXT: BB8_2: 1673; GCN32-NEXT: s_waitcnt_depctr 0xffe3 1674; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s8 1675; GCN32-NEXT: s_waitcnt lgkmcnt(0) 1676; GCN32-NEXT: v_readfirstlane_b32 s2, v0 1677; GCN32-NEXT: v_mov_b32_e32 v0, v3 1678; GCN32-NEXT: s_mov_b32 s3, 0x31016000 1679; GCN32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1680; GCN32-NEXT: s_mov_b32 s2, s6 1681; GCN32-NEXT: buffer_store_dword v0, off, s[0:3], 0 1682; GCN32-NEXT: s_endpgm 1683entry: 1684 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1685 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel 1686 store i32 %old, i32 addrspace(1)* %out 1687 ret void 1688} 1689
; sub_i64_constant - 64-bit atomicrmw sub of the literal 5 (acq_rel) on the
; global pointer %inout; the value returned by the atomic is stored to %out.
; The expected code below has only the lane whose mbcnt result is 0 issue a
; single buffer_atomic_sub_x2 whose operand is 5 times the bit count
; (s_bcnt1) of the saved exec mask. After the branch rejoins, every lane
; takes the returned value via v_readfirstlane and subtracts 5 times its own
; mbcnt result (64-bit subtract with borrow) before the final store.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1690define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 1691; GFX7LESS-LABEL: sub_i64_constant: 1692; GFX7LESS: ; %bb.0: ; %entry 1693; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1694; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1695; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1696; GFX7LESS-NEXT:
v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1697; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1698; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1699; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1700; GFX7LESS-NEXT: s_cbranch_execz BB9_2 1701; GFX7LESS-NEXT: ; %bb.1: 1702; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 1703; GFX7LESS-NEXT: s_mov_b32 s10, -1 1704; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1705; GFX7LESS-NEXT: s_mov_b32 s8, s2 1706; GFX7LESS-NEXT: s_mov_b32 s9, s3 1707; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1708; GFX7LESS-NEXT: s_mul_i32 s3, s2, 5 1709; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 1710; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 1711; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1712; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc 1713; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1714; GFX7LESS-NEXT: buffer_wbinvl1 1715; GFX7LESS-NEXT: BB9_2: 1716; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1717; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1718; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1719; GFX7LESS-NEXT: s_mov_b32 s2, -1 1720; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1721; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v2 1722; GFX7LESS-NEXT: s_waitcnt expcnt(0) 1723; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1724; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1725; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 1726; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1727; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1728; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1729; GFX7LESS-NEXT: s_endpgm 1730; 1731; GFX8-LABEL: sub_i64_constant: 1732; GFX8: ; %bb.0: ; %entry 1733; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1734; GFX8-NEXT: s_mov_b64 s[6:7], exec 1735; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1736; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1737; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1738; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1739; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1740; GFX8-NEXT: s_cbranch_execz BB9_2 1741; GFX8-NEXT: ; %bb.1: 1742; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 1743; GFX8-NEXT: s_mov_b32 s8, s2 1744; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1745; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 1746; GFX8-NEXT: s_mul_i32 s2, s2, 5 1747; GFX8-NEXT: s_mov_b32 s11, 0xf000 1748; GFX8-NEXT: s_mov_b32 s10, -1 1749; GFX8-NEXT: s_mov_b32 s9, s3 1750; GFX8-NEXT: v_mov_b32_e32 v1, s2 1751; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1752; GFX8-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc 1753; GFX8-NEXT: s_waitcnt vmcnt(0) 1754; GFX8-NEXT: buffer_wbinvl1_vol 1755; GFX8-NEXT: BB9_2: 1756; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1757; GFX8-NEXT: v_readfirstlane_b32 s5, v2 1758; GFX8-NEXT: v_readfirstlane_b32 s4, v1 1759; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1760; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1761; GFX8-NEXT: v_mov_b32_e32 v2, s5 1762; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 1763; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1764; GFX8-NEXT: s_mov_b32 s3, 0xf000 1765; GFX8-NEXT: s_mov_b32 s2, -1 1766; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1767; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1768; GFX8-NEXT: s_endpgm 1769; 1770; GFX9-LABEL: sub_i64_constant: 1771; GFX9: ; %bb.0: ; %entry 1772; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1773; GFX9-NEXT: s_mov_b64 s[6:7], exec 1774; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1775; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1776; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1777; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1778; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1779; GFX9-NEXT: s_cbranch_execz BB9_2 1780; GFX9-NEXT: ; %bb.1: 1781; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1782; GFX9-NEXT: s_mov_b32 s8, s2 1783; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1784; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 1785; GFX9-NEXT: s_mul_i32 s2, s2, 5 1786; GFX9-NEXT: s_mov_b32 s11, 0xf000 1787; GFX9-NEXT: s_mov_b32 s10, -1 1788; GFX9-NEXT: s_mov_b32 s9, s3 1789; GFX9-NEXT: v_mov_b32_e32 v1, s2 1790; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1791; GFX9-NEXT: buffer_atomic_sub_x2 
v[1:2], off, s[8:11], 0 glc 1792; GFX9-NEXT: s_waitcnt vmcnt(0) 1793; GFX9-NEXT: buffer_wbinvl1_vol 1794; GFX9-NEXT: BB9_2: 1795; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1796; GFX9-NEXT: v_readfirstlane_b32 s5, v2 1797; GFX9-NEXT: v_readfirstlane_b32 s4, v1 1798; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1799; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1800; GFX9-NEXT: v_mov_b32_e32 v2, s5 1801; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 1802; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1803; GFX9-NEXT: s_mov_b32 s3, 0xf000 1804; GFX9-NEXT: s_mov_b32 s2, -1 1805; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 1806; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1807; GFX9-NEXT: s_endpgm 1808; 1809; GCN64-LABEL: sub_i64_constant: 1810; GCN64: ; %bb.0: ; %entry 1811; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1812; GCN64-NEXT: s_mov_b64 s[6:7], exec 1813; GCN64-NEXT: ; implicit-def: $vgpr1_vgpr2 1814; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1815; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1816; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1817; GCN64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1818; GCN64-NEXT: s_cbranch_execz BB9_2 1819; GCN64-NEXT: ; %bb.1: 1820; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1821; GCN64-NEXT: s_mov_b32 s11, 0x31016000 1822; GCN64-NEXT: s_mul_i32 s7, s6, 5 1823; GCN64-NEXT: v_mul_hi_u32_u24_e64 v2, s6, 5 1824; GCN64-NEXT: v_mov_b32_e32 v1, s7 1825; GCN64-NEXT: s_mov_b32 s10, -1 1826; GCN64-NEXT: s_waitcnt lgkmcnt(0) 1827; GCN64-NEXT: s_mov_b32 s8, s2 1828; GCN64-NEXT: s_mov_b32 s9, s3 1829; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1830; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 1831; GCN64-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc 1832; GCN64-NEXT: s_waitcnt vmcnt(0) 1833; GCN64-NEXT: buffer_gl0_inv 1834; GCN64-NEXT: buffer_gl1_inv 1835; GCN64-NEXT: BB9_2: 1836; GCN64-NEXT: s_waitcnt_depctr 0xffe3 1837; GCN64-NEXT: s_or_b64 exec, exec, s[4:5] 1838; GCN64-NEXT: s_waitcnt lgkmcnt(0) 1839; GCN64-NEXT: v_readfirstlane_b32 s2, v1 1840; 
GCN64-NEXT: v_mul_u32_u24_e32 v1, 5, v0 1841; GCN64-NEXT: v_readfirstlane_b32 s3, v2 1842; GCN64-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 1843; GCN64-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 1844; GCN64-NEXT: s_mov_b32 s2, -1 1845; GCN64-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 1846; GCN64-NEXT: s_mov_b32 s3, 0x31016000 1847; GCN64-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1848; GCN64-NEXT: s_endpgm 1849; 1850; GCN32-LABEL: sub_i64_constant: 1851; GCN32: ; %bb.0: ; %entry 1852; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1853; GCN32-NEXT: s_mov_b32 s5, exec_lo 1854; GCN32-NEXT: ; implicit-def: $vgpr1_vgpr2 1855; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 1856; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1857; GCN32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1858; GCN32-NEXT: s_cbranch_execz BB9_2 1859; GCN32-NEXT: ; %bb.1: 1860; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5 1861; GCN32-NEXT: s_mov_b32 s11, 0x31016000 1862; GCN32-NEXT: s_mul_i32 s6, s5, 5 1863; GCN32-NEXT: v_mul_hi_u32_u24_e64 v2, s5, 5 1864; GCN32-NEXT: v_mov_b32_e32 v1, s6 1865; GCN32-NEXT: s_mov_b32 s10, -1 1866; GCN32-NEXT: s_waitcnt lgkmcnt(0) 1867; GCN32-NEXT: s_mov_b32 s8, s2 1868; GCN32-NEXT: s_mov_b32 s9, s3 1869; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1870; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 1871; GCN32-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc 1872; GCN32-NEXT: s_waitcnt vmcnt(0) 1873; GCN32-NEXT: buffer_gl0_inv 1874; GCN32-NEXT: buffer_gl1_inv 1875; GCN32-NEXT: BB9_2: 1876; GCN32-NEXT: s_waitcnt_depctr 0xffe3 1877; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1878; GCN32-NEXT: s_waitcnt lgkmcnt(0) 1879; GCN32-NEXT: v_readfirstlane_b32 s2, v1 1880; GCN32-NEXT: v_mul_u32_u24_e32 v1, 5, v0 1881; GCN32-NEXT: v_readfirstlane_b32 s3, v2 1882; GCN32-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 1883; GCN32-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 1884; GCN32-NEXT: s_mov_b32 s2, -1 1885; GCN32-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 1886; GCN32-NEXT: s_mov_b32 s3, 0x31016000 
1887; GCN32-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1888; GCN32-NEXT: s_endpgm 1889entry: 1890 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel 1891 store i64 %old, i64 addrspace(1)* %out 1892 ret void 1893} 1894
; sub_i64_uniform - 64-bit atomicrmw sub (acq_rel) on the global pointer
; %inout where the operand is the kernel argument %subitive, i.e. the same
; value in every lane; the value returned by the atomic is stored to %out.
; The expected code below has only the lane whose mbcnt result is 0 issue a
; single buffer_atomic_sub_x2 whose operand is the loaded argument multiplied
; by the bit count (s_bcnt1) of the saved exec mask. After the branch
; rejoins, every lane takes the returned value via v_readfirstlane and
; subtracts the argument multiplied by its own mbcnt result (mul_lo/mul_hi
; pair, then 64-bit subtract with borrow) before the final store.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1895define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) { 1896; GFX7LESS-LABEL: sub_i64_uniform: 1897; GFX7LESS: ; %bb.0: ; %entry 1898; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec 1899; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1900; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1901; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 1902; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 1903; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1904; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1905; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1906; GFX7LESS-NEXT: s_cbranch_execz BB10_2 1907; GFX7LESS-NEXT: ; %bb.1: 1908; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 1909; GFX7LESS-NEXT: s_mov_b32 s14, -1 1910; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1911; GFX7LESS-NEXT: s_mov_b32 s12, s6 1912; GFX7LESS-NEXT: s_mov_b32 s13, s7 1913; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 1914; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 1915; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1916; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v1 1917; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 1918; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 1919; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1920; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1921; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc 1922; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1923; GFX7LESS-NEXT: buffer_wbinvl1 1924; GFX7LESS-NEXT: BB10_2: 1925; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1926; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1927; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1928; GFX7LESS-NEXT: s_mov_b32 s6, -1 1929; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1930; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v2 1931; GFX7LESS-NEXT: s_waitcnt expcnt(0) 1932; GFX7LESS-NEXT: v_mul_lo_u32 v1,
s1, v0 1933; GFX7LESS-NEXT: v_mul_hi_u32 v2, s0, v0 1934; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 1935; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1936; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 1937; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1938; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1939; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1940; GFX7LESS-NEXT: s_endpgm 1941; 1942; GFX8-LABEL: sub_i64_uniform: 1943; GFX8: ; %bb.0: ; %entry 1944; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1945; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1946; GFX8-NEXT: s_mov_b64 s[8:9], exec 1947; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 1948; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 1949; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1950; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1951; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1952; GFX8-NEXT: s_cbranch_execz BB10_2 1953; GFX8-NEXT: ; %bb.1: 1954; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1955; GFX8-NEXT: s_mov_b32 s12, s6 1956; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 1957; GFX8-NEXT: v_mov_b32_e32 v1, s6 1958; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1 1959; GFX8-NEXT: s_mov_b32 s13, s7 1960; GFX8-NEXT: s_mul_i32 s7, s1, s6 1961; GFX8-NEXT: s_mul_i32 s6, s0, s6 1962; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 1963; GFX8-NEXT: s_mov_b32 s15, 0xf000 1964; GFX8-NEXT: s_mov_b32 s14, -1 1965; GFX8-NEXT: v_mov_b32_e32 v1, s6 1966; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1967; GFX8-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc 1968; GFX8-NEXT: s_waitcnt vmcnt(0) 1969; GFX8-NEXT: buffer_wbinvl1_vol 1970; GFX8-NEXT: BB10_2: 1971; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1972; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1973; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1974; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 1975; GFX8-NEXT: v_mul_hi_u32 v3, s0, v0 1976; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 1977; GFX8-NEXT: v_readfirstlane_b32 s1, v2 1978; GFX8-NEXT: v_mov_b32_e32 v2, s1 1979; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1980; GFX8-NEXT: v_sub_u32_e32 v0, 
vcc, s2, v0 1981; GFX8-NEXT: s_mov_b32 s7, 0xf000 1982; GFX8-NEXT: s_mov_b32 s6, -1 1983; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1984; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1985; GFX8-NEXT: s_endpgm 1986; 1987; GFX9-LABEL: sub_i64_uniform: 1988; GFX9: ; %bb.0: ; %entry 1989; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1990; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1991; GFX9-NEXT: s_mov_b64 s[8:9], exec 1992; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 1993; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 1994; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1995; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1996; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1997; GFX9-NEXT: s_cbranch_execz BB10_2 1998; GFX9-NEXT: ; %bb.1: 1999; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2000; GFX9-NEXT: s_mov_b32 s12, s6 2001; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 2002; GFX9-NEXT: s_mov_b32 s13, s7 2003; GFX9-NEXT: s_mul_i32 s7, s3, s6 2004; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2005; GFX9-NEXT: s_add_i32 s8, s8, s7 2006; GFX9-NEXT: s_mul_i32 s6, s2, s6 2007; GFX9-NEXT: s_mov_b32 s15, 0xf000 2008; GFX9-NEXT: s_mov_b32 s14, -1 2009; GFX9-NEXT: v_mov_b32_e32 v1, s6 2010; GFX9-NEXT: v_mov_b32_e32 v2, s8 2011; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2012; GFX9-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc 2013; GFX9-NEXT: s_waitcnt vmcnt(0) 2014; GFX9-NEXT: buffer_wbinvl1_vol 2015; GFX9-NEXT: BB10_2: 2016; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 2017; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2018; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 2019; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 2020; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 2021; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2022; GFX9-NEXT: v_readfirstlane_b32 s1, v2 2023; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 2024; GFX9-NEXT: v_mov_b32_e32 v2, s1 2025; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2026; GFX9-NEXT: s_mov_b32 s7, 0xf000 2027; GFX9-NEXT: s_mov_b32 s6, -1 2028; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2029; GFX9-NEXT: buffer_store_dwordx2 
v[0:1], off, s[4:7], 0 2030; GFX9-NEXT: s_endpgm 2031; 2032; GCN64-LABEL: sub_i64_uniform: 2033; GCN64: ; %bb.0: ; %entry 2034; GCN64-NEXT: s_clause 0x1 2035; GCN64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2036; GCN64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2037; GCN64-NEXT: s_mov_b64 s[8:9], exec 2038; GCN64-NEXT: ; implicit-def: $vgpr1_vgpr2 2039; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 2040; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 2041; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2042; GCN64-NEXT: s_and_saveexec_b64 s[0:1], vcc 2043; GCN64-NEXT: s_cbranch_execz BB10_2 2044; GCN64-NEXT: ; %bb.1: 2045; GCN64-NEXT: s_bcnt1_i32_b64 s8, s[8:9] 2046; GCN64-NEXT: s_mov_b32 s11, 0x31016000 2047; GCN64-NEXT: s_waitcnt lgkmcnt(0) 2048; GCN64-NEXT: s_mul_i32 s9, s3, s8 2049; GCN64-NEXT: s_mul_hi_u32 s10, s2, s8 2050; GCN64-NEXT: s_mul_i32 s8, s2, s8 2051; GCN64-NEXT: s_add_i32 s10, s10, s9 2052; GCN64-NEXT: v_mov_b32_e32 v1, s8 2053; GCN64-NEXT: v_mov_b32_e32 v2, s10 2054; GCN64-NEXT: s_mov_b32 s10, -1 2055; GCN64-NEXT: s_mov_b32 s8, s6 2056; GCN64-NEXT: s_mov_b32 s9, s7 2057; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2058; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 2059; GCN64-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc 2060; GCN64-NEXT: s_waitcnt vmcnt(0) 2061; GCN64-NEXT: buffer_gl0_inv 2062; GCN64-NEXT: buffer_gl1_inv 2063; GCN64-NEXT: BB10_2: 2064; GCN64-NEXT: s_waitcnt_depctr 0xffe3 2065; GCN64-NEXT: s_or_b64 exec, exec, s[0:1] 2066; GCN64-NEXT: s_waitcnt lgkmcnt(0) 2067; GCN64-NEXT: v_mul_lo_u32 v3, s3, v0 2068; GCN64-NEXT: v_mul_hi_u32 v4, s2, v0 2069; GCN64-NEXT: v_mul_lo_u32 v0, s2, v0 2070; GCN64-NEXT: v_readfirstlane_b32 s0, v1 2071; GCN64-NEXT: v_readfirstlane_b32 s1, v2 2072; GCN64-NEXT: s_mov_b32 s7, 0x31016000 2073; GCN64-NEXT: s_mov_b32 s6, -1 2074; GCN64-NEXT: v_add_nc_u32_e32 v1, v4, v3 2075; GCN64-NEXT: v_sub_co_u32_e64 v0, vcc, s0, v0 2076; GCN64-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc 2077; GCN64-NEXT: buffer_store_dwordx2 
v[0:1], off, s[4:7], 0 2078; GCN64-NEXT: s_endpgm 2079; 2080; GCN32-LABEL: sub_i64_uniform: 2081; GCN32: ; %bb.0: ; %entry 2082; GCN32-NEXT: s_clause 0x1 2083; GCN32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2084; GCN32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2085; GCN32-NEXT: s_mov_b32 s8, exec_lo 2086; GCN32-NEXT: ; implicit-def: $vgpr1_vgpr2 2087; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 2088; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2089; GCN32-NEXT: s_and_saveexec_b32 s0, vcc_lo 2090; GCN32-NEXT: s_cbranch_execz BB10_2 2091; GCN32-NEXT: ; %bb.1: 2092; GCN32-NEXT: s_bcnt1_i32_b32 s1, s8 2093; GCN32-NEXT: s_mov_b32 s11, 0x31016000 2094; GCN32-NEXT: s_waitcnt lgkmcnt(0) 2095; GCN32-NEXT: s_mul_i32 s8, s3, s1 2096; GCN32-NEXT: s_mul_hi_u32 s9, s2, s1 2097; GCN32-NEXT: s_mul_i32 s1, s2, s1 2098; GCN32-NEXT: s_add_i32 s9, s9, s8 2099; GCN32-NEXT: v_mov_b32_e32 v1, s1 2100; GCN32-NEXT: v_mov_b32_e32 v2, s9 2101; GCN32-NEXT: s_mov_b32 s10, -1 2102; GCN32-NEXT: s_mov_b32 s8, s6 2103; GCN32-NEXT: s_mov_b32 s9, s7 2104; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2105; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 2106; GCN32-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc 2107; GCN32-NEXT: s_waitcnt vmcnt(0) 2108; GCN32-NEXT: buffer_gl0_inv 2109; GCN32-NEXT: buffer_gl1_inv 2110; GCN32-NEXT: BB10_2: 2111; GCN32-NEXT: s_waitcnt_depctr 0xffe3 2112; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s0 2113; GCN32-NEXT: s_waitcnt lgkmcnt(0) 2114; GCN32-NEXT: v_mul_lo_u32 v3, s3, v0 2115; GCN32-NEXT: v_mul_hi_u32 v4, s2, v0 2116; GCN32-NEXT: v_mul_lo_u32 v0, s2, v0 2117; GCN32-NEXT: v_readfirstlane_b32 s0, v1 2118; GCN32-NEXT: v_readfirstlane_b32 s1, v2 2119; GCN32-NEXT: s_mov_b32 s7, 0x31016000 2120; GCN32-NEXT: s_mov_b32 s6, -1 2121; GCN32-NEXT: v_add_nc_u32_e32 v1, v4, v3 2122; GCN32-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s0, v0 2123; GCN32-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 2124; GCN32-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2125; GCN32-NEXT: s_endpgm 
2126entry: 2127 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel 2128 store i64 %old, i64 addrspace(1)* %out 2129 ret void 2130} 2131
; sub_i64_varying - 64-bit atomicrmw sub (acq_rel) on the global pointer
; %inout where the operand is the work-item id (llvm.amdgcn.workitem.id.x)
; zero-extended to i64, so it differs per lane; the value returned by the
; atomic is stored to %out.
; Unlike the constant and uniform i64 cases, the expected code below contains
; no mbcnt / saveexec single-lane sequence for any of the run lines: the
; high half of the operand is zeroed (v_mov_b32 v1, 0) and each lane issues
; buffer_atomic_sub_x2 directly, then stores the result.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2132define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 2133; GFX7LESS-LABEL: sub_i64_varying: 2134; GFX7LESS: ; %bb.0: ; %entry 2135; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2136; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2137; GFX7LESS-NEXT: s_mov_b32 s6, -1 2138; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2139; GFX7LESS-NEXT: s_mov_b32 s10, s6 2140; GFX7LESS-NEXT: s_mov_b32 s11, s7 2141; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2142; GFX7LESS-NEXT: s_mov_b32 s8, s2 2143; GFX7LESS-NEXT: s_mov_b32 s9, s3 2144; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2145; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2146; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 2147; GFX7LESS-NEXT: buffer_wbinvl1 2148; GFX7LESS-NEXT: s_mov_b32 s4, s0 2149; GFX7LESS-NEXT: s_mov_b32 s5, s1 2150; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2151; GFX7LESS-NEXT: s_endpgm 2152; 2153; GFX89-LABEL: sub_i64_varying: 2154; GFX89: ; %bb.0: ; %entry 2155; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2156; GFX89-NEXT: s_mov_b32 s3, 0xf000 2157; GFX89-NEXT: s_mov_b32 s2, -1 2158; GFX89-NEXT: v_mov_b32_e32 v1, 0 2159; GFX89-NEXT: s_waitcnt lgkmcnt(0) 2160; GFX89-NEXT: s_mov_b32 s0, s4 2161; GFX89-NEXT: s_mov_b32 s1, s5 2162; GFX89-NEXT: s_mov_b32 s4, s6 2163; GFX89-NEXT: s_mov_b32 s5, s7 2164; GFX89-NEXT: s_mov_b32 s6, s2 2165; GFX89-NEXT: s_mov_b32 s7, s3 2166; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2167; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc 2168; GFX89-NEXT: s_waitcnt vmcnt(0) 2169; GFX89-NEXT: buffer_wbinvl1_vol 2170; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2171; GFX89-NEXT: s_endpgm 2172; 2173; GFX10-LABEL: sub_i64_varying: 2174; GFX10: ; %bb.0: ; %entry 2175; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2176; GFX10-NEXT: v_mov_b32_e32 v1, 0
2177; GFX10-NEXT: s_mov_b32 s7, 0x31016000 2178; GFX10-NEXT: s_mov_b32 s6, -1 2179; GFX10-NEXT: s_mov_b32 s11, s7 2180; GFX10-NEXT: s_mov_b32 s10, s6 2181; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2182; GFX10-NEXT: s_mov_b32 s8, s2 2183; GFX10-NEXT: s_mov_b32 s9, s3 2184; GFX10-NEXT: s_mov_b32 s4, s0 2185; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2186; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2187; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2188; GFX10-NEXT: s_waitcnt vmcnt(0) 2189; GFX10-NEXT: buffer_gl0_inv 2190; GFX10-NEXT: buffer_gl1_inv 2191; GFX10-NEXT: s_mov_b32 s5, s1 2192; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2193; GFX10-NEXT: s_endpgm 2194entry: 2195 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2196 %zext = zext i32 %lane to i64 2197 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel 2198 store i64 %old, i64 addrspace(1)* %out 2199 ret void 2200} 2201