1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s 3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9 10; Show what the atomic optimization pass will do for global pointers. 
11
; add_i32_constant: atomically adds the constant 5 (i32, acq_rel ordering) to
; the addrspace(1) location %inout and stores the returned old value to %out.
; For every RUN configuration the autogenerated checks expect one
; buffer_atomic_add, issued under s_and_saveexec by the lane whose v_mbcnt
; result is 0, with an operand of s_bcnt1(saved exec mask) * 5. After the
; branch rejoins, each lane combines v_readfirstlane of the returned value with
; 5 * its own v_mbcnt value (v_mad_u32_u24) and stores that.
; The check lines are tool-generated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
12define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 13; GFX7LESS-LABEL: add_i32_constant: 14; GFX7LESS: ; %bb.0: ; %entry 15; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 16; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 17; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 18; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 19; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 20; GFX7LESS-NEXT: ; implicit-def: $vgpr1 21; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 22; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 23; GFX7LESS-NEXT: ; %bb.1: 24; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 25; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 26; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 27; GFX7LESS-NEXT: s_mov_b32 s10, -1 28; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 29; GFX7LESS-NEXT: s_mov_b32 s8, s2 30; GFX7LESS-NEXT: s_mov_b32 s9, s3 31; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 32; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 33; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 34; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 35; GFX7LESS-NEXT: buffer_wbinvl1 36; GFX7LESS-NEXT: .LBB0_2: 37; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 38; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 39; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 40; GFX7LESS-NEXT: s_mov_b32 s2, -1 41; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 42; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 43; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 44; GFX7LESS-NEXT: s_endpgm 45; 46; GFX89-LABEL: add_i32_constant: 47; GFX89: ; %bb.0: ; %entry 48; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 49; GFX89-NEXT: s_mov_b64 s[6:7], exec 50; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 51; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 52; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 53; GFX89-NEXT: ; implicit-def: $vgpr1 54; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc 55; GFX89-NEXT: s_cbranch_execz .LBB0_2 56; GFX89-NEXT: ; %bb.1: 57; GFX89-NEXT: s_waitcnt lgkmcnt(0) 58; GFX89-NEXT: s_mov_b32 s8, s2 59; GFX89-NEXT: s_bcnt1_i32_b64 s2, 
s[6:7] 60; GFX89-NEXT: s_mul_i32 s2, s2, 5 61; GFX89-NEXT: s_mov_b32 s11, 0xf000 62; GFX89-NEXT: s_mov_b32 s10, -1 63; GFX89-NEXT: s_mov_b32 s9, s3 64; GFX89-NEXT: v_mov_b32_e32 v1, s2 65; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 66; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 67; GFX89-NEXT: s_waitcnt vmcnt(0) 68; GFX89-NEXT: buffer_wbinvl1_vol 69; GFX89-NEXT: .LBB0_2: 70; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] 71; GFX89-NEXT: v_readfirstlane_b32 s4, v1 72; GFX89-NEXT: s_waitcnt lgkmcnt(0) 73; GFX89-NEXT: s_mov_b32 s3, 0xf000 74; GFX89-NEXT: s_mov_b32 s2, -1 75; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 76; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 77; GFX89-NEXT: s_endpgm 78; 79; GFX1064-LABEL: add_i32_constant: 80; GFX1064: ; %bb.0: ; %entry 81; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 82; GFX1064-NEXT: s_mov_b64 s[6:7], exec 83; GFX1064-NEXT: ; implicit-def: $vgpr1 84; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 85; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 86; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 87; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 88; GFX1064-NEXT: s_cbranch_execz .LBB0_2 89; GFX1064-NEXT: ; %bb.1: 90; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 91; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 92; GFX1064-NEXT: s_mul_i32 s6, s6, 5 93; GFX1064-NEXT: s_mov_b32 s10, -1 94; GFX1064-NEXT: v_mov_b32_e32 v1, s6 95; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 96; GFX1064-NEXT: s_mov_b32 s8, s2 97; GFX1064-NEXT: s_mov_b32 s9, s3 98; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 99; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 100; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 101; GFX1064-NEXT: s_waitcnt vmcnt(0) 102; GFX1064-NEXT: buffer_gl0_inv 103; GFX1064-NEXT: buffer_gl1_inv 104; GFX1064-NEXT: .LBB0_2: 105; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 106; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 107; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 108; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 109; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 110; 
GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 111; GFX1064-NEXT: s_mov_b32 s2, -1 112; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 113; GFX1064-NEXT: s_endpgm 114; 115; GFX1032-LABEL: add_i32_constant: 116; GFX1032: ; %bb.0: ; %entry 117; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 118; GFX1032-NEXT: s_mov_b32 s5, exec_lo 119; GFX1032-NEXT: ; implicit-def: $vgpr1 120; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 121; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 122; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 123; GFX1032-NEXT: s_cbranch_execz .LBB0_2 124; GFX1032-NEXT: ; %bb.1: 125; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 126; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 127; GFX1032-NEXT: s_mul_i32 s5, s5, 5 128; GFX1032-NEXT: s_mov_b32 s10, -1 129; GFX1032-NEXT: v_mov_b32_e32 v1, s5 130; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 131; GFX1032-NEXT: s_mov_b32 s8, s2 132; GFX1032-NEXT: s_mov_b32 s9, s3 133; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 134; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 135; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 136; GFX1032-NEXT: s_waitcnt vmcnt(0) 137; GFX1032-NEXT: buffer_gl0_inv 138; GFX1032-NEXT: buffer_gl1_inv 139; GFX1032-NEXT: .LBB0_2: 140; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 141; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 142; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 143; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 144; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 145; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 146; GFX1032-NEXT: s_mov_b32 s2, -1 147; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 148; GFX1032-NEXT: s_endpgm 149entry: 150 %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel 151 store i32 %old, i32 addrspace(1)* %out 152 ret void 153} 154
; add_i32_uniform: atomically adds the kernel argument %additive (i32, acq_rel
; ordering) to the addrspace(1) location %inout and stores the returned old
; value to %out.
; The autogenerated checks expect one buffer_atomic_add guarded by
; s_and_saveexec. Its operand is the value fetched by the extra s_load_dword
; (presumably %additive) multiplied via s_mul_i32 by s_bcnt1 of the saved exec
; mask. Afterwards each lane multiplies that loaded value by its own v_mbcnt
; value (v_mul_lo_u32) and adds v_readfirstlane of the atomic's result.
; The tonga and gfx900 runs are checked under separate prefixes for this
; function; their expected final add differs (v_add_u32_e32 with and without
; an explicit vcc operand).
; The check lines are tool-generated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
155define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) { 156; GFX7LESS-LABEL: add_i32_uniform: 157; GFX7LESS: ; %bb.0: ; %entry 158; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 159; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x9 160; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd 161; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 162; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 163; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 164; GFX7LESS-NEXT: ; implicit-def: $vgpr1 165; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 166; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 167; GFX7LESS-NEXT: ; %bb.1: 168; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 169; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 170; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 171; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 172; GFX7LESS-NEXT: s_mov_b32 s14, -1 173; GFX7LESS-NEXT: s_mov_b32 s12, s6 174; GFX7LESS-NEXT: s_mov_b32 s13, s7 175; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 176; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 177; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 178; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 179; GFX7LESS-NEXT: buffer_wbinvl1 180; GFX7LESS-NEXT: .LBB1_2: 181; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 182; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 183; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 184; GFX7LESS-NEXT: s_mov_b32 s6, -1 185; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 186; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 187; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 188; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 189; GFX7LESS-NEXT: s_endpgm 190; 191; GFX8-LABEL: add_i32_uniform: 192; GFX8: ; %bb.0: ; %entry 193; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 194; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 195; GFX8-NEXT: s_mov_b64 s[2:3], exec 196; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 197; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 198; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 199; GFX8-NEXT: ; implicit-def: $vgpr1 200; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 201; GFX8-NEXT: s_cbranch_execz .LBB1_2 202; GFX8-NEXT: ; %bb.1: 203; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 204; GFX8-NEXT: s_waitcnt lgkmcnt(0) 205; GFX8-NEXT: s_mul_i32 s2, s8, s2 206; GFX8-NEXT: s_mov_b32 s15, 0xf000 207; GFX8-NEXT: s_mov_b32 s14, -1 
208; GFX8-NEXT: s_mov_b32 s12, s6 209; GFX8-NEXT: s_mov_b32 s13, s7 210; GFX8-NEXT: v_mov_b32_e32 v1, s2 211; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 212; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 213; GFX8-NEXT: s_waitcnt vmcnt(0) 214; GFX8-NEXT: buffer_wbinvl1_vol 215; GFX8-NEXT: .LBB1_2: 216; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 217; GFX8-NEXT: s_waitcnt lgkmcnt(0) 218; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 219; GFX8-NEXT: v_readfirstlane_b32 s0, v1 220; GFX8-NEXT: s_mov_b32 s7, 0xf000 221; GFX8-NEXT: s_mov_b32 s6, -1 222; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 223; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 224; GFX8-NEXT: s_endpgm 225; 226; GFX9-LABEL: add_i32_uniform: 227; GFX9: ; %bb.0: ; %entry 228; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 229; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 230; GFX9-NEXT: s_mov_b64 s[2:3], exec 231; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 232; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 233; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 234; GFX9-NEXT: ; implicit-def: $vgpr1 235; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 236; GFX9-NEXT: s_cbranch_execz .LBB1_2 237; GFX9-NEXT: ; %bb.1: 238; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 239; GFX9-NEXT: s_waitcnt lgkmcnt(0) 240; GFX9-NEXT: s_mul_i32 s2, s8, s2 241; GFX9-NEXT: s_mov_b32 s15, 0xf000 242; GFX9-NEXT: s_mov_b32 s14, -1 243; GFX9-NEXT: s_mov_b32 s12, s6 244; GFX9-NEXT: s_mov_b32 s13, s7 245; GFX9-NEXT: v_mov_b32_e32 v1, s2 246; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 247; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 248; GFX9-NEXT: s_waitcnt vmcnt(0) 249; GFX9-NEXT: buffer_wbinvl1_vol 250; GFX9-NEXT: .LBB1_2: 251; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 252; GFX9-NEXT: s_waitcnt lgkmcnt(0) 253; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 254; GFX9-NEXT: v_readfirstlane_b32 s0, v1 255; GFX9-NEXT: s_mov_b32 s7, 0xf000 256; GFX9-NEXT: s_mov_b32 s6, -1 257; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 258; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 259; GFX9-NEXT: 
s_endpgm 260; 261; GFX1064-LABEL: add_i32_uniform: 262; GFX1064: ; %bb.0: ; %entry 263; GFX1064-NEXT: s_clause 0x1 264; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 265; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 266; GFX1064-NEXT: s_mov_b64 s[2:3], exec 267; GFX1064-NEXT: ; implicit-def: $vgpr1 268; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 269; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 270; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 271; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 272; GFX1064-NEXT: s_cbranch_execz .LBB1_2 273; GFX1064-NEXT: ; %bb.1: 274; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 275; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 276; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 277; GFX1064-NEXT: s_mul_i32 s2, s8, s2 278; GFX1064-NEXT: s_mov_b32 s14, -1 279; GFX1064-NEXT: v_mov_b32_e32 v1, s2 280; GFX1064-NEXT: s_mov_b32 s12, s6 281; GFX1064-NEXT: s_mov_b32 s13, s7 282; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 283; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 284; GFX1064-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 285; GFX1064-NEXT: s_waitcnt vmcnt(0) 286; GFX1064-NEXT: buffer_gl0_inv 287; GFX1064-NEXT: buffer_gl1_inv 288; GFX1064-NEXT: .LBB1_2: 289; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 290; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 291; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 292; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 293; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 294; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 295; GFX1064-NEXT: s_mov_b32 s6, -1 296; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 297; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 298; GFX1064-NEXT: s_endpgm 299; 300; GFX1032-LABEL: add_i32_uniform: 301; GFX1032: ; %bb.0: ; %entry 302; GFX1032-NEXT: s_clause 0x1 303; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 304; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 305; GFX1032-NEXT: s_mov_b32 s3, exec_lo 306; GFX1032-NEXT: ; implicit-def: $vgpr1 307; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 308; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, 
v0 309; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 310; GFX1032-NEXT: s_cbranch_execz .LBB1_2 311; GFX1032-NEXT: ; %bb.1: 312; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 313; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 314; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 315; GFX1032-NEXT: s_mul_i32 s1, s2, s1 316; GFX1032-NEXT: s_mov_b32 s10, -1 317; GFX1032-NEXT: v_mov_b32_e32 v1, s1 318; GFX1032-NEXT: s_mov_b32 s8, s6 319; GFX1032-NEXT: s_mov_b32 s9, s7 320; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 321; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 322; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 323; GFX1032-NEXT: s_waitcnt vmcnt(0) 324; GFX1032-NEXT: buffer_gl0_inv 325; GFX1032-NEXT: buffer_gl1_inv 326; GFX1032-NEXT: .LBB1_2: 327; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 328; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 329; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 330; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 331; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 332; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 333; GFX1032-NEXT: s_mov_b32 s6, -1 334; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 335; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 336; GFX1032-NEXT: s_endpgm 337entry: 338 %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel 339 store i32 %old, i32 addrspace(1)* %out 340 ret void 341} 342
; add_i32_varying: atomically adds the result of llvm.amdgcn.workitem.id.x()
; (i32, acq_rel ordering) to the addrspace(1) location %inout and stores the
; returned old value to %out, so the addend differs per work-item.
; Expected code according to the autogenerated checks:
;  - default -march=amdgcn run: a plain buffer_atomic_add of v0 with no
;    s_and_saveexec guard and no cross-lane instructions.
;  - tonga, gfx900 and both gfx1010 runs: a chain of v_add*_dpp instructions
;    over the per-lane values, a v_readlane of the chain result that feeds one
;    buffer_atomic_add guarded by s_and_saveexec, and a final add of
;    v_readfirstlane(atomic result) to the lane-shifted chain value before the
;    store.
; The check lines are tool-generated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
343define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 344; GFX7LESS-LABEL: add_i32_varying: 345; GFX7LESS: ; %bb.0: ; %entry 346; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 347; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 348; GFX7LESS-NEXT: s_mov_b32 s6, -1 349; GFX7LESS-NEXT: s_mov_b32 s10, s6 350; GFX7LESS-NEXT: s_mov_b32 s11, s7 351; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 352; GFX7LESS-NEXT: s_mov_b32 s8, s2 353; GFX7LESS-NEXT: s_mov_b32 s9, s3 354; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 355; GFX7LESS-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 356; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 357; GFX7LESS-NEXT: buffer_wbinvl1 358; 
GFX7LESS-NEXT: s_mov_b32 s4, s0 359; GFX7LESS-NEXT: s_mov_b32 s5, s1 360; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 361; GFX7LESS-NEXT: s_endpgm 362; 363; GFX8-LABEL: add_i32_varying: 364; GFX8: ; %bb.0: ; %entry 365; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 366; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 367; GFX8-NEXT: v_mov_b32_e32 v1, 0 368; GFX8-NEXT: s_mov_b64 exec, s[4:5] 369; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 370; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 371; GFX8-NEXT: v_mov_b32_e32 v2, v0 372; GFX8-NEXT: s_not_b64 exec, exec 373; GFX8-NEXT: v_mov_b32_e32 v2, 0 374; GFX8-NEXT: s_not_b64 exec, exec 375; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 376; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 377; GFX8-NEXT: s_nop 1 378; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 379; GFX8-NEXT: s_nop 1 380; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 381; GFX8-NEXT: s_nop 1 382; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 383; GFX8-NEXT: s_nop 1 384; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 385; GFX8-NEXT: s_nop 1 386; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 387; GFX8-NEXT: v_readlane_b32 s6, v2, 63 388; GFX8-NEXT: s_nop 0 389; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 390; GFX8-NEXT: s_mov_b64 exec, s[4:5] 391; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 392; GFX8-NEXT: ; implicit-def: $vgpr0 393; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 394; GFX8-NEXT: s_cbranch_execz .LBB2_2 395; GFX8-NEXT: ; %bb.1: 396; GFX8-NEXT: s_mov_b32 s11, 0xf000 397; GFX8-NEXT: s_mov_b32 s10, -1 398; GFX8-NEXT: s_waitcnt lgkmcnt(0) 399; GFX8-NEXT: s_mov_b32 s8, s2 400; GFX8-NEXT: s_mov_b32 s9, s3 401; GFX8-NEXT: v_mov_b32_e32 v0, s6 402; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
403; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 404; GFX8-NEXT: s_waitcnt vmcnt(0) 405; GFX8-NEXT: buffer_wbinvl1_vol 406; GFX8-NEXT: .LBB2_2: 407; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 408; GFX8-NEXT: v_readfirstlane_b32 s4, v0 409; GFX8-NEXT: v_mov_b32_e32 v0, v1 410; GFX8-NEXT: s_waitcnt lgkmcnt(0) 411; GFX8-NEXT: s_mov_b32 s3, 0xf000 412; GFX8-NEXT: s_mov_b32 s2, -1 413; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 414; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 415; GFX8-NEXT: s_endpgm 416; 417; GFX9-LABEL: add_i32_varying: 418; GFX9: ; %bb.0: ; %entry 419; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 420; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 421; GFX9-NEXT: v_mov_b32_e32 v1, 0 422; GFX9-NEXT: s_mov_b64 exec, s[4:5] 423; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 424; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 425; GFX9-NEXT: v_mov_b32_e32 v2, v0 426; GFX9-NEXT: s_not_b64 exec, exec 427; GFX9-NEXT: v_mov_b32_e32 v2, 0 428; GFX9-NEXT: s_not_b64 exec, exec 429; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 430; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 431; GFX9-NEXT: s_nop 1 432; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 433; GFX9-NEXT: s_nop 1 434; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 435; GFX9-NEXT: s_nop 1 436; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 437; GFX9-NEXT: s_nop 1 438; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 439; GFX9-NEXT: s_nop 1 440; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 441; GFX9-NEXT: v_readlane_b32 s6, v2, 63 442; GFX9-NEXT: s_nop 0 443; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 444; GFX9-NEXT: s_mov_b64 exec, s[4:5] 445; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 446; GFX9-NEXT: ; implicit-def: $vgpr0 447; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 
448; GFX9-NEXT: s_cbranch_execz .LBB2_2 449; GFX9-NEXT: ; %bb.1: 450; GFX9-NEXT: s_mov_b32 s11, 0xf000 451; GFX9-NEXT: s_mov_b32 s10, -1 452; GFX9-NEXT: s_waitcnt lgkmcnt(0) 453; GFX9-NEXT: s_mov_b32 s8, s2 454; GFX9-NEXT: s_mov_b32 s9, s3 455; GFX9-NEXT: v_mov_b32_e32 v0, s6 456; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 457; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 458; GFX9-NEXT: s_waitcnt vmcnt(0) 459; GFX9-NEXT: buffer_wbinvl1_vol 460; GFX9-NEXT: .LBB2_2: 461; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 462; GFX9-NEXT: v_readfirstlane_b32 s4, v0 463; GFX9-NEXT: v_mov_b32_e32 v0, v1 464; GFX9-NEXT: s_waitcnt lgkmcnt(0) 465; GFX9-NEXT: s_mov_b32 s3, 0xf000 466; GFX9-NEXT: s_mov_b32 s2, -1 467; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 468; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 469; GFX9-NEXT: s_endpgm 470; 471; GFX1064-LABEL: add_i32_varying: 472; GFX1064: ; %bb.0: ; %entry 473; GFX1064-NEXT: v_mov_b32_e32 v1, v0 474; GFX1064-NEXT: s_not_b64 exec, exec 475; GFX1064-NEXT: v_mov_b32_e32 v1, 0 476; GFX1064-NEXT: s_not_b64 exec, exec 477; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 478; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 479; GFX1064-NEXT: v_mov_b32_e32 v3, 0 480; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 481; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 482; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 483; GFX1064-NEXT: v_mov_b32_e32 v2, v1 484; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 485; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 486; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 487; GFX1064-NEXT: v_mov_b32_e32 v2, s4 488; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 489; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 490; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 
row_mask:0xf bank_mask:0xf 491; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 492; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 493; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 494; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 495; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 496; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 497; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 498; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 499; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 500; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 501; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 502; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 503; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 504; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 505; GFX1064-NEXT: s_mov_b32 s4, s9 506; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 507; GFX1064-NEXT: s_mov_b64 exec, s[6:7] 508; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 509; GFX1064-NEXT: s_mov_b32 s6, -1 510; GFX1064-NEXT: ; implicit-def: $vgpr0 511; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc 512; GFX1064-NEXT: s_cbranch_execz .LBB2_2 513; GFX1064-NEXT: ; %bb.1: 514; GFX1064-NEXT: v_mov_b32_e32 v0, s4 515; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 516; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 517; GFX1064-NEXT: s_mov_b32 s4, s2 518; GFX1064-NEXT: s_mov_b32 s5, s3 519; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 520; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 521; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 522; GFX1064-NEXT: s_waitcnt vmcnt(0) 523; GFX1064-NEXT: buffer_gl0_inv 524; GFX1064-NEXT: buffer_gl1_inv 525; GFX1064-NEXT: .LBB2_2: 526; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 527; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] 528; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 529; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 530; GFX1064-NEXT: v_mov_b32_e32 v0, v3 531; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 532; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v0 533; GFX1064-NEXT: s_mov_b32 s2, s6 534; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 535; GFX1064-NEXT: s_endpgm 536; 537; GFX1032-LABEL: 
add_i32_varying: 538; GFX1032: ; %bb.0: ; %entry 539; GFX1032-NEXT: v_mov_b32_e32 v1, v0 540; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 541; GFX1032-NEXT: v_mov_b32_e32 v1, 0 542; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 543; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 544; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 545; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 546; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 547; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 548; GFX1032-NEXT: v_mov_b32_e32 v2, v1 549; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 550; GFX1032-NEXT: s_mov_b32 exec_lo, s2 551; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 552; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 553; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 554; GFX1032-NEXT: v_mov_b32_e32 v3, 0 555; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 556; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 557; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 558; GFX1032-NEXT: s_mov_b32 exec_lo, s4 559; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 560; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 561; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 562; GFX1032-NEXT: s_mov_b32 exec_lo, s4 563; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 564; GFX1032-NEXT: s_mov_b32 s4, s6 565; GFX1032-NEXT: s_mov_b32 s6, -1 566; GFX1032-NEXT: ; implicit-def: $vgpr0 567; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo 568; GFX1032-NEXT: s_cbranch_execz .LBB2_2 569; GFX1032-NEXT: ; %bb.1: 570; GFX1032-NEXT: v_mov_b32_e32 v0, s4 571; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 572; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 573; GFX1032-NEXT: s_mov_b32 s4, s2 574; GFX1032-NEXT: s_mov_b32 s5, s3 575; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 576; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 577; 
GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 578; GFX1032-NEXT: s_waitcnt vmcnt(0) 579; GFX1032-NEXT: buffer_gl0_inv 580; GFX1032-NEXT: buffer_gl1_inv 581; GFX1032-NEXT: .LBB2_2: 582; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 583; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 584; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 585; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 586; GFX1032-NEXT: v_mov_b32_e32 v0, v3 587; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 588; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v0 589; GFX1032-NEXT: s_mov_b32 s2, s6 590; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 591; GFX1032-NEXT: s_endpgm 592entry: 593 %lane = call i32 @llvm.amdgcn.workitem.id.x() 594 %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel 595 store i32 %old, i32 addrspace(1)* %out 596 ret void 597} 598
; add_i64_constant: atomically adds the constant 5 (i64, acq_rel ordering) to
; the addrspace(1) location %inout and stores the returned old value to %out.
; The autogenerated checks expect one buffer_atomic_add_x2 guarded by
; s_and_saveexec. Its low dword is s_bcnt1(saved exec mask) * 5 and its high
; dword is 0. Each lane then rebuilds its 64-bit value from the two
; v_readfirstlane results plus 5 * its v_mbcnt value: the default
; -march=amdgcn run uses v_mul_u32_u24 / v_mul_hi_u32_u24 followed by
; v_add_i32 / v_addc_u32, and the other runs use a single v_mad_u64_u32.
; The check lines are tool-generated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
599define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 600; GFX7LESS-LABEL: add_i64_constant: 601; GFX7LESS: ; %bb.0: ; %entry 602; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 603; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 604; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 605; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 606; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 607; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 608; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 609; GFX7LESS-NEXT: s_cbranch_execz .LBB3_2 610; GFX7LESS-NEXT: ; %bb.1: 611; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 612; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 613; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 614; GFX7LESS-NEXT: s_mov_b32 s10, -1 615; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 616; GFX7LESS-NEXT: s_mov_b32 s8, s2 617; GFX7LESS-NEXT: s_mov_b32 s9, s3 618; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 619; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 620; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 621; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 622; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 623; GFX7LESS-NEXT: buffer_wbinvl1 624; GFX7LESS-NEXT: .LBB3_2: 625; 
GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 626; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 627; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 628; GFX7LESS-NEXT: s_mov_b32 s2, -1 629; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 630; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 631; GFX7LESS-NEXT: s_waitcnt expcnt(0) 632; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 633; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 634; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 635; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 636; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 637; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 638; GFX7LESS-NEXT: s_endpgm 639; 640; GFX89-LABEL: add_i64_constant: 641; GFX89: ; %bb.0: ; %entry 642; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 643; GFX89-NEXT: s_mov_b64 s[6:7], exec 644; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 645; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 646; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 647; GFX89-NEXT: ; implicit-def: $vgpr0_vgpr1 648; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc 649; GFX89-NEXT: s_cbranch_execz .LBB3_2 650; GFX89-NEXT: ; %bb.1: 651; GFX89-NEXT: s_waitcnt lgkmcnt(0) 652; GFX89-NEXT: s_mov_b32 s8, s2 653; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 654; GFX89-NEXT: s_mul_i32 s2, s2, 5 655; GFX89-NEXT: s_mov_b32 s11, 0xf000 656; GFX89-NEXT: s_mov_b32 s10, -1 657; GFX89-NEXT: s_mov_b32 s9, s3 658; GFX89-NEXT: v_mov_b32_e32 v0, s2 659; GFX89-NEXT: v_mov_b32_e32 v1, 0 660; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 661; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 662; GFX89-NEXT: s_waitcnt vmcnt(0) 663; GFX89-NEXT: buffer_wbinvl1_vol 664; GFX89-NEXT: .LBB3_2: 665; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] 666; GFX89-NEXT: s_waitcnt lgkmcnt(0) 667; GFX89-NEXT: v_readfirstlane_b32 s2, v0 668; GFX89-NEXT: v_readfirstlane_b32 s3, v1 669; GFX89-NEXT: v_mov_b32_e32 v0, s2 670; GFX89-NEXT: v_mov_b32_e32 v1, s3 671; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 672; GFX89-NEXT: s_mov_b32 s3, 0xf000 673; 
GFX89-NEXT: s_mov_b32 s2, -1 674; GFX89-NEXT: s_nop 2 675; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 676; GFX89-NEXT: s_endpgm 677; 678; GFX1064-LABEL: add_i64_constant: 679; GFX1064: ; %bb.0: ; %entry 680; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 681; GFX1064-NEXT: s_mov_b64 s[6:7], exec 682; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 683; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 684; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 685; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 686; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 687; GFX1064-NEXT: s_cbranch_execz .LBB3_2 688; GFX1064-NEXT: ; %bb.1: 689; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 690; GFX1064-NEXT: v_mov_b32_e32 v1, 0 691; GFX1064-NEXT: s_mul_i32 s6, s6, 5 692; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 693; GFX1064-NEXT: v_mov_b32_e32 v0, s6 694; GFX1064-NEXT: s_mov_b32 s10, -1 695; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 696; GFX1064-NEXT: s_mov_b32 s8, s2 697; GFX1064-NEXT: s_mov_b32 s9, s3 698; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 699; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 700; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 701; GFX1064-NEXT: s_waitcnt vmcnt(0) 702; GFX1064-NEXT: buffer_gl0_inv 703; GFX1064-NEXT: buffer_gl1_inv 704; GFX1064-NEXT: .LBB3_2: 705; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 706; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 707; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 708; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 709; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 710; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] 711; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 712; GFX1064-NEXT: s_mov_b32 s2, -1 713; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 714; GFX1064-NEXT: s_endpgm 715; 716; GFX1032-LABEL: add_i64_constant: 717; GFX1032: ; %bb.0: ; %entry 718; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 719; GFX1032-NEXT: s_mov_b32 s5, exec_lo 720; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 721; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, 
s5, 0 722; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 723; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 724; GFX1032-NEXT: s_cbranch_execz .LBB3_2 725; GFX1032-NEXT: ; %bb.1: 726; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 727; GFX1032-NEXT: v_mov_b32_e32 v1, 0 728; GFX1032-NEXT: s_mul_i32 s5, s5, 5 729; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 730; GFX1032-NEXT: v_mov_b32_e32 v0, s5 731; GFX1032-NEXT: s_mov_b32 s10, -1 732; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 733; GFX1032-NEXT: s_mov_b32 s8, s2 734; GFX1032-NEXT: s_mov_b32 s9, s3 735; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 736; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 737; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 738; GFX1032-NEXT: s_waitcnt vmcnt(0) 739; GFX1032-NEXT: buffer_gl0_inv 740; GFX1032-NEXT: buffer_gl1_inv 741; GFX1032-NEXT: .LBB3_2: 742; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 743; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 744; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 745; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 746; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 747; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] 748; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 749; GFX1032-NEXT: s_mov_b32 s2, -1 750; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 751; GFX1032-NEXT: s_endpgm 752entry: 753 %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel 754 store i64 %old, i64 addrspace(1)* %out 755 ret void 756} 757 758define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) { 759; GFX7LESS-LABEL: add_i64_uniform: 760; GFX7LESS: ; %bb.0: ; %entry 761; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec 762; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 763; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 764; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 765; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 766; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 767; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 768; GFX7LESS-NEXT: s_and_saveexec_b64 
s[2:3], vcc 769; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 770; GFX7LESS-NEXT: ; %bb.1: 771; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 772; GFX7LESS-NEXT: s_mov_b32 s14, -1 773; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 774; GFX7LESS-NEXT: s_mov_b32 s12, s6 775; GFX7LESS-NEXT: s_mov_b32 s13, s7 776; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 777; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 778; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 779; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 780; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 781; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 782; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 783; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 784; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 785; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 786; GFX7LESS-NEXT: buffer_wbinvl1 787; GFX7LESS-NEXT: .LBB4_2: 788; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 789; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 790; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 791; GFX7LESS-NEXT: s_mov_b32 s6, -1 792; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 793; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 794; GFX7LESS-NEXT: s_waitcnt expcnt(0) 795; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 796; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 797; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 798; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 799; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 800; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v2 801; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 802; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 803; GFX7LESS-NEXT: s_endpgm 804; 805; GFX8-LABEL: add_i64_uniform: 806; GFX8: ; %bb.0: ; %entry 807; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 808; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 809; GFX8-NEXT: s_mov_b64 s[8:9], exec 810; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 811; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 812; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 813; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 814; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 815; GFX8-NEXT: 
s_cbranch_execz .LBB4_2 816; GFX8-NEXT: ; %bb.1: 817; GFX8-NEXT: s_waitcnt lgkmcnt(0) 818; GFX8-NEXT: s_mov_b32 s12, s6 819; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 820; GFX8-NEXT: v_mov_b32_e32 v0, s6 821; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0 822; GFX8-NEXT: s_mul_i32 s6, s1, s6 823; GFX8-NEXT: s_mov_b32 s15, 0xf000 824; GFX8-NEXT: s_mov_b32 s14, -1 825; GFX8-NEXT: s_mov_b32 s13, s7 826; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 827; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 828; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 829; GFX8-NEXT: s_waitcnt vmcnt(0) 830; GFX8-NEXT: buffer_wbinvl1_vol 831; GFX8-NEXT: .LBB4_2: 832; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 833; GFX8-NEXT: v_readfirstlane_b32 s2, v0 834; GFX8-NEXT: v_readfirstlane_b32 s3, v1 835; GFX8-NEXT: v_mov_b32_e32 v0, s2 836; GFX8-NEXT: v_mov_b32_e32 v1, s3 837; GFX8-NEXT: s_waitcnt lgkmcnt(0) 838; GFX8-NEXT: v_mul_lo_u32 v3, s1, v2 839; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1] 840; GFX8-NEXT: s_mov_b32 s7, 0xf000 841; GFX8-NEXT: s_mov_b32 s6, -1 842; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 843; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 844; GFX8-NEXT: s_endpgm 845; 846; GFX9-LABEL: add_i64_uniform: 847; GFX9: ; %bb.0: ; %entry 848; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 849; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 850; GFX9-NEXT: s_mov_b64 s[8:9], exec 851; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 852; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 853; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 854; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 855; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 856; GFX9-NEXT: s_cbranch_execz .LBB4_2 857; GFX9-NEXT: ; %bb.1: 858; GFX9-NEXT: s_waitcnt lgkmcnt(0) 859; GFX9-NEXT: s_mov_b32 s12, s6 860; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 861; GFX9-NEXT: s_mov_b32 s13, s7 862; GFX9-NEXT: s_mul_i32 s7, s3, s6 863; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 864; GFX9-NEXT: s_add_i32 s8, s8, s7 865; GFX9-NEXT: s_mul_i32 s6, s2, s6 
866; GFX9-NEXT: s_mov_b32 s15, 0xf000 867; GFX9-NEXT: s_mov_b32 s14, -1 868; GFX9-NEXT: v_mov_b32_e32 v0, s6 869; GFX9-NEXT: v_mov_b32_e32 v1, s8 870; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 871; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 872; GFX9-NEXT: s_waitcnt vmcnt(0) 873; GFX9-NEXT: buffer_wbinvl1_vol 874; GFX9-NEXT: .LBB4_2: 875; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 876; GFX9-NEXT: v_readfirstlane_b32 s0, v0 877; GFX9-NEXT: v_readfirstlane_b32 s1, v1 878; GFX9-NEXT: v_mov_b32_e32 v0, s0 879; GFX9-NEXT: v_mov_b32_e32 v1, s1 880; GFX9-NEXT: s_waitcnt lgkmcnt(0) 881; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 882; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1] 883; GFX9-NEXT: s_mov_b32 s7, 0xf000 884; GFX9-NEXT: s_mov_b32 s6, -1 885; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 886; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 887; GFX9-NEXT: s_endpgm 888; 889; GFX1064-LABEL: add_i64_uniform: 890; GFX1064: ; %bb.0: ; %entry 891; GFX1064-NEXT: s_clause 0x1 892; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 893; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 894; GFX1064-NEXT: s_mov_b64 s[8:9], exec 895; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 896; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 897; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 898; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 899; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 900; GFX1064-NEXT: s_cbranch_execz .LBB4_2 901; GFX1064-NEXT: ; %bb.1: 902; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] 903; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 904; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 905; GFX1064-NEXT: s_mul_i32 s9, s3, s8 906; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 907; GFX1064-NEXT: s_mul_i32 s8, s2, s8 908; GFX1064-NEXT: s_add_i32 s10, s10, s9 909; GFX1064-NEXT: v_mov_b32_e32 v0, s8 910; GFX1064-NEXT: v_mov_b32_e32 v1, s10 911; GFX1064-NEXT: s_mov_b32 s10, -1 912; GFX1064-NEXT: s_mov_b32 s8, s6 913; GFX1064-NEXT: s_mov_b32 s9, s7 914; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
915; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 916; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 917; GFX1064-NEXT: s_waitcnt vmcnt(0) 918; GFX1064-NEXT: buffer_gl0_inv 919; GFX1064-NEXT: buffer_gl1_inv 920; GFX1064-NEXT: .LBB4_2: 921; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 922; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 923; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 924; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 925; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 926; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2 927; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 928; GFX1064-NEXT: s_mov_b32 s6, -1 929; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, s[0:1] 930; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v1 931; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 932; GFX1064-NEXT: s_endpgm 933; 934; GFX1032-LABEL: add_i64_uniform: 935; GFX1032: ; %bb.0: ; %entry 936; GFX1032-NEXT: s_clause 0x1 937; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 938; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 939; GFX1032-NEXT: s_mov_b32 s8, exec_lo 940; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 941; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 942; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 943; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 944; GFX1032-NEXT: s_cbranch_execz .LBB4_2 945; GFX1032-NEXT: ; %bb.1: 946; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 947; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 948; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 949; GFX1032-NEXT: s_mul_i32 s8, s3, s1 950; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 951; GFX1032-NEXT: s_mul_i32 s1, s2, s1 952; GFX1032-NEXT: s_add_i32 s9, s9, s8 953; GFX1032-NEXT: v_mov_b32_e32 v0, s1 954; GFX1032-NEXT: v_mov_b32_e32 v1, s9 955; GFX1032-NEXT: s_mov_b32 s10, -1 956; GFX1032-NEXT: s_mov_b32 s8, s6 957; GFX1032-NEXT: s_mov_b32 s9, s7 958; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 959; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 960; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 961; GFX1032-NEXT: s_waitcnt vmcnt(0) 
962; GFX1032-NEXT: buffer_gl0_inv 963; GFX1032-NEXT: buffer_gl1_inv 964; GFX1032-NEXT: .LBB4_2: 965; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 966; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 967; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 968; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 969; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 970; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2 971; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 972; GFX1032-NEXT: s_mov_b32 s6, -1 973; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v2, s[0:1] 974; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v1 975; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 976; GFX1032-NEXT: s_endpgm 977entry: 978 %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel 979 store i64 %old, i64 addrspace(1)* %out 980 ret void 981} 982; Varying 64-bit add: the operand is the zero-extended workitem id (see the body), so it is a per-lane VGPR value. Expected code for every prefix is a plain buffer_atomic_add_x2 on v[0:1] followed directly by the store of the returned pair; there is no mbcnt / s_and_saveexec single-lane sequence, i.e. this case is emitted without the wave-level reduction seen in the constant and uniform tests. 983define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 984; GFX7LESS-LABEL: add_i64_varying: 985; GFX7LESS: ; %bb.0: ; %entry 986; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 987; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 988; GFX7LESS-NEXT: s_mov_b32 s6, -1 989; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 990; GFX7LESS-NEXT: s_mov_b32 s10, s6 991; GFX7LESS-NEXT: s_mov_b32 s11, s7 992; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 993; GFX7LESS-NEXT: s_mov_b32 s8, s2 994; GFX7LESS-NEXT: s_mov_b32 s9, s3 995; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 996; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 997; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 998; GFX7LESS-NEXT: buffer_wbinvl1 999; GFX7LESS-NEXT: s_mov_b32 s4, s0 1000; GFX7LESS-NEXT: s_mov_b32 s5, s1 1001; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1002; GFX7LESS-NEXT: s_endpgm 1003; 1004; GFX89-LABEL: add_i64_varying: 1005; GFX89: ; %bb.0: ; %entry 1006; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1007; GFX89-NEXT: s_mov_b32 s7, 0xf000 1008; GFX89-NEXT: s_mov_b32 s6, -1 1009; GFX89-NEXT: s_mov_b32 s10, s6 1010; GFX89-NEXT: s_mov_b32 s11, s7 1011; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1012; GFX89-NEXT: s_mov_b32 s8, s2 1013; GFX89-NEXT: s_mov_b32 s9, s3 1014; GFX89-NEXT: v_mov_b32_e32 v1, 0 1015; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1016; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1017; GFX89-NEXT: s_waitcnt vmcnt(0) 1018; GFX89-NEXT: buffer_wbinvl1_vol 1019; GFX89-NEXT: s_mov_b32 s4, s0 1020; GFX89-NEXT: s_mov_b32 s5, s1 1021; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1022; GFX89-NEXT: s_endpgm 1023; 1024; GFX10-LABEL: add_i64_varying: 1025; GFX10: ; %bb.0: ; %entry 1026; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1027; GFX10-NEXT: v_mov_b32_e32 v1, 0 1028; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1029; GFX10-NEXT: s_mov_b32 s6, -1 1030; GFX10-NEXT: s_mov_b32 s11, s7 1031; GFX10-NEXT: s_mov_b32 s10, s6 1032; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1033; GFX10-NEXT: s_mov_b32 s8, s2 1034; GFX10-NEXT: s_mov_b32 s9, s3 1035; GFX10-NEXT: s_mov_b32 s4, s0 1036; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1037; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1038; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1039; GFX10-NEXT: s_waitcnt vmcnt(0) 1040; GFX10-NEXT: buffer_gl0_inv 1041; GFX10-NEXT: buffer_gl1_inv 1042; GFX10-NEXT: s_mov_b32 s5, s1 1043; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1044; GFX10-NEXT: s_endpgm 1045entry: 1046 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1047 %zext = zext i32 %lane to i64 1048 %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel 1049 store i64 %old, i64 addrspace(1)* %out 1050 ret void 1051} 1052; Constant 32-bit sub of 5. Expected code for every prefix: the lane whose mbcnt index is 0 issues one buffer_atomic_sub of 5 times the active-lane count (s_bcnt1 of the saved exec mask, then s_mul_i32 by 5); after the join each lane stores readfirstlane(returned value) minus 5 * its mbcnt index (v_mul_u32_u24 then a subtract). 1053define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 1054; GFX7LESS-LABEL: sub_i32_constant: 1055; GFX7LESS: ; %bb.0: ; %entry 1056; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1057; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1058; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1059; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1060; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1061; GFX7LESS-NEXT: ;
implicit-def: $vgpr1 1062; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1063; GFX7LESS-NEXT: s_cbranch_execz .LBB6_2 1064; GFX7LESS-NEXT: ; %bb.1: 1065; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 1066; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1067; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 1068; GFX7LESS-NEXT: s_mov_b32 s10, -1 1069; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1070; GFX7LESS-NEXT: s_mov_b32 s8, s2 1071; GFX7LESS-NEXT: s_mov_b32 s9, s3 1072; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1073; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1074; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1075; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1076; GFX7LESS-NEXT: buffer_wbinvl1 1077; GFX7LESS-NEXT: .LBB6_2: 1078; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1079; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1080; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1081; GFX7LESS-NEXT: s_mov_b32 s2, -1 1082; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1083; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1084; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1085; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1086; GFX7LESS-NEXT: s_endpgm 1087; 1088; GFX8-LABEL: sub_i32_constant: 1089; GFX8: ; %bb.0: ; %entry 1090; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1091; GFX8-NEXT: s_mov_b64 s[6:7], exec 1092; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1093; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1094; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1095; GFX8-NEXT: ; implicit-def: $vgpr1 1096; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1097; GFX8-NEXT: s_cbranch_execz .LBB6_2 1098; GFX8-NEXT: ; %bb.1: 1099; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1100; GFX8-NEXT: s_mov_b32 s8, s2 1101; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1102; GFX8-NEXT: s_mul_i32 s2, s2, 5 1103; GFX8-NEXT: s_mov_b32 s11, 0xf000 1104; GFX8-NEXT: s_mov_b32 s10, -1 1105; GFX8-NEXT: s_mov_b32 s9, s3 1106; GFX8-NEXT: v_mov_b32_e32 v1, s2 1107; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1108; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1109; GFX8-NEXT: s_waitcnt 
vmcnt(0) 1110; GFX8-NEXT: buffer_wbinvl1_vol 1111; GFX8-NEXT: .LBB6_2: 1112; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1113; GFX8-NEXT: v_readfirstlane_b32 s4, v1 1114; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1115; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1116; GFX8-NEXT: s_mov_b32 s3, 0xf000 1117; GFX8-NEXT: s_mov_b32 s2, -1 1118; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 1119; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1120; GFX8-NEXT: s_endpgm 1121; 1122; GFX9-LABEL: sub_i32_constant: 1123; GFX9: ; %bb.0: ; %entry 1124; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1125; GFX9-NEXT: s_mov_b64 s[6:7], exec 1126; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1127; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1128; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1129; GFX9-NEXT: ; implicit-def: $vgpr1 1130; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1131; GFX9-NEXT: s_cbranch_execz .LBB6_2 1132; GFX9-NEXT: ; %bb.1: 1133; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1134; GFX9-NEXT: s_mov_b32 s8, s2 1135; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1136; GFX9-NEXT: s_mul_i32 s2, s2, 5 1137; GFX9-NEXT: s_mov_b32 s11, 0xf000 1138; GFX9-NEXT: s_mov_b32 s10, -1 1139; GFX9-NEXT: s_mov_b32 s9, s3 1140; GFX9-NEXT: v_mov_b32_e32 v1, s2 1141; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1142; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1143; GFX9-NEXT: s_waitcnt vmcnt(0) 1144; GFX9-NEXT: buffer_wbinvl1_vol 1145; GFX9-NEXT: .LBB6_2: 1146; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1147; GFX9-NEXT: v_readfirstlane_b32 s4, v1 1148; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1149; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1150; GFX9-NEXT: s_mov_b32 s3, 0xf000 1151; GFX9-NEXT: s_mov_b32 s2, -1 1152; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1153; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1154; GFX9-NEXT: s_endpgm 1155; 1156; GFX1064-LABEL: sub_i32_constant: 1157; GFX1064: ; %bb.0: ; %entry 1158; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1159; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1160; GFX1064-NEXT: ; implicit-def: $vgpr1 1161; 
GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1162; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1163; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1164; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1165; GFX1064-NEXT: s_cbranch_execz .LBB6_2 1166; GFX1064-NEXT: ; %bb.1: 1167; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1168; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 1169; GFX1064-NEXT: s_mul_i32 s6, s6, 5 1170; GFX1064-NEXT: s_mov_b32 s10, -1 1171; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1172; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1173; GFX1064-NEXT: s_mov_b32 s8, s2 1174; GFX1064-NEXT: s_mov_b32 s9, s3 1175; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1176; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1177; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1178; GFX1064-NEXT: s_waitcnt vmcnt(0) 1179; GFX1064-NEXT: buffer_gl0_inv 1180; GFX1064-NEXT: buffer_gl1_inv 1181; GFX1064-NEXT: .LBB6_2: 1182; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1183; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1184; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1185; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1186; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1187; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1188; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1189; GFX1064-NEXT: s_mov_b32 s2, -1 1190; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1191; GFX1064-NEXT: s_endpgm 1192; 1193; GFX1032-LABEL: sub_i32_constant: 1194; GFX1032: ; %bb.0: ; %entry 1195; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1196; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1197; GFX1032-NEXT: ; implicit-def: $vgpr1 1198; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 1199; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1200; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1201; GFX1032-NEXT: s_cbranch_execz .LBB6_2 1202; GFX1032-NEXT: ; %bb.1: 1203; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1204; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 1205; GFX1032-NEXT: s_mul_i32 s5, s5, 5 1206; GFX1032-NEXT: s_mov_b32 s10, -1 1207; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1208; 
GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1209; GFX1032-NEXT: s_mov_b32 s8, s2 1210; GFX1032-NEXT: s_mov_b32 s9, s3 1211; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1212; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1213; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1214; GFX1032-NEXT: s_waitcnt vmcnt(0) 1215; GFX1032-NEXT: buffer_gl0_inv 1216; GFX1032-NEXT: buffer_gl1_inv 1217; GFX1032-NEXT: .LBB6_2: 1218; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1219; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1220; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1221; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1222; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1223; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1224; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1225; GFX1032-NEXT: s_mov_b32 s2, -1 1226; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1227; GFX1032-NEXT: s_endpgm 1228entry: 1229 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel 1230 store i32 %old, i32 addrspace(1)* %out 1231 ret void 1232} 1233; Uniform 32-bit sub: %subitive is a kernel argument, fetched with a scalar load (s_load_dword). Expected code for every prefix: the lane whose mbcnt index is 0 issues one buffer_atomic_sub of %subitive times the active-lane count (s_bcnt1 then s_mul_i32); after the join each lane stores readfirstlane(returned value) minus %subitive * its mbcnt index (v_mul_lo_u32 then a subtract). 1234define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) { 1235; GFX7LESS-LABEL: sub_i32_uniform: 1236; GFX7LESS: ; %bb.0: ; %entry 1237; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1238; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1239; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd 1240; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1241; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1242; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1243; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1244; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1245; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 1246; GFX7LESS-NEXT: ; %bb.1: 1247; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 1248; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1249; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1250; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 1251; GFX7LESS-NEXT: s_mov_b32 s14, -1 1252; GFX7LESS-NEXT: s_mov_b32 s12, s6 1253; GFX7LESS-NEXT: s_mov_b32 s13, s7 1254; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
1255; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1256; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1257; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1258; GFX7LESS-NEXT: buffer_wbinvl1 1259; GFX7LESS-NEXT: .LBB7_2: 1260; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1261; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1262; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1263; GFX7LESS-NEXT: s_mov_b32 s6, -1 1264; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1265; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 1266; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1267; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1268; GFX7LESS-NEXT: s_endpgm 1269; 1270; GFX8-LABEL: sub_i32_uniform: 1271; GFX8: ; %bb.0: ; %entry 1272; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1273; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 1274; GFX8-NEXT: s_mov_b64 s[2:3], exec 1275; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1276; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1277; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1278; GFX8-NEXT: ; implicit-def: $vgpr1 1279; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1280; GFX8-NEXT: s_cbranch_execz .LBB7_2 1281; GFX8-NEXT: ; %bb.1: 1282; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1283; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1284; GFX8-NEXT: s_mul_i32 s2, s8, s2 1285; GFX8-NEXT: s_mov_b32 s15, 0xf000 1286; GFX8-NEXT: s_mov_b32 s14, -1 1287; GFX8-NEXT: s_mov_b32 s12, s6 1288; GFX8-NEXT: s_mov_b32 s13, s7 1289; GFX8-NEXT: v_mov_b32_e32 v1, s2 1290; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1291; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1292; GFX8-NEXT: s_waitcnt vmcnt(0) 1293; GFX8-NEXT: buffer_wbinvl1_vol 1294; GFX8-NEXT: .LBB7_2: 1295; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1296; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1297; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 1298; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1299; GFX8-NEXT: s_mov_b32 s7, 0xf000 1300; GFX8-NEXT: s_mov_b32 s6, -1 1301; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1302; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1303; GFX8-NEXT: 
s_endpgm 1304; 1305; GFX9-LABEL: sub_i32_uniform: 1306; GFX9: ; %bb.0: ; %entry 1307; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1308; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 1309; GFX9-NEXT: s_mov_b64 s[2:3], exec 1310; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1311; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1312; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1313; GFX9-NEXT: ; implicit-def: $vgpr1 1314; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1315; GFX9-NEXT: s_cbranch_execz .LBB7_2 1316; GFX9-NEXT: ; %bb.1: 1317; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1318; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1319; GFX9-NEXT: s_mul_i32 s2, s8, s2 1320; GFX9-NEXT: s_mov_b32 s15, 0xf000 1321; GFX9-NEXT: s_mov_b32 s14, -1 1322; GFX9-NEXT: s_mov_b32 s12, s6 1323; GFX9-NEXT: s_mov_b32 s13, s7 1324; GFX9-NEXT: v_mov_b32_e32 v1, s2 1325; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1326; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1327; GFX9-NEXT: s_waitcnt vmcnt(0) 1328; GFX9-NEXT: buffer_wbinvl1_vol 1329; GFX9-NEXT: .LBB7_2: 1330; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1331; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1332; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 1333; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1334; GFX9-NEXT: s_mov_b32 s7, 0xf000 1335; GFX9-NEXT: s_mov_b32 s6, -1 1336; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1337; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1338; GFX9-NEXT: s_endpgm 1339; 1340; GFX1064-LABEL: sub_i32_uniform: 1341; GFX1064: ; %bb.0: ; %entry 1342; GFX1064-NEXT: s_clause 0x1 1343; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1344; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 1345; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1346; GFX1064-NEXT: ; implicit-def: $vgpr1 1347; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1348; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1349; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1350; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1351; GFX1064-NEXT: s_cbranch_execz .LBB7_2 1352; GFX1064-NEXT: ; %bb.1: 1353; GFX1064-NEXT: s_bcnt1_i32_b64 s2, 
s[2:3] 1354; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 1355; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1356; GFX1064-NEXT: s_mul_i32 s2, s8, s2 1357; GFX1064-NEXT: s_mov_b32 s14, -1 1358; GFX1064-NEXT: v_mov_b32_e32 v1, s2 1359; GFX1064-NEXT: s_mov_b32 s12, s6 1360; GFX1064-NEXT: s_mov_b32 s13, s7 1361; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1362; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1363; GFX1064-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1364; GFX1064-NEXT: s_waitcnt vmcnt(0) 1365; GFX1064-NEXT: buffer_gl0_inv 1366; GFX1064-NEXT: buffer_gl1_inv 1367; GFX1064-NEXT: .LBB7_2: 1368; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1369; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1370; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1371; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 1372; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1373; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1374; GFX1064-NEXT: s_mov_b32 s6, -1 1375; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1376; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1377; GFX1064-NEXT: s_endpgm 1378; 1379; GFX1032-LABEL: sub_i32_uniform: 1380; GFX1032: ; %bb.0: ; %entry 1381; GFX1032-NEXT: s_clause 0x1 1382; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1383; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 1384; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1385; GFX1032-NEXT: ; implicit-def: $vgpr1 1386; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1387; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1388; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1389; GFX1032-NEXT: s_cbranch_execz .LBB7_2 1390; GFX1032-NEXT: ; %bb.1: 1391; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1392; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 1393; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1394; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1395; GFX1032-NEXT: s_mov_b32 s10, -1 1396; GFX1032-NEXT: v_mov_b32_e32 v1, s1 1397; GFX1032-NEXT: s_mov_b32 s8, s6 1398; GFX1032-NEXT: s_mov_b32 s9, s7 1399; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1400; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1401; GFX1032-NEXT: 
buffer_atomic_sub v1, off, s[8:11], 0 glc 1402; GFX1032-NEXT: s_waitcnt vmcnt(0) 1403; GFX1032-NEXT: buffer_gl0_inv 1404; GFX1032-NEXT: buffer_gl1_inv 1405; GFX1032-NEXT: .LBB7_2: 1406; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1407; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1408; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1409; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1410; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1411; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1412; GFX1032-NEXT: s_mov_b32 s6, -1 1413; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1414; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1415; GFX1032-NEXT: s_endpgm 1416entry: 1417 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel 1418 store i32 %old, i32 addrspace(1)* %out 1419 ret void 1420} 1421 1422define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 1423; GFX7LESS-LABEL: sub_i32_varying: 1424; GFX7LESS: ; %bb.0: ; %entry 1425; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1426; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1427; GFX7LESS-NEXT: s_mov_b32 s6, -1 1428; GFX7LESS-NEXT: s_mov_b32 s10, s6 1429; GFX7LESS-NEXT: s_mov_b32 s11, s7 1430; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1431; GFX7LESS-NEXT: s_mov_b32 s8, s2 1432; GFX7LESS-NEXT: s_mov_b32 s9, s3 1433; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1434; GFX7LESS-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1435; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1436; GFX7LESS-NEXT: buffer_wbinvl1 1437; GFX7LESS-NEXT: s_mov_b32 s4, s0 1438; GFX7LESS-NEXT: s_mov_b32 s5, s1 1439; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1440; GFX7LESS-NEXT: s_endpgm 1441; 1442; GFX8-LABEL: sub_i32_varying: 1443; GFX8: ; %bb.0: ; %entry 1444; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1445; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1446; GFX8-NEXT: v_mov_b32_e32 v1, 0 1447; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1448; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1449; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1450; 
GFX8-NEXT: v_mov_b32_e32 v2, v0 1451; GFX8-NEXT: s_not_b64 exec, exec 1452; GFX8-NEXT: v_mov_b32_e32 v2, 0 1453; GFX8-NEXT: s_not_b64 exec, exec 1454; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1455; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1456; GFX8-NEXT: s_nop 1 1457; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1458; GFX8-NEXT: s_nop 1 1459; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1460; GFX8-NEXT: s_nop 1 1461; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1462; GFX8-NEXT: s_nop 1 1463; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1464; GFX8-NEXT: s_nop 1 1465; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1466; GFX8-NEXT: v_readlane_b32 s6, v2, 63 1467; GFX8-NEXT: s_nop 0 1468; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1469; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1470; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1471; GFX8-NEXT: ; implicit-def: $vgpr0 1472; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1473; GFX8-NEXT: s_cbranch_execz .LBB8_2 1474; GFX8-NEXT: ; %bb.1: 1475; GFX8-NEXT: s_mov_b32 s11, 0xf000 1476; GFX8-NEXT: s_mov_b32 s10, -1 1477; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1478; GFX8-NEXT: s_mov_b32 s8, s2 1479; GFX8-NEXT: s_mov_b32 s9, s3 1480; GFX8-NEXT: v_mov_b32_e32 v0, s6 1481; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1482; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1483; GFX8-NEXT: s_waitcnt vmcnt(0) 1484; GFX8-NEXT: buffer_wbinvl1_vol 1485; GFX8-NEXT: .LBB8_2: 1486; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1487; GFX8-NEXT: v_readfirstlane_b32 s4, v0 1488; GFX8-NEXT: v_mov_b32_e32 v0, v1 1489; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1490; GFX8-NEXT: s_mov_b32 s3, 0xf000 1491; GFX8-NEXT: s_mov_b32 s2, -1 1492; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 1493; GFX8-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 1494; GFX8-NEXT: s_endpgm 1495; 1496; GFX9-LABEL: sub_i32_varying: 1497; GFX9: ; %bb.0: ; %entry 1498; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1499; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1500; GFX9-NEXT: v_mov_b32_e32 v1, 0 1501; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1502; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1503; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1504; GFX9-NEXT: v_mov_b32_e32 v2, v0 1505; GFX9-NEXT: s_not_b64 exec, exec 1506; GFX9-NEXT: v_mov_b32_e32 v2, 0 1507; GFX9-NEXT: s_not_b64 exec, exec 1508; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1509; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1510; GFX9-NEXT: s_nop 1 1511; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1512; GFX9-NEXT: s_nop 1 1513; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1514; GFX9-NEXT: s_nop 1 1515; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1516; GFX9-NEXT: s_nop 1 1517; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1518; GFX9-NEXT: s_nop 1 1519; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1520; GFX9-NEXT: v_readlane_b32 s6, v2, 63 1521; GFX9-NEXT: s_nop 0 1522; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1523; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1524; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1525; GFX9-NEXT: ; implicit-def: $vgpr0 1526; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1527; GFX9-NEXT: s_cbranch_execz .LBB8_2 1528; GFX9-NEXT: ; %bb.1: 1529; GFX9-NEXT: s_mov_b32 s11, 0xf000 1530; GFX9-NEXT: s_mov_b32 s10, -1 1531; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1532; GFX9-NEXT: s_mov_b32 s8, s2 1533; GFX9-NEXT: s_mov_b32 s9, s3 1534; GFX9-NEXT: v_mov_b32_e32 v0, s6 1535; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1536; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1537; GFX9-NEXT: 
s_waitcnt vmcnt(0) 1538; GFX9-NEXT: buffer_wbinvl1_vol 1539; GFX9-NEXT: .LBB8_2: 1540; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1541; GFX9-NEXT: v_readfirstlane_b32 s4, v0 1542; GFX9-NEXT: v_mov_b32_e32 v0, v1 1543; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1544; GFX9-NEXT: s_mov_b32 s3, 0xf000 1545; GFX9-NEXT: s_mov_b32 s2, -1 1546; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1547; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1548; GFX9-NEXT: s_endpgm 1549; 1550; GFX1064-LABEL: sub_i32_varying: 1551; GFX1064: ; %bb.0: ; %entry 1552; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1553; GFX1064-NEXT: s_not_b64 exec, exec 1554; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1555; GFX1064-NEXT: s_not_b64 exec, exec 1556; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1557; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1558; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1559; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1560; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1561; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1562; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1563; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1564; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1565; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 1566; GFX1064-NEXT: v_mov_b32_e32 v2, s4 1567; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1568; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 1569; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1570; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1571; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1572; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1573; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 1574; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 1575; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1576; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 
1577; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1578; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 1579; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 1580; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 1581; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1582; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1583; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 1584; GFX1064-NEXT: s_mov_b32 s4, s9 1585; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 1586; GFX1064-NEXT: s_mov_b64 exec, s[6:7] 1587; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1588; GFX1064-NEXT: s_mov_b32 s6, -1 1589; GFX1064-NEXT: ; implicit-def: $vgpr0 1590; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc 1591; GFX1064-NEXT: s_cbranch_execz .LBB8_2 1592; GFX1064-NEXT: ; %bb.1: 1593; GFX1064-NEXT: v_mov_b32_e32 v0, s4 1594; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1595; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1596; GFX1064-NEXT: s_mov_b32 s4, s2 1597; GFX1064-NEXT: s_mov_b32 s5, s3 1598; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1599; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1600; GFX1064-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 1601; GFX1064-NEXT: s_waitcnt vmcnt(0) 1602; GFX1064-NEXT: buffer_gl0_inv 1603; GFX1064-NEXT: buffer_gl1_inv 1604; GFX1064-NEXT: .LBB8_2: 1605; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1606; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] 1607; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1608; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1609; GFX1064-NEXT: v_mov_b32_e32 v0, v3 1610; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1611; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1612; GFX1064-NEXT: s_mov_b32 s2, s6 1613; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1614; GFX1064-NEXT: s_endpgm 1615; 1616; GFX1032-LABEL: sub_i32_varying: 1617; GFX1032: ; %bb.0: ; %entry 1618; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1619; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1620; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1621; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1622; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1623; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1624; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1625; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1626; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1627; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1628; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1629; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1630; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1631; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 1632; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1633; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1634; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 1635; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 1636; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1637; GFX1032-NEXT: s_mov_b32 exec_lo, s4 1638; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1639; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 1640; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 1641; GFX1032-NEXT: s_mov_b32 exec_lo, s4 1642; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1643; GFX1032-NEXT: s_mov_b32 s4, s6 1644; GFX1032-NEXT: s_mov_b32 s6, -1 1645; GFX1032-NEXT: ; implicit-def: $vgpr0 1646; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo 1647; GFX1032-NEXT: s_cbranch_execz .LBB8_2 1648; GFX1032-NEXT: ; %bb.1: 1649; GFX1032-NEXT: v_mov_b32_e32 v0, s4 1650; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1651; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1652; GFX1032-NEXT: s_mov_b32 s4, s2 1653; GFX1032-NEXT: s_mov_b32 s5, s3 1654; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1655; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1656; GFX1032-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 1657; GFX1032-NEXT: s_waitcnt vmcnt(0) 1658; GFX1032-NEXT: buffer_gl0_inv 1659; GFX1032-NEXT: buffer_gl1_inv 1660; GFX1032-NEXT: .LBB8_2: 1661; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1662; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1032-NEXT: s_mov_b32 s2, s6
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; 64-bit atomicrmw sub of the constant 5 (acq_rel) on a global pointer; the
; returned old value is stored to %out.
; In the expected code below, only the lane whose mbcnt result is 0 enters the
; atomic block and issues a single buffer_atomic_sub_x2 of 5 * popcount(exec).
; After exec is restored, each lane computes its own result as the
; v_readfirstlane'd old value minus 5 * its mbcnt.
define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
; GFX7LESS-LABEL: sub_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec
; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5
; GFX7LESS-NEXT: s_mov_b32 s10, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s8, s2
; GFX7LESS-NEXT: s_mov_b32 s9, s3
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: .LBB9_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_cbranch_execz .LBB9_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s8, s2
; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
; GFX8-NEXT: s_mul_i32 s2, s2, 5
; GFX8-NEXT: s_mov_b32 s11, 0xf000
; GFX8-NEXT: s_mov_b32 s10, -1
; GFX8-NEXT: s_mov_b32 s9, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB9_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[6:7], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB9_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s2
; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
; GFX9-NEXT: s_mul_i32 s2, s2, 5
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s9, s3
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB9_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_mul_i32 s6, s6, 5
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
; GFX1064-NEXT: v_mov_b32_e32 v0, s6
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mov_b32 s8, s2
; GFX1064-NEXT: s_mov_b32 s9, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: .LBB9_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s5, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_mul_i32 s5, s5, 5
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
; GFX1032-NEXT: v_mov_b32_e32 v0, s5
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mov_b32 s8, s2
; GFX1032-NEXT: s_mov_b32 s9, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: .LBB9_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; 64-bit atomicrmw sub (acq_rel) whose operand is the kernel argument
; %subitive; the returned old value is stored to %out.
; In the expected code below, the lane whose mbcnt result is 0 issues one
; buffer_atomic_sub_x2 of %subitive * popcount(exec), built from 32-bit
; multiplies. After exec is restored, each lane subtracts %subitive * its mbcnt
; from the v_readfirstlane'd old value.
define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
; GFX7LESS-LABEL: sub_i64_uniform:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec
; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB10_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s14, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6
; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: .LBB10_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s6, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2
; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2
; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2
; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0
; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v2
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i64_uniform:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b64 s[8:9], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB10_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s12, s6
; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
; GFX8-NEXT: s_mul_i32 s6, s1, s6
; GFX8-NEXT: s_mov_b32 s15, 0xf000
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s13, s7
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB10_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_uniform:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: s_mov_b64 s[8:9], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB10_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mul_i32 s7, s3, s6
; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6
; GFX9-NEXT: s_add_i32 s8, s8, s7
; GFX9-NEXT: s_mul_i32 s6, s2, s6
; GFX9-NEXT: s_mov_b32 s15, 0xf000
; GFX9-NEXT: s_mov_b32 s14, -1
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB10_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_add_u32_e32 v1, v3, v4
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i64_uniform:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_clause 0x1
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1064-NEXT: s_mov_b64 s[8:9], exec
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB10_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mul_i32 s9, s3, s8
; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8
; GFX1064-NEXT: s_mul_i32 s8, s2, s8
; GFX1064-NEXT: s_add_i32 s10, s10, s9
; GFX1064-NEXT: v_mov_b32_e32 v0, s8
; GFX1064-NEXT: v_mov_b32_e32 v1, s10
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_mov_b32 s8, s6
; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: .LBB10_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v2
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i64_uniform:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_clause 0x1
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1032-NEXT: s_mov_b32 s8, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mul_i32 s8, s3, s1
; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1
; GFX1032-NEXT: s_mul_i32 s1, s2, s1
; GFX1032-NEXT: s_add_i32 s9, s9, s8
; GFX1032-NEXT: v_mov_b32_e32 v0, s1
; GFX1032-NEXT: v_mov_b32_e32 v1, s9
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s8, s6
; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: .LBB10_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s0, s2, v2, 0
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v2
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; 64-bit atomicrmw sub (acq_rel) whose operand is the zero-extended workitem id,
; so the value differs per lane; the returned old value is stored to %out.
; For every target checked below, the expected code issues buffer_atomic_sub_x2
; directly from v[0:1], with no mbcnt / saveexec lane election and no
; cross-lane reduction around it.
; NOTE(review): presumably the atomic optimizer leaves 64-bit divergent operands
; untouched in this configuration - confirm against the pass before relying on
; it.
define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
; GFX7LESS-LABEL: sub_i64_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s6, -1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 s10, s6
; GFX7LESS-NEXT: s_mov_b32 s11, s7
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s8, s2
; GFX7LESS-NEXT: s_mov_b32 s9, s3
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: s_mov_b32 s4, s0
; GFX7LESS-NEXT: s_mov_b32 s5, s1
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX89-LABEL: sub_i64_varying:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: s_mov_b32 s10, s6
; GFX89-NEXT: s_mov_b32 s11, s7
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s8, s2
; GFX89-NEXT: s_mov_b32 s9, s3
; GFX89-NEXT: v_mov_b32_e32 v1, 0
; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
; GFX89-NEXT: s_mov_b32 s4, s0
; GFX89-NEXT: s_mov_b32 s5, s1
; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i64_varying:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_mov_b32 s11, s7
; GFX10-NEXT: s_mov_b32 s10, s6
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s8, s2
; GFX10-NEXT: s_mov_b32 s9, s3
; GFX10-NEXT: s_mov_b32 s4, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_mov_b32 s5, s1
; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}