; Codegen test for the AMDGPU atomic optimizer: every llc invocation below
; passes -amdgpu-atomic-optimizations=true, and the kernels in this file apply
; `atomicrmw` to global (addrspace(1)) pointers.
;
; The five llc invocations cover, in order:
;   * default amdgcn target (no -mcpu)              -> prefix GFX7LESS
;   * -mcpu=tonga                                   -> prefixes GFX89 and GFX8
;   * -mcpu=gfx900                                  -> prefixes GFX89 and GFX9
;   * -mcpu=gfx1010, wave64 (+wavefrontsize64)      -> prefixes GFX10 and GFX1064
;   * -mcpu=gfx1010, wave32 (+wavefrontsize32)      -> prefixes GFX10 and GFX1032
; All but the first also pass -mattr=-flat-for-global.
;
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s 3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9 10; Show what the atomic optimization pass will do for global pointers. 
11 12define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 13; GFX7LESS-LABEL: add_i32_constant: 14; GFX7LESS: ; %bb.0: ; %entry 15; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 16; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 17; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 18; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 19; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 20; GFX7LESS-NEXT: ; implicit-def: $vgpr1 21; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 22; GFX7LESS-NEXT: s_cbranch_execz BB0_2 23; GFX7LESS-NEXT: ; %bb.1: 24; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 25; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 26; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 27; GFX7LESS-NEXT: s_mov_b32 s10, -1 28; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 29; GFX7LESS-NEXT: s_mov_b32 s8, s2 30; GFX7LESS-NEXT: s_mov_b32 s9, s3 31; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 32; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 33; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 34; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 35; GFX7LESS-NEXT: buffer_wbinvl1 36; GFX7LESS-NEXT: BB0_2: 37; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 38; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 39; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 40; GFX7LESS-NEXT: s_mov_b32 s2, -1 41; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 42; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 43; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 44; GFX7LESS-NEXT: s_endpgm 45; 46; GFX89-LABEL: add_i32_constant: 47; GFX89: ; %bb.0: ; %entry 48; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 49; GFX89-NEXT: s_mov_b64 s[6:7], exec 50; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 51; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 52; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 53; GFX89-NEXT: ; implicit-def: $vgpr1 54; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc 55; GFX89-NEXT: s_cbranch_execz BB0_2 56; GFX89-NEXT: ; %bb.1: 57; GFX89-NEXT: s_waitcnt lgkmcnt(0) 58; GFX89-NEXT: s_mov_b32 s8, s2 59; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 
60; GFX89-NEXT: s_mul_i32 s2, s2, 5 61; GFX89-NEXT: s_mov_b32 s11, 0xf000 62; GFX89-NEXT: s_mov_b32 s10, -1 63; GFX89-NEXT: s_mov_b32 s9, s3 64; GFX89-NEXT: v_mov_b32_e32 v1, s2 65; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 66; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 67; GFX89-NEXT: s_waitcnt vmcnt(0) 68; GFX89-NEXT: buffer_wbinvl1_vol 69; GFX89-NEXT: BB0_2: 70; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] 71; GFX89-NEXT: v_readfirstlane_b32 s4, v1 72; GFX89-NEXT: s_waitcnt lgkmcnt(0) 73; GFX89-NEXT: s_mov_b32 s3, 0xf000 74; GFX89-NEXT: s_mov_b32 s2, -1 75; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 76; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 77; GFX89-NEXT: s_endpgm 78; 79; GFX1064-LABEL: add_i32_constant: 80; GFX1064: ; %bb.0: ; %entry 81; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 82; GFX1064-NEXT: s_mov_b64 s[6:7], exec 83; GFX1064-NEXT: ; implicit-def: $vgpr1 84; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 85; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 86; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 87; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 88; GFX1064-NEXT: s_cbranch_execz BB0_2 89; GFX1064-NEXT: ; %bb.1: 90; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 91; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 92; GFX1064-NEXT: s_mul_i32 s6, s6, 5 93; GFX1064-NEXT: s_mov_b32 s10, -1 94; GFX1064-NEXT: v_mov_b32_e32 v1, s6 95; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 96; GFX1064-NEXT: s_mov_b32 s8, s2 97; GFX1064-NEXT: s_mov_b32 s9, s3 98; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 99; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 100; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 101; GFX1064-NEXT: s_waitcnt vmcnt(0) 102; GFX1064-NEXT: buffer_gl0_inv 103; GFX1064-NEXT: buffer_gl1_inv 104; GFX1064-NEXT: BB0_2: 105; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 106; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 107; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 108; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 109; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 110; GFX1064-NEXT: 
v_mad_u32_u24 v0, v0, 5, s2 111; GFX1064-NEXT: s_mov_b32 s2, -1 112; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 113; GFX1064-NEXT: s_endpgm 114; 115; GFX1032-LABEL: add_i32_constant: 116; GFX1032: ; %bb.0: ; %entry 117; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 118; GFX1032-NEXT: s_mov_b32 s5, exec_lo 119; GFX1032-NEXT: ; implicit-def: $vgpr1 120; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 121; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 122; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 123; GFX1032-NEXT: s_cbranch_execz BB0_2 124; GFX1032-NEXT: ; %bb.1: 125; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 126; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 127; GFX1032-NEXT: s_mul_i32 s5, s5, 5 128; GFX1032-NEXT: s_mov_b32 s10, -1 129; GFX1032-NEXT: v_mov_b32_e32 v1, s5 130; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 131; GFX1032-NEXT: s_mov_b32 s8, s2 132; GFX1032-NEXT: s_mov_b32 s9, s3 133; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 134; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 135; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 136; GFX1032-NEXT: s_waitcnt vmcnt(0) 137; GFX1032-NEXT: buffer_gl0_inv 138; GFX1032-NEXT: buffer_gl1_inv 139; GFX1032-NEXT: BB0_2: 140; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 141; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 142; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 143; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 144; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 145; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 146; GFX1032-NEXT: s_mov_b32 s2, -1 147; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 148; GFX1032-NEXT: s_endpgm 149entry: 150 %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel 151 store i32 %old, i32 addrspace(1)* %out 152 ret void 153} 154 155define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) { 156; GFX7LESS-LABEL: add_i32_uniform: 157; GFX7LESS: ; %bb.0: ; %entry 158; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 159; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 160; 
GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd 161; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 162; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 163; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 164; GFX7LESS-NEXT: ; implicit-def: $vgpr1 165; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 166; GFX7LESS-NEXT: s_cbranch_execz BB1_2 167; GFX7LESS-NEXT: ; %bb.1: 168; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 169; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 170; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 171; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 172; GFX7LESS-NEXT: s_mov_b32 s14, -1 173; GFX7LESS-NEXT: s_mov_b32 s12, s6 174; GFX7LESS-NEXT: s_mov_b32 s13, s7 175; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 176; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 177; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 178; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 179; GFX7LESS-NEXT: buffer_wbinvl1 180; GFX7LESS-NEXT: BB1_2: 181; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 182; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 183; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 184; GFX7LESS-NEXT: s_mov_b32 s6, -1 185; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 186; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 187; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 188; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 189; GFX7LESS-NEXT: s_endpgm 190; 191; GFX8-LABEL: add_i32_uniform: 192; GFX8: ; %bb.0: ; %entry 193; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 194; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 195; GFX8-NEXT: s_mov_b64 s[2:3], exec 196; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 197; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 198; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 199; GFX8-NEXT: ; implicit-def: $vgpr1 200; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 201; GFX8-NEXT: s_cbranch_execz BB1_2 202; GFX8-NEXT: ; %bb.1: 203; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 204; GFX8-NEXT: s_waitcnt lgkmcnt(0) 205; GFX8-NEXT: s_mul_i32 s2, s8, s2 206; GFX8-NEXT: s_mov_b32 s15, 0xf000 207; GFX8-NEXT: s_mov_b32 s14, -1 208; GFX8-NEXT: 
s_mov_b32 s12, s6 209; GFX8-NEXT: s_mov_b32 s13, s7 210; GFX8-NEXT: v_mov_b32_e32 v1, s2 211; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 212; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 213; GFX8-NEXT: s_waitcnt vmcnt(0) 214; GFX8-NEXT: buffer_wbinvl1_vol 215; GFX8-NEXT: BB1_2: 216; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 217; GFX8-NEXT: s_waitcnt lgkmcnt(0) 218; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 219; GFX8-NEXT: v_readfirstlane_b32 s0, v1 220; GFX8-NEXT: s_mov_b32 s7, 0xf000 221; GFX8-NEXT: s_mov_b32 s6, -1 222; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 223; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 224; GFX8-NEXT: s_endpgm 225; 226; GFX9-LABEL: add_i32_uniform: 227; GFX9: ; %bb.0: ; %entry 228; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 229; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 230; GFX9-NEXT: s_mov_b64 s[2:3], exec 231; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 232; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 233; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 234; GFX9-NEXT: ; implicit-def: $vgpr1 235; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 236; GFX9-NEXT: s_cbranch_execz BB1_2 237; GFX9-NEXT: ; %bb.1: 238; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 239; GFX9-NEXT: s_waitcnt lgkmcnt(0) 240; GFX9-NEXT: s_mul_i32 s2, s8, s2 241; GFX9-NEXT: s_mov_b32 s15, 0xf000 242; GFX9-NEXT: s_mov_b32 s14, -1 243; GFX9-NEXT: s_mov_b32 s12, s6 244; GFX9-NEXT: s_mov_b32 s13, s7 245; GFX9-NEXT: v_mov_b32_e32 v1, s2 246; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 247; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 248; GFX9-NEXT: s_waitcnt vmcnt(0) 249; GFX9-NEXT: buffer_wbinvl1_vol 250; GFX9-NEXT: BB1_2: 251; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 252; GFX9-NEXT: s_waitcnt lgkmcnt(0) 253; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 254; GFX9-NEXT: v_readfirstlane_b32 s0, v1 255; GFX9-NEXT: s_mov_b32 s7, 0xf000 256; GFX9-NEXT: s_mov_b32 s6, -1 257; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 258; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 259; GFX9-NEXT: s_endpgm 260; 261; 
GFX1064-LABEL: add_i32_uniform: 262; GFX1064: ; %bb.0: ; %entry 263; GFX1064-NEXT: s_clause 0x1 264; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 265; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 266; GFX1064-NEXT: s_mov_b64 s[2:3], exec 267; GFX1064-NEXT: ; implicit-def: $vgpr1 268; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 269; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 270; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 271; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 272; GFX1064-NEXT: s_cbranch_execz BB1_2 273; GFX1064-NEXT: ; %bb.1: 274; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 275; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 276; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 277; GFX1064-NEXT: s_mul_i32 s2, s8, s2 278; GFX1064-NEXT: s_mov_b32 s14, -1 279; GFX1064-NEXT: v_mov_b32_e32 v1, s2 280; GFX1064-NEXT: s_mov_b32 s12, s6 281; GFX1064-NEXT: s_mov_b32 s13, s7 282; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 283; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 284; GFX1064-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 285; GFX1064-NEXT: s_waitcnt vmcnt(0) 286; GFX1064-NEXT: buffer_gl0_inv 287; GFX1064-NEXT: buffer_gl1_inv 288; GFX1064-NEXT: BB1_2: 289; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 290; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 291; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 292; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 293; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 294; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 295; GFX1064-NEXT: s_mov_b32 s6, -1 296; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 297; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 298; GFX1064-NEXT: s_endpgm 299; 300; GFX1032-LABEL: add_i32_uniform: 301; GFX1032: ; %bb.0: ; %entry 302; GFX1032-NEXT: s_clause 0x1 303; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 304; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 305; GFX1032-NEXT: s_mov_b32 s3, exec_lo 306; GFX1032-NEXT: ; implicit-def: $vgpr1 307; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 308; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 309; GFX1032-NEXT: 
s_and_saveexec_b32 s0, vcc_lo 310; GFX1032-NEXT: s_cbranch_execz BB1_2 311; GFX1032-NEXT: ; %bb.1: 312; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 313; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 314; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 315; GFX1032-NEXT: s_mul_i32 s1, s2, s1 316; GFX1032-NEXT: s_mov_b32 s10, -1 317; GFX1032-NEXT: v_mov_b32_e32 v1, s1 318; GFX1032-NEXT: s_mov_b32 s8, s6 319; GFX1032-NEXT: s_mov_b32 s9, s7 320; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 321; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 322; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 323; GFX1032-NEXT: s_waitcnt vmcnt(0) 324; GFX1032-NEXT: buffer_gl0_inv 325; GFX1032-NEXT: buffer_gl1_inv 326; GFX1032-NEXT: BB1_2: 327; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 328; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 329; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 330; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 331; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 332; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 333; GFX1032-NEXT: s_mov_b32 s6, -1 334; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 335; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 336; GFX1032-NEXT: s_endpgm 337entry: 338 %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel 339 store i32 %old, i32 addrspace(1)* %out 340 ret void 341} 342 343define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 344; GFX7LESS-LABEL: add_i32_varying: 345; GFX7LESS: ; %bb.0: ; %entry 346; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 347; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 348; GFX7LESS-NEXT: s_mov_b32 s6, -1 349; GFX7LESS-NEXT: s_mov_b32 s10, s6 350; GFX7LESS-NEXT: s_mov_b32 s11, s7 351; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 352; GFX7LESS-NEXT: s_mov_b32 s8, s2 353; GFX7LESS-NEXT: s_mov_b32 s9, s3 354; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 355; GFX7LESS-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 356; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 357; GFX7LESS-NEXT: buffer_wbinvl1 358; GFX7LESS-NEXT: s_mov_b32 s4, s0 
359; GFX7LESS-NEXT: s_mov_b32 s5, s1 360; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 361; GFX7LESS-NEXT: s_endpgm 362; 363; GFX8-LABEL: add_i32_varying: 364; GFX8: ; %bb.0: ; %entry 365; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 366; GFX8-NEXT: v_mov_b32_e32 v2, v0 367; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 368; GFX8-NEXT: v_mov_b32_e32 v1, 0 369; GFX8-NEXT: s_mov_b64 exec, s[4:5] 370; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 371; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 372; GFX8-NEXT: s_not_b64 exec, exec 373; GFX8-NEXT: v_mov_b32_e32 v2, 0 374; GFX8-NEXT: s_not_b64 exec, exec 375; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 376; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 377; GFX8-NEXT: s_nop 1 378; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 379; GFX8-NEXT: s_nop 1 380; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 381; GFX8-NEXT: s_nop 1 382; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 383; GFX8-NEXT: s_nop 1 384; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 385; GFX8-NEXT: s_nop 1 386; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 387; GFX8-NEXT: v_readlane_b32 s6, v2, 63 388; GFX8-NEXT: s_nop 0 389; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 390; GFX8-NEXT: s_mov_b64 exec, s[4:5] 391; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 392; GFX8-NEXT: ; implicit-def: $vgpr0 393; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 394; GFX8-NEXT: s_cbranch_execz BB2_2 395; GFX8-NEXT: ; %bb.1: 396; GFX8-NEXT: s_mov_b32 s11, 0xf000 397; GFX8-NEXT: s_mov_b32 s10, -1 398; GFX8-NEXT: s_waitcnt lgkmcnt(0) 399; GFX8-NEXT: s_mov_b32 s8, s2 400; GFX8-NEXT: s_mov_b32 s9, s3 401; GFX8-NEXT: v_mov_b32_e32 v0, s6 402; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 403; GFX8-NEXT: buffer_atomic_add 
v0, off, s[8:11], 0 glc 404; GFX8-NEXT: s_waitcnt vmcnt(0) 405; GFX8-NEXT: buffer_wbinvl1_vol 406; GFX8-NEXT: BB2_2: 407; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 408; GFX8-NEXT: v_readfirstlane_b32 s4, v0 409; GFX8-NEXT: v_mov_b32_e32 v0, v1 410; GFX8-NEXT: s_waitcnt lgkmcnt(0) 411; GFX8-NEXT: s_mov_b32 s3, 0xf000 412; GFX8-NEXT: s_mov_b32 s2, -1 413; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 414; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 415; GFX8-NEXT: s_endpgm 416; 417; GFX9-LABEL: add_i32_varying: 418; GFX9: ; %bb.0: ; %entry 419; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 420; GFX9-NEXT: v_mov_b32_e32 v2, v0 421; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 422; GFX9-NEXT: v_mov_b32_e32 v1, 0 423; GFX9-NEXT: s_mov_b64 exec, s[4:5] 424; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 425; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 426; GFX9-NEXT: s_not_b64 exec, exec 427; GFX9-NEXT: v_mov_b32_e32 v2, 0 428; GFX9-NEXT: s_not_b64 exec, exec 429; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 430; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 431; GFX9-NEXT: s_nop 1 432; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 433; GFX9-NEXT: s_nop 1 434; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 435; GFX9-NEXT: s_nop 1 436; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 437; GFX9-NEXT: s_nop 1 438; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 439; GFX9-NEXT: s_nop 1 440; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 441; GFX9-NEXT: v_readlane_b32 s6, v2, 63 442; GFX9-NEXT: s_nop 0 443; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 444; GFX9-NEXT: s_mov_b64 exec, s[4:5] 445; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 446; GFX9-NEXT: ; implicit-def: $vgpr0 447; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 448; GFX9-NEXT: s_cbranch_execz BB2_2 
449; GFX9-NEXT: ; %bb.1: 450; GFX9-NEXT: s_mov_b32 s11, 0xf000 451; GFX9-NEXT: s_mov_b32 s10, -1 452; GFX9-NEXT: s_waitcnt lgkmcnt(0) 453; GFX9-NEXT: s_mov_b32 s8, s2 454; GFX9-NEXT: s_mov_b32 s9, s3 455; GFX9-NEXT: v_mov_b32_e32 v0, s6 456; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 457; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 458; GFX9-NEXT: s_waitcnt vmcnt(0) 459; GFX9-NEXT: buffer_wbinvl1_vol 460; GFX9-NEXT: BB2_2: 461; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 462; GFX9-NEXT: v_readfirstlane_b32 s4, v0 463; GFX9-NEXT: v_mov_b32_e32 v0, v1 464; GFX9-NEXT: s_waitcnt lgkmcnt(0) 465; GFX9-NEXT: s_mov_b32 s3, 0xf000 466; GFX9-NEXT: s_mov_b32 s2, -1 467; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 468; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 469; GFX9-NEXT: s_endpgm 470; 471; GFX1064-LABEL: add_i32_varying: 472; GFX1064: ; %bb.0: ; %entry 473; GFX1064-NEXT: v_mov_b32_e32 v1, v0 474; GFX1064-NEXT: s_not_b64 exec, exec 475; GFX1064-NEXT: v_mov_b32_e32 v1, 0 476; GFX1064-NEXT: s_not_b64 exec, exec 477; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 478; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 479; GFX1064-NEXT: v_mov_b32_e32 v3, 0 480; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 481; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 482; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 483; GFX1064-NEXT: v_mov_b32_e32 v2, v1 484; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 485; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 486; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 487; GFX1064-NEXT: v_mov_b32_e32 v2, s4 488; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 489; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 490; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 491; GFX1064-NEXT: 
s_mov_b64 exec, s[2:3] 492; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 493; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 494; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 495; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 496; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 497; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 498; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 499; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 500; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 501; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 502; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 503; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 504; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 505; GFX1064-NEXT: s_mov_b32 s4, s9 506; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 507; GFX1064-NEXT: s_mov_b64 exec, s[6:7] 508; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 509; GFX1064-NEXT: s_mov_b32 s6, -1 510; GFX1064-NEXT: ; implicit-def: $vgpr0 511; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc 512; GFX1064-NEXT: s_cbranch_execz BB2_2 513; GFX1064-NEXT: ; %bb.1: 514; GFX1064-NEXT: v_mov_b32_e32 v0, s4 515; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 516; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 517; GFX1064-NEXT: s_mov_b32 s4, s2 518; GFX1064-NEXT: s_mov_b32 s5, s3 519; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 520; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 521; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 522; GFX1064-NEXT: s_waitcnt vmcnt(0) 523; GFX1064-NEXT: buffer_gl0_inv 524; GFX1064-NEXT: buffer_gl1_inv 525; GFX1064-NEXT: BB2_2: 526; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 527; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] 528; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 529; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 530; GFX1064-NEXT: v_mov_b32_e32 v0, v3 531; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 532; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v0 533; GFX1064-NEXT: s_mov_b32 s2, s6 534; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 535; GFX1064-NEXT: s_endpgm 536; 537; GFX1032-LABEL: add_i32_varying: 538; GFX1032: ; %bb.0: ; %entry 539; 
GFX1032-NEXT: v_mov_b32_e32 v1, v0 540; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 541; GFX1032-NEXT: v_mov_b32_e32 v1, 0 542; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 543; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 544; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 545; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 546; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 547; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 548; GFX1032-NEXT: v_mov_b32_e32 v2, v1 549; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 550; GFX1032-NEXT: s_mov_b32 exec_lo, s2 551; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 552; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 553; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 554; GFX1032-NEXT: v_mov_b32_e32 v3, 0 555; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 556; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 557; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 558; GFX1032-NEXT: s_mov_b32 exec_lo, s4 559; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 560; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 561; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 562; GFX1032-NEXT: s_mov_b32 exec_lo, s4 563; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 564; GFX1032-NEXT: s_mov_b32 s4, s6 565; GFX1032-NEXT: s_mov_b32 s6, -1 566; GFX1032-NEXT: ; implicit-def: $vgpr0 567; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo 568; GFX1032-NEXT: s_cbranch_execz BB2_2 569; GFX1032-NEXT: ; %bb.1: 570; GFX1032-NEXT: v_mov_b32_e32 v0, s4 571; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 572; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 573; GFX1032-NEXT: s_mov_b32 s4, s2 574; GFX1032-NEXT: s_mov_b32 s5, s3 575; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 576; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 577; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 578; 
GFX1032-NEXT: s_waitcnt vmcnt(0) 579; GFX1032-NEXT: buffer_gl0_inv 580; GFX1032-NEXT: buffer_gl1_inv 581; GFX1032-NEXT: BB2_2: 582; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 583; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 584; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 585; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 586; GFX1032-NEXT: v_mov_b32_e32 v0, v3 587; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 588; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v0 589; GFX1032-NEXT: s_mov_b32 s2, s6 590; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 591; GFX1032-NEXT: s_endpgm 592entry: 593 %lane = call i32 @llvm.amdgcn.workitem.id.x() 594 %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel 595 store i32 %old, i32 addrspace(1)* %out 596 ret void 597} 598 599define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 600; GFX7LESS-LABEL: add_i64_constant: 601; GFX7LESS: ; %bb.0: ; %entry 602; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 603; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 604; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 605; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 606; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 607; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 608; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 609; GFX7LESS-NEXT: s_cbranch_execz BB3_2 610; GFX7LESS-NEXT: ; %bb.1: 611; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 612; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 613; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 614; GFX7LESS-NEXT: s_mov_b32 s10, -1 615; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 616; GFX7LESS-NEXT: s_mov_b32 s8, s2 617; GFX7LESS-NEXT: s_mov_b32 s9, s3 618; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 619; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 620; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 621; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 622; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 623; GFX7LESS-NEXT: buffer_wbinvl1 624; GFX7LESS-NEXT: BB3_2: 625; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 626; GFX7LESS-NEXT: 
s_waitcnt lgkmcnt(0) 627; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 628; GFX7LESS-NEXT: s_mov_b32 s2, -1 629; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 630; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 631; GFX7LESS-NEXT: s_waitcnt expcnt(0) 632; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 633; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 634; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 635; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 636; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 637; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 638; GFX7LESS-NEXT: s_endpgm 639; 640; GFX89-LABEL: add_i64_constant: 641; GFX89: ; %bb.0: ; %entry 642; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 643; GFX89-NEXT: s_mov_b64 s[6:7], exec 644; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 645; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 646; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 647; GFX89-NEXT: ; implicit-def: $vgpr0_vgpr1 648; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc 649; GFX89-NEXT: s_cbranch_execz BB3_2 650; GFX89-NEXT: ; %bb.1: 651; GFX89-NEXT: s_waitcnt lgkmcnt(0) 652; GFX89-NEXT: s_mov_b32 s8, s2 653; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 654; GFX89-NEXT: s_mul_i32 s2, s2, 5 655; GFX89-NEXT: s_mov_b32 s11, 0xf000 656; GFX89-NEXT: s_mov_b32 s10, -1 657; GFX89-NEXT: s_mov_b32 s9, s3 658; GFX89-NEXT: v_mov_b32_e32 v0, s2 659; GFX89-NEXT: v_mov_b32_e32 v1, 0 660; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 661; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 662; GFX89-NEXT: s_waitcnt vmcnt(0) 663; GFX89-NEXT: buffer_wbinvl1_vol 664; GFX89-NEXT: BB3_2: 665; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] 666; GFX89-NEXT: s_waitcnt lgkmcnt(0) 667; GFX89-NEXT: v_readfirstlane_b32 s2, v0 668; GFX89-NEXT: v_readfirstlane_b32 s3, v1 669; GFX89-NEXT: v_mov_b32_e32 v0, s2 670; GFX89-NEXT: v_mov_b32_e32 v1, s3 671; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 672; GFX89-NEXT: s_mov_b32 s3, 0xf000 673; GFX89-NEXT: s_mov_b32 s2, -1 674; GFX89-NEXT: s_nop 2 675; GFX89-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 676; GFX89-NEXT: s_endpgm 677; 678; GFX1064-LABEL: add_i64_constant: 679; GFX1064: ; %bb.0: ; %entry 680; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 681; GFX1064-NEXT: s_mov_b64 s[6:7], exec 682; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 683; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 684; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 685; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 686; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 687; GFX1064-NEXT: s_cbranch_execz BB3_2 688; GFX1064-NEXT: ; %bb.1: 689; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 690; GFX1064-NEXT: v_mov_b32_e32 v1, 0 691; GFX1064-NEXT: s_mul_i32 s6, s6, 5 692; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 693; GFX1064-NEXT: v_mov_b32_e32 v0, s6 694; GFX1064-NEXT: s_mov_b32 s10, -1 695; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 696; GFX1064-NEXT: s_mov_b32 s8, s2 697; GFX1064-NEXT: s_mov_b32 s9, s3 698; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 699; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 700; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 701; GFX1064-NEXT: s_waitcnt vmcnt(0) 702; GFX1064-NEXT: buffer_gl0_inv 703; GFX1064-NEXT: buffer_gl1_inv 704; GFX1064-NEXT: BB3_2: 705; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 706; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 707; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 708; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 709; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 710; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] 711; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 712; GFX1064-NEXT: s_mov_b32 s2, -1 713; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 714; GFX1064-NEXT: s_endpgm 715; 716; GFX1032-LABEL: add_i64_constant: 717; GFX1032: ; %bb.0: ; %entry 718; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 719; GFX1032-NEXT: s_mov_b32 s5, exec_lo 720; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 721; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 722; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 723; GFX1032-NEXT: 
s_and_saveexec_b32 s4, vcc_lo 724; GFX1032-NEXT: s_cbranch_execz BB3_2 725; GFX1032-NEXT: ; %bb.1: 726; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 727; GFX1032-NEXT: v_mov_b32_e32 v1, 0 728; GFX1032-NEXT: s_mul_i32 s5, s5, 5 729; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 730; GFX1032-NEXT: v_mov_b32_e32 v0, s5 731; GFX1032-NEXT: s_mov_b32 s10, -1 732; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 733; GFX1032-NEXT: s_mov_b32 s8, s2 734; GFX1032-NEXT: s_mov_b32 s9, s3 735; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 736; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 737; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 738; GFX1032-NEXT: s_waitcnt vmcnt(0) 739; GFX1032-NEXT: buffer_gl0_inv 740; GFX1032-NEXT: buffer_gl1_inv 741; GFX1032-NEXT: BB3_2: 742; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 743; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 744; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 745; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 746; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 747; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] 748; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 749; GFX1032-NEXT: s_mov_b32 s2, -1 750; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 751; GFX1032-NEXT: s_endpgm 752entry: 753 %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel 754 store i64 %old, i64 addrspace(1)* %out 755 ret void 756} 757
; add_i64_uniform: atomicrmw add (acq_rel) of the i64 kernel argument %additive
; to the global i64 at %inout; the returned old value is stored to %out.
; In the expected code for every run line, only the lane whose mbcnt result is 0
; issues a single buffer_atomic_add_x2, whose operand is %additive multiplied by
; the s_bcnt1 count of the saved exec mask. After the branch rejoins, each lane
; adds %additive times its own mbcnt value to the v_readfirstlane'd result
; before the store.
; NOTE(review): the assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
758define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) { 759; GFX7LESS-LABEL: add_i64_uniform: 760; GFX7LESS: ; %bb.0: ; %entry 761; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec 762; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 763; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 764; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 765; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 766; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 767; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 768; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 769; GFX7LESS-NEXT: s_cbranch_execz BB4_2 770; GFX7LESS-NEXT: ; 
%bb.1: 771; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 772; GFX7LESS-NEXT: s_mov_b32 s14, -1 773; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 774; GFX7LESS-NEXT: s_mov_b32 s12, s6 775; GFX7LESS-NEXT: s_mov_b32 s13, s7 776; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 777; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 778; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 779; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 780; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 781; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 782; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 783; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 784; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 785; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 786; GFX7LESS-NEXT: buffer_wbinvl1 787; GFX7LESS-NEXT: BB4_2: 788; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 789; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 790; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 791; GFX7LESS-NEXT: s_mov_b32 s6, -1 792; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 793; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 794; GFX7LESS-NEXT: s_waitcnt expcnt(0) 795; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 796; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 797; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 798; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 799; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 800; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v2 801; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 802; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 803; GFX7LESS-NEXT: s_endpgm 804; 805; GFX8-LABEL: add_i64_uniform: 806; GFX8: ; %bb.0: ; %entry 807; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 808; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 809; GFX8-NEXT: s_mov_b64 s[8:9], exec 810; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 811; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 812; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 813; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 814; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 815; GFX8-NEXT: s_cbranch_execz BB4_2 816; GFX8-NEXT: ; %bb.1: 817; GFX8-NEXT: s_waitcnt lgkmcnt(0) 818; 
GFX8-NEXT: s_mov_b32 s12, s6 819; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 820; GFX8-NEXT: v_mov_b32_e32 v0, s6 821; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 822; GFX8-NEXT: s_mov_b32 s13, s7 823; GFX8-NEXT: s_mul_i32 s7, s1, s6 824; GFX8-NEXT: s_mul_i32 s6, s0, s6 825; GFX8-NEXT: s_mov_b32 s15, 0xf000 826; GFX8-NEXT: s_mov_b32 s14, -1 827; GFX8-NEXT: v_add_u32_e32 v1, vcc, s7, v0 828; GFX8-NEXT: v_mov_b32_e32 v0, s6 829; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 830; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 831; GFX8-NEXT: s_waitcnt vmcnt(0) 832; GFX8-NEXT: buffer_wbinvl1_vol 833; GFX8-NEXT: BB4_2: 834; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 835; GFX8-NEXT: v_readfirstlane_b32 s2, v0 836; GFX8-NEXT: s_waitcnt lgkmcnt(0) 837; GFX8-NEXT: v_mul_lo_u32 v0, s1, v2 838; GFX8-NEXT: v_mul_hi_u32 v3, s0, v2 839; GFX8-NEXT: v_readfirstlane_b32 s1, v1 840; GFX8-NEXT: v_mul_lo_u32 v1, s0, v2 841; GFX8-NEXT: s_mov_b32 s7, 0xf000 842; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 843; GFX8-NEXT: v_mov_b32_e32 v3, s1 844; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v1 845; GFX8-NEXT: s_mov_b32 s6, -1 846; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc 847; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 848; GFX8-NEXT: s_endpgm 849; 850; GFX9-LABEL: add_i64_uniform: 851; GFX9: ; %bb.0: ; %entry 852; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 853; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 854; GFX9-NEXT: s_mov_b64 s[8:9], exec 855; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 856; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 857; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 858; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 859; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 860; GFX9-NEXT: s_cbranch_execz BB4_2 861; GFX9-NEXT: ; %bb.1: 862; GFX9-NEXT: s_waitcnt lgkmcnt(0) 863; GFX9-NEXT: s_mov_b32 s12, s6 864; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 865; GFX9-NEXT: s_mov_b32 s13, s7 866; GFX9-NEXT: s_mul_i32 s7, s3, s6 867; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 868; GFX9-NEXT: s_add_i32 s8, s8, 
s7 869; GFX9-NEXT: s_mul_i32 s6, s2, s6 870; GFX9-NEXT: s_mov_b32 s15, 0xf000 871; GFX9-NEXT: s_mov_b32 s14, -1 872; GFX9-NEXT: v_mov_b32_e32 v0, s6 873; GFX9-NEXT: v_mov_b32_e32 v1, s8 874; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 875; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 876; GFX9-NEXT: s_waitcnt vmcnt(0) 877; GFX9-NEXT: buffer_wbinvl1_vol 878; GFX9-NEXT: BB4_2: 879; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 880; GFX9-NEXT: s_waitcnt lgkmcnt(0) 881; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 882; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 883; GFX9-NEXT: v_readfirstlane_b32 s0, v0 884; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 885; GFX9-NEXT: v_readfirstlane_b32 s1, v1 886; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 887; GFX9-NEXT: v_mov_b32_e32 v2, s1 888; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 889; GFX9-NEXT: s_mov_b32 s7, 0xf000 890; GFX9-NEXT: s_mov_b32 s6, -1 891; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 892; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 893; GFX9-NEXT: s_endpgm 894; 895; GFX1064-LABEL: add_i64_uniform: 896; GFX1064: ; %bb.0: ; %entry 897; GFX1064-NEXT: s_clause 0x1 898; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 899; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 900; GFX1064-NEXT: s_mov_b64 s[8:9], exec 901; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 902; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 903; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 904; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 905; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 906; GFX1064-NEXT: s_cbranch_execz BB4_2 907; GFX1064-NEXT: ; %bb.1: 908; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] 909; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 910; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 911; GFX1064-NEXT: s_mul_i32 s9, s3, s8 912; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 913; GFX1064-NEXT: s_mul_i32 s8, s2, s8 914; GFX1064-NEXT: s_add_i32 s10, s10, s9 915; GFX1064-NEXT: v_mov_b32_e32 v0, s8 916; GFX1064-NEXT: v_mov_b32_e32 v1, s10 917; GFX1064-NEXT: s_mov_b32 s10, -1 
918; GFX1064-NEXT: s_mov_b32 s8, s6 919; GFX1064-NEXT: s_mov_b32 s9, s7 920; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 921; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 922; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 923; GFX1064-NEXT: s_waitcnt vmcnt(0) 924; GFX1064-NEXT: buffer_gl0_inv 925; GFX1064-NEXT: buffer_gl1_inv 926; GFX1064-NEXT: BB4_2: 927; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 928; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 929; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 930; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2 931; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v2 932; GFX1064-NEXT: v_mul_lo_u32 v2, s2, v2 933; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 934; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 935; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 936; GFX1064-NEXT: s_mov_b32 s6, -1 937; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 938; GFX1064-NEXT: v_add_co_u32 v0, vcc, s0, v2 939; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s1, v1, vcc 940; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 941; GFX1064-NEXT: s_endpgm 942; 943; GFX1032-LABEL: add_i64_uniform: 944; GFX1032: ; %bb.0: ; %entry 945; GFX1032-NEXT: s_clause 0x1 946; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 947; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 948; GFX1032-NEXT: s_mov_b32 s8, exec_lo 949; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 950; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 951; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 952; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 953; GFX1032-NEXT: s_cbranch_execz BB4_2 954; GFX1032-NEXT: ; %bb.1: 955; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 956; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 957; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 958; GFX1032-NEXT: s_mul_i32 s8, s3, s1 959; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 960; GFX1032-NEXT: s_mul_i32 s1, s2, s1 961; GFX1032-NEXT: s_add_i32 s9, s9, s8 962; GFX1032-NEXT: v_mov_b32_e32 v0, s1 963; GFX1032-NEXT: v_mov_b32_e32 v1, s9 964; GFX1032-NEXT: s_mov_b32 s10, -1 965; GFX1032-NEXT: s_mov_b32 
s8, s6 966; GFX1032-NEXT: s_mov_b32 s9, s7 967; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 968; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 969; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 970; GFX1032-NEXT: s_waitcnt vmcnt(0) 971; GFX1032-NEXT: buffer_gl0_inv 972; GFX1032-NEXT: buffer_gl1_inv 973; GFX1032-NEXT: BB4_2: 974; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 975; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 976; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 977; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2 978; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v2 979; GFX1032-NEXT: v_mul_lo_u32 v2, s2, v2 980; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 981; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 982; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 983; GFX1032-NEXT: s_mov_b32 s6, -1 984; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 985; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s0, v2 986; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 987; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 988; GFX1032-NEXT: s_endpgm 989entry: 990 %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel 991 store i64 %old, i64 addrspace(1)* %out 992 ret void 993} 994
; add_i64_varying: atomicrmw add (acq_rel) of the zero-extended
; llvm.amdgcn.workitem.id.x value to the global i64 at %inout; the returned old
; value is stored to %out.
; In the expected code for every run line, buffer_atomic_add_x2 is issued
; directly on v[0:1] (v1 is set to 0 for the high half), with no
; mbcnt / s_and_saveexec / v_readfirstlane sequence around it.
; NOTE(review): presumably the atomic optimizer leaves a per-lane 64-bit
; operand untouched in this configuration - confirm against the pass.
995define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 996; GFX7LESS-LABEL: add_i64_varying: 997; GFX7LESS: ; %bb.0: ; %entry 998; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 999; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1000; GFX7LESS-NEXT: s_mov_b32 s6, -1 1001; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1002; GFX7LESS-NEXT: s_mov_b32 s10, s6 1003; GFX7LESS-NEXT: s_mov_b32 s11, s7 1004; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1005; GFX7LESS-NEXT: s_mov_b32 s8, s2 1006; GFX7LESS-NEXT: s_mov_b32 s9, s3 1007; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1008; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1009; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1010; GFX7LESS-NEXT: buffer_wbinvl1 1011; GFX7LESS-NEXT: s_mov_b32 s4, s0 1012; GFX7LESS-NEXT: s_mov_b32 s5, s1 1013; 
GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1014; GFX7LESS-NEXT: s_endpgm 1015; 1016; GFX89-LABEL: add_i64_varying: 1017; GFX89: ; %bb.0: ; %entry 1018; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1019; GFX89-NEXT: s_mov_b32 s3, 0xf000 1020; GFX89-NEXT: s_mov_b32 s2, -1 1021; GFX89-NEXT: v_mov_b32_e32 v1, 0 1022; GFX89-NEXT: s_waitcnt lgkmcnt(0) 1023; GFX89-NEXT: s_mov_b32 s0, s4 1024; GFX89-NEXT: s_mov_b32 s1, s5 1025; GFX89-NEXT: s_mov_b32 s4, s6 1026; GFX89-NEXT: s_mov_b32 s5, s7 1027; GFX89-NEXT: s_mov_b32 s6, s2 1028; GFX89-NEXT: s_mov_b32 s7, s3 1029; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1030; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 glc 1031; GFX89-NEXT: s_waitcnt vmcnt(0) 1032; GFX89-NEXT: buffer_wbinvl1_vol 1033; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1034; GFX89-NEXT: s_endpgm 1035; 1036; GFX10-LABEL: add_i64_varying: 1037; GFX10: ; %bb.0: ; %entry 1038; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1039; GFX10-NEXT: v_mov_b32_e32 v1, 0 1040; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1041; GFX10-NEXT: s_mov_b32 s6, -1 1042; GFX10-NEXT: s_mov_b32 s11, s7 1043; GFX10-NEXT: s_mov_b32 s10, s6 1044; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1045; GFX10-NEXT: s_mov_b32 s8, s2 1046; GFX10-NEXT: s_mov_b32 s9, s3 1047; GFX10-NEXT: s_mov_b32 s4, s0 1048; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1049; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1050; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1051; GFX10-NEXT: s_waitcnt vmcnt(0) 1052; GFX10-NEXT: buffer_gl0_inv 1053; GFX10-NEXT: buffer_gl1_inv 1054; GFX10-NEXT: s_mov_b32 s5, s1 1055; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1056; GFX10-NEXT: s_endpgm 1057entry: 1058 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1059 %zext = zext i32 %lane to i64 1060 %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel 1061 store i64 %old, i64 addrspace(1)* %out 1062 ret void 1063} 1064
; sub_i32_constant: atomicrmw sub (acq_rel) of the constant 5 from the global
; i32 at %inout; the returned old value is stored to %out.
; In the expected code for every run line, only the lane whose mbcnt result is 0
; issues a single buffer_atomic_sub, whose operand is 5 multiplied by the
; s_bcnt1 count of the saved exec mask. After the branch rejoins, each lane
; subtracts 5 times its own mbcnt value from the v_readfirstlane'd result
; before the store.
; NOTE(review): the assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1065define amdgpu_kernel void @sub_i32_constant(i32 
addrspace(1)* %out, i32 addrspace(1)* %inout) { 1066; GFX7LESS-LABEL: sub_i32_constant: 1067; GFX7LESS: ; %bb.0: ; %entry 1068; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1069; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1070; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1071; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1072; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1073; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1074; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1075; GFX7LESS-NEXT: s_cbranch_execz BB6_2 1076; GFX7LESS-NEXT: ; %bb.1: 1077; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 1078; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1079; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 1080; GFX7LESS-NEXT: s_mov_b32 s10, -1 1081; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1082; GFX7LESS-NEXT: s_mov_b32 s8, s2 1083; GFX7LESS-NEXT: s_mov_b32 s9, s3 1084; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1085; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1086; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1087; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1088; GFX7LESS-NEXT: buffer_wbinvl1 1089; GFX7LESS-NEXT: BB6_2: 1090; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1091; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1092; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1093; GFX7LESS-NEXT: s_mov_b32 s2, -1 1094; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1095; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1096; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1097; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1098; GFX7LESS-NEXT: s_endpgm 1099; 1100; GFX8-LABEL: sub_i32_constant: 1101; GFX8: ; %bb.0: ; %entry 1102; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1103; GFX8-NEXT: s_mov_b64 s[6:7], exec 1104; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1105; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1106; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1107; GFX8-NEXT: ; implicit-def: $vgpr1 1108; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1109; GFX8-NEXT: s_cbranch_execz BB6_2 1110; GFX8-NEXT: ; %bb.1: 1111; GFX8-NEXT: s_waitcnt lgkmcnt(0) 
1112; GFX8-NEXT: s_mov_b32 s8, s2 1113; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1114; GFX8-NEXT: s_mul_i32 s2, s2, 5 1115; GFX8-NEXT: s_mov_b32 s11, 0xf000 1116; GFX8-NEXT: s_mov_b32 s10, -1 1117; GFX8-NEXT: s_mov_b32 s9, s3 1118; GFX8-NEXT: v_mov_b32_e32 v1, s2 1119; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1120; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1121; GFX8-NEXT: s_waitcnt vmcnt(0) 1122; GFX8-NEXT: buffer_wbinvl1_vol 1123; GFX8-NEXT: BB6_2: 1124; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1125; GFX8-NEXT: v_readfirstlane_b32 s4, v1 1126; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1127; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1128; GFX8-NEXT: s_mov_b32 s3, 0xf000 1129; GFX8-NEXT: s_mov_b32 s2, -1 1130; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 1131; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1132; GFX8-NEXT: s_endpgm 1133; 1134; GFX9-LABEL: sub_i32_constant: 1135; GFX9: ; %bb.0: ; %entry 1136; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1137; GFX9-NEXT: s_mov_b64 s[6:7], exec 1138; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1139; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1140; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1141; GFX9-NEXT: ; implicit-def: $vgpr1 1142; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1143; GFX9-NEXT: s_cbranch_execz BB6_2 1144; GFX9-NEXT: ; %bb.1: 1145; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1146; GFX9-NEXT: s_mov_b32 s8, s2 1147; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1148; GFX9-NEXT: s_mul_i32 s2, s2, 5 1149; GFX9-NEXT: s_mov_b32 s11, 0xf000 1150; GFX9-NEXT: s_mov_b32 s10, -1 1151; GFX9-NEXT: s_mov_b32 s9, s3 1152; GFX9-NEXT: v_mov_b32_e32 v1, s2 1153; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1154; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1155; GFX9-NEXT: s_waitcnt vmcnt(0) 1156; GFX9-NEXT: buffer_wbinvl1_vol 1157; GFX9-NEXT: BB6_2: 1158; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1159; GFX9-NEXT: v_readfirstlane_b32 s4, v1 1160; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1161; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1162; GFX9-NEXT: s_mov_b32 s3, 
0xf000 1163; GFX9-NEXT: s_mov_b32 s2, -1 1164; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1165; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1166; GFX9-NEXT: s_endpgm 1167; 1168; GFX1064-LABEL: sub_i32_constant: 1169; GFX1064: ; %bb.0: ; %entry 1170; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1171; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1172; GFX1064-NEXT: ; implicit-def: $vgpr1 1173; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1174; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1175; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1176; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1177; GFX1064-NEXT: s_cbranch_execz BB6_2 1178; GFX1064-NEXT: ; %bb.1: 1179; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1180; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 1181; GFX1064-NEXT: s_mul_i32 s6, s6, 5 1182; GFX1064-NEXT: s_mov_b32 s10, -1 1183; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1184; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1185; GFX1064-NEXT: s_mov_b32 s8, s2 1186; GFX1064-NEXT: s_mov_b32 s9, s3 1187; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1188; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1189; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1190; GFX1064-NEXT: s_waitcnt vmcnt(0) 1191; GFX1064-NEXT: buffer_gl0_inv 1192; GFX1064-NEXT: buffer_gl1_inv 1193; GFX1064-NEXT: BB6_2: 1194; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1195; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1196; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1197; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1198; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1199; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1200; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1201; GFX1064-NEXT: s_mov_b32 s2, -1 1202; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1203; GFX1064-NEXT: s_endpgm 1204; 1205; GFX1032-LABEL: sub_i32_constant: 1206; GFX1032: ; %bb.0: ; %entry 1207; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1208; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1209; GFX1032-NEXT: ; implicit-def: $vgpr1 1210; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 1211; 
GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1212; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1213; GFX1032-NEXT: s_cbranch_execz BB6_2 1214; GFX1032-NEXT: ; %bb.1: 1215; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1216; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 1217; GFX1032-NEXT: s_mul_i32 s5, s5, 5 1218; GFX1032-NEXT: s_mov_b32 s10, -1 1219; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1220; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1221; GFX1032-NEXT: s_mov_b32 s8, s2 1222; GFX1032-NEXT: s_mov_b32 s9, s3 1223; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1224; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1225; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1226; GFX1032-NEXT: s_waitcnt vmcnt(0) 1227; GFX1032-NEXT: buffer_gl0_inv 1228; GFX1032-NEXT: buffer_gl1_inv 1229; GFX1032-NEXT: BB6_2: 1230; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1231; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1232; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1233; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1234; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1235; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1236; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1237; GFX1032-NEXT: s_mov_b32 s2, -1 1238; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1239; GFX1032-NEXT: s_endpgm 1240entry: 1241 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel 1242 store i32 %old, i32 addrspace(1)* %out 1243 ret void 1244} 1245
; sub_i32_uniform: atomicrmw sub (acq_rel) of the i32 kernel argument %subitive
; from the global i32 at %inout; the returned old value is stored to %out.
; In the expected code for every run line, only the lane whose mbcnt result is 0
; issues a single buffer_atomic_sub, whose operand is %subitive multiplied by
; the s_bcnt1 count of the saved exec mask. After the branch rejoins, each lane
; subtracts %subitive times its own mbcnt value (v_mul_lo_u32) from the
; v_readfirstlane'd result before the store.
; NOTE(review): the assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1246define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) { 1247; GFX7LESS-LABEL: sub_i32_uniform: 1248; GFX7LESS: ; %bb.0: ; %entry 1249; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1250; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1251; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd 1252; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1253; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1254; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1255; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1256; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1257; GFX7LESS-NEXT: 
s_cbranch_execz BB7_2 1258; GFX7LESS-NEXT: ; %bb.1: 1259; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 1260; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1261; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1262; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 1263; GFX7LESS-NEXT: s_mov_b32 s14, -1 1264; GFX7LESS-NEXT: s_mov_b32 s12, s6 1265; GFX7LESS-NEXT: s_mov_b32 s13, s7 1266; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 1267; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1268; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1269; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1270; GFX7LESS-NEXT: buffer_wbinvl1 1271; GFX7LESS-NEXT: BB7_2: 1272; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1273; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1274; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1275; GFX7LESS-NEXT: s_mov_b32 s6, -1 1276; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1277; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 1278; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1279; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1280; GFX7LESS-NEXT: s_endpgm 1281; 1282; GFX8-LABEL: sub_i32_uniform: 1283; GFX8: ; %bb.0: ; %entry 1284; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1285; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 1286; GFX8-NEXT: s_mov_b64 s[2:3], exec 1287; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1288; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1289; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1290; GFX8-NEXT: ; implicit-def: $vgpr1 1291; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1292; GFX8-NEXT: s_cbranch_execz BB7_2 1293; GFX8-NEXT: ; %bb.1: 1294; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1295; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1296; GFX8-NEXT: s_mul_i32 s2, s8, s2 1297; GFX8-NEXT: s_mov_b32 s15, 0xf000 1298; GFX8-NEXT: s_mov_b32 s14, -1 1299; GFX8-NEXT: s_mov_b32 s12, s6 1300; GFX8-NEXT: s_mov_b32 s13, s7 1301; GFX8-NEXT: v_mov_b32_e32 v1, s2 1302; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1303; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1304; GFX8-NEXT: s_waitcnt vmcnt(0) 1305; GFX8-NEXT: buffer_wbinvl1_vol 1306; 
GFX8-NEXT: BB7_2: 1307; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1308; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1309; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 1310; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1311; GFX8-NEXT: s_mov_b32 s7, 0xf000 1312; GFX8-NEXT: s_mov_b32 s6, -1 1313; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1314; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1315; GFX8-NEXT: s_endpgm 1316; 1317; GFX9-LABEL: sub_i32_uniform: 1318; GFX9: ; %bb.0: ; %entry 1319; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1320; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 1321; GFX9-NEXT: s_mov_b64 s[2:3], exec 1322; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1323; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1324; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1325; GFX9-NEXT: ; implicit-def: $vgpr1 1326; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1327; GFX9-NEXT: s_cbranch_execz BB7_2 1328; GFX9-NEXT: ; %bb.1: 1329; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1330; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1331; GFX9-NEXT: s_mul_i32 s2, s8, s2 1332; GFX9-NEXT: s_mov_b32 s15, 0xf000 1333; GFX9-NEXT: s_mov_b32 s14, -1 1334; GFX9-NEXT: s_mov_b32 s12, s6 1335; GFX9-NEXT: s_mov_b32 s13, s7 1336; GFX9-NEXT: v_mov_b32_e32 v1, s2 1337; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1338; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1339; GFX9-NEXT: s_waitcnt vmcnt(0) 1340; GFX9-NEXT: buffer_wbinvl1_vol 1341; GFX9-NEXT: BB7_2: 1342; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1343; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1344; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 1345; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1346; GFX9-NEXT: s_mov_b32 s7, 0xf000 1347; GFX9-NEXT: s_mov_b32 s6, -1 1348; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1349; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1350; GFX9-NEXT: s_endpgm 1351; 1352; GFX1064-LABEL: sub_i32_uniform: 1353; GFX1064: ; %bb.0: ; %entry 1354; GFX1064-NEXT: s_clause 0x1 1355; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1356; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 1357; GFX1064-NEXT: 
s_mov_b64 s[2:3], exec 1358; GFX1064-NEXT: ; implicit-def: $vgpr1 1359; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1360; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1361; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1362; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1363; GFX1064-NEXT: s_cbranch_execz BB7_2 1364; GFX1064-NEXT: ; %bb.1: 1365; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1366; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 1367; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1368; GFX1064-NEXT: s_mul_i32 s2, s8, s2 1369; GFX1064-NEXT: s_mov_b32 s14, -1 1370; GFX1064-NEXT: v_mov_b32_e32 v1, s2 1371; GFX1064-NEXT: s_mov_b32 s12, s6 1372; GFX1064-NEXT: s_mov_b32 s13, s7 1373; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1374; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1375; GFX1064-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1376; GFX1064-NEXT: s_waitcnt vmcnt(0) 1377; GFX1064-NEXT: buffer_gl0_inv 1378; GFX1064-NEXT: buffer_gl1_inv 1379; GFX1064-NEXT: BB7_2: 1380; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1381; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1382; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1383; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 1384; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1385; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1386; GFX1064-NEXT: s_mov_b32 s6, -1 1387; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1388; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1389; GFX1064-NEXT: s_endpgm 1390; 1391; GFX1032-LABEL: sub_i32_uniform: 1392; GFX1032: ; %bb.0: ; %entry 1393; GFX1032-NEXT: s_clause 0x1 1394; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1395; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 1396; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1397; GFX1032-NEXT: ; implicit-def: $vgpr1 1398; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1399; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1400; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1401; GFX1032-NEXT: s_cbranch_execz BB7_2 1402; GFX1032-NEXT: ; %bb.1: 1403; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1404; GFX1032-NEXT: s_mov_b32 
s11, 0x31016000 1405; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1406; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1407; GFX1032-NEXT: s_mov_b32 s10, -1 1408; GFX1032-NEXT: v_mov_b32_e32 v1, s1 1409; GFX1032-NEXT: s_mov_b32 s8, s6 1410; GFX1032-NEXT: s_mov_b32 s9, s7 1411; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1412; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1413; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1414; GFX1032-NEXT: s_waitcnt vmcnt(0) 1415; GFX1032-NEXT: buffer_gl0_inv 1416; GFX1032-NEXT: buffer_gl1_inv 1417; GFX1032-NEXT: BB7_2: 1418; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1419; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1420; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1421; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1422; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1423; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1424; GFX1032-NEXT: s_mov_b32 s6, -1 1425; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1426; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1427; GFX1032-NEXT: s_endpgm 1428entry: 1429 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel 1430 store i32 %old, i32 addrspace(1)* %out 1431 ret void 1432} 1433 1434define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 1435; GFX7LESS-LABEL: sub_i32_varying: 1436; GFX7LESS: ; %bb.0: ; %entry 1437; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1438; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1439; GFX7LESS-NEXT: s_mov_b32 s6, -1 1440; GFX7LESS-NEXT: s_mov_b32 s10, s6 1441; GFX7LESS-NEXT: s_mov_b32 s11, s7 1442; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1443; GFX7LESS-NEXT: s_mov_b32 s8, s2 1444; GFX7LESS-NEXT: s_mov_b32 s9, s3 1445; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1446; GFX7LESS-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1447; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1448; GFX7LESS-NEXT: buffer_wbinvl1 1449; GFX7LESS-NEXT: s_mov_b32 s4, s0 1450; GFX7LESS-NEXT: s_mov_b32 s5, s1 1451; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1452; GFX7LESS-NEXT: 
s_endpgm 1453; 1454; GFX8-LABEL: sub_i32_varying: 1455; GFX8: ; %bb.0: ; %entry 1456; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1457; GFX8-NEXT: v_mov_b32_e32 v2, v0 1458; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1459; GFX8-NEXT: v_mov_b32_e32 v1, 0 1460; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1461; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1462; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1463; GFX8-NEXT: s_not_b64 exec, exec 1464; GFX8-NEXT: v_mov_b32_e32 v2, 0 1465; GFX8-NEXT: s_not_b64 exec, exec 1466; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1467; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1468; GFX8-NEXT: s_nop 1 1469; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1470; GFX8-NEXT: s_nop 1 1471; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1472; GFX8-NEXT: s_nop 1 1473; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1474; GFX8-NEXT: s_nop 1 1475; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1476; GFX8-NEXT: s_nop 1 1477; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1478; GFX8-NEXT: v_readlane_b32 s6, v2, 63 1479; GFX8-NEXT: s_nop 0 1480; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1481; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1482; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1483; GFX8-NEXT: ; implicit-def: $vgpr0 1484; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1485; GFX8-NEXT: s_cbranch_execz BB8_2 1486; GFX8-NEXT: ; %bb.1: 1487; GFX8-NEXT: s_mov_b32 s11, 0xf000 1488; GFX8-NEXT: s_mov_b32 s10, -1 1489; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1490; GFX8-NEXT: s_mov_b32 s8, s2 1491; GFX8-NEXT: s_mov_b32 s9, s3 1492; GFX8-NEXT: v_mov_b32_e32 v0, s6 1493; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1494; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1495; GFX8-NEXT: s_waitcnt vmcnt(0) 1496; 
GFX8-NEXT: buffer_wbinvl1_vol 1497; GFX8-NEXT: BB8_2: 1498; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1499; GFX8-NEXT: v_readfirstlane_b32 s4, v0 1500; GFX8-NEXT: v_mov_b32_e32 v0, v1 1501; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1502; GFX8-NEXT: s_mov_b32 s3, 0xf000 1503; GFX8-NEXT: s_mov_b32 s2, -1 1504; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 1505; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1506; GFX8-NEXT: s_endpgm 1507; 1508; GFX9-LABEL: sub_i32_varying: 1509; GFX9: ; %bb.0: ; %entry 1510; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1511; GFX9-NEXT: v_mov_b32_e32 v2, v0 1512; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1513; GFX9-NEXT: v_mov_b32_e32 v1, 0 1514; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1515; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1516; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1517; GFX9-NEXT: s_not_b64 exec, exec 1518; GFX9-NEXT: v_mov_b32_e32 v2, 0 1519; GFX9-NEXT: s_not_b64 exec, exec 1520; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1521; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1522; GFX9-NEXT: s_nop 1 1523; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1524; GFX9-NEXT: s_nop 1 1525; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1526; GFX9-NEXT: s_nop 1 1527; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1528; GFX9-NEXT: s_nop 1 1529; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1530; GFX9-NEXT: s_nop 1 1531; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1532; GFX9-NEXT: v_readlane_b32 s6, v2, 63 1533; GFX9-NEXT: s_nop 0 1534; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1535; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1536; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1537; GFX9-NEXT: ; implicit-def: $vgpr0 1538; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1539; GFX9-NEXT: s_cbranch_execz BB8_2 1540; GFX9-NEXT: ; 
%bb.1: 1541; GFX9-NEXT: s_mov_b32 s11, 0xf000 1542; GFX9-NEXT: s_mov_b32 s10, -1 1543; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1544; GFX9-NEXT: s_mov_b32 s8, s2 1545; GFX9-NEXT: s_mov_b32 s9, s3 1546; GFX9-NEXT: v_mov_b32_e32 v0, s6 1547; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1548; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1549; GFX9-NEXT: s_waitcnt vmcnt(0) 1550; GFX9-NEXT: buffer_wbinvl1_vol 1551; GFX9-NEXT: BB8_2: 1552; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1553; GFX9-NEXT: v_readfirstlane_b32 s4, v0 1554; GFX9-NEXT: v_mov_b32_e32 v0, v1 1555; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1556; GFX9-NEXT: s_mov_b32 s3, 0xf000 1557; GFX9-NEXT: s_mov_b32 s2, -1 1558; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1559; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1560; GFX9-NEXT: s_endpgm 1561; 1562; GFX1064-LABEL: sub_i32_varying: 1563; GFX1064: ; %bb.0: ; %entry 1564; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1565; GFX1064-NEXT: s_not_b64 exec, exec 1566; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1567; GFX1064-NEXT: s_not_b64 exec, exec 1568; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1569; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1570; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1571; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1572; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1573; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1574; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1575; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1576; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1577; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 1578; GFX1064-NEXT: v_mov_b32_e32 v2, s4 1579; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1580; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 1581; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf 1582; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1583; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1584; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1585; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 1586; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 1587; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1588; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1589; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1590; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 1591; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 1592; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 1593; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1594; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1595; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 1596; GFX1064-NEXT: s_mov_b32 s4, s9 1597; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 1598; GFX1064-NEXT: s_mov_b64 exec, s[6:7] 1599; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1600; GFX1064-NEXT: s_mov_b32 s6, -1 1601; GFX1064-NEXT: ; implicit-def: $vgpr0 1602; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc 1603; GFX1064-NEXT: s_cbranch_execz BB8_2 1604; GFX1064-NEXT: ; %bb.1: 1605; GFX1064-NEXT: v_mov_b32_e32 v0, s4 1606; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1607; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1608; GFX1064-NEXT: s_mov_b32 s4, s2 1609; GFX1064-NEXT: s_mov_b32 s5, s3 1610; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1611; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1612; GFX1064-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 1613; GFX1064-NEXT: s_waitcnt vmcnt(0) 1614; GFX1064-NEXT: buffer_gl0_inv 1615; GFX1064-NEXT: buffer_gl1_inv 1616; GFX1064-NEXT: BB8_2: 1617; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1618; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] 1619; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1620; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1621; GFX1064-NEXT: v_mov_b32_e32 v0, v3 1622; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1623; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1624; GFX1064-NEXT: s_mov_b32 s2, s6 1625; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1626; GFX1064-NEXT: s_endpgm 
1627; 1628; GFX1032-LABEL: sub_i32_varying: 1629; GFX1032: ; %bb.0: ; %entry 1630; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1631; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1632; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1633; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1634; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1635; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1636; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1637; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1638; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1639; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1640; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1641; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1642; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1643; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 1644; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1645; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1646; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 1647; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 1648; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1649; GFX1032-NEXT: s_mov_b32 exec_lo, s4 1650; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1651; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 1652; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 1653; GFX1032-NEXT: s_mov_b32 exec_lo, s4 1654; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1655; GFX1032-NEXT: s_mov_b32 s4, s6 1656; GFX1032-NEXT: s_mov_b32 s6, -1 1657; GFX1032-NEXT: ; implicit-def: $vgpr0 1658; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo 1659; GFX1032-NEXT: s_cbranch_execz BB8_2 1660; GFX1032-NEXT: ; %bb.1: 1661; GFX1032-NEXT: v_mov_b32_e32 v0, s4 1662; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1663; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1664; GFX1032-NEXT: s_mov_b32 s4, s2 1665; GFX1032-NEXT: s_mov_b32 s5, s3 1666; GFX1032-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:    buffer_gl1_inv
; GFX1032-NEXT:  BB8_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
; GFX1032-NEXT:    s_mov_b32 s2, s6
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; 64-bit atomic sub of the constant 5 on a global pointer. In the expected
; output only the lane whose mbcnt result is 0 issues the atomic, using an
; operand of 5 * popcount(exec mask) (s_bcnt1 followed by s_mul_i32).
; Afterwards every lane rebuilds its own return value as
; readfirstlane(old) - 5 * mbcnt before storing it to %out.
define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
; GFX7LESS-LABEL: sub_i64_constant:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT:    s_cbranch_execz BB9_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
; GFX7LESS-NEXT:    s_mov_b32 s10, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s8, s2
; GFX7LESS-NEXT:    s_mov_b32 s9, s3
; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
; GFX7LESS-NEXT:    buffer_wbinvl1
; GFX7LESS-NEXT:  BB9_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i64_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    s_mov_b64 s[6:7], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT:    s_cbranch_execz BB9_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s8, s2
; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
; GFX8-NEXT:    s_mul_i32 s2, s2, 5
; GFX8-NEXT:    s_mov_b32 s11, 0xf000
; GFX8-NEXT:    s_mov_b32 s10, -1
; GFX8-NEXT:    s_mov_b32 s9, s3
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:  BB9_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX8-NEXT:    v_mov_b32_e32 v2, s5
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i64_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b64 s[6:7], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT:    s_cbranch_execz BB9_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
; GFX9-NEXT:    s_mul_i32 s2, s2, 5
; GFX9-NEXT:    s_mov_b32 s11, 0xf000
; GFX9-NEXT:    s_mov_b32 s10, -1
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:  BB9_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX9-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: sub_i64_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz BB9_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
; GFX1064-NEXT:    s_mov_b32 s10, -1
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    s_mov_b32 s8, s2
; GFX1064-NEXT:    s_mov_b32 s9, s3
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:    buffer_gl1_inv
; GFX1064-NEXT:  BB9_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: sub_i64_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB9_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
; GFX1032-NEXT:    s_mov_b32 s10, -1
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    s_mov_b32 s8, s2
; GFX1032-NEXT:    s_mov_b32 s9, s3
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:    buffer_gl1_inv
; GFX1032-NEXT:  BB9_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; 64-bit atomic sub of a kernel argument (%subitive), which is loaded with
; scalar loads. The expected output multiplies the argument by the number of
; active lanes (s_bcnt1 of the saved exec mask) to form the single atomic
; operand, assembling the 64-bit product from 32-bit mul / mul_hi / add.
; After the atomic each lane subtracts %subitive * mbcnt from the broadcast
; old value and stores the result to %out.
define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
; GFX7LESS-LABEL: sub_i64_uniform:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_mov_b64 s[8:9], exec
; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s9, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT:    s_cbranch_execz BB10_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s14, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s12, s6
; GFX7LESS-NEXT:    s_mov_b32 s13, s7
; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
; GFX7LESS-NEXT:    s_mul_i32 s7, s1, s6
; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7LESS-NEXT:    s_mul_i32 s6, s0, s6
; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
; GFX7LESS-NEXT:    buffer_wbinvl1
; GFX7LESS-NEXT:  BB10_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s6, -1
; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
; GFX7LESS-NEXT:    v_readfirstlane_b32 s3, v1
; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s1, v2
; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s0, v2
; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s0, v2
; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s3
; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v2
; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i64_uniform:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_mov_b64 s[8:9], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz BB10_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s12, s6
; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    s_mov_b32 s13, s7
; GFX8-NEXT:    s_mul_i32 s7, s1, s6
; GFX8-NEXT:    s_mul_i32 s6, s0, s6
; GFX8-NEXT:    s_mov_b32 s15, 0xf000
; GFX8-NEXT:    s_mov_b32 s14, -1
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s7, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:  BB10_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u32 v0, s1, v2
; GFX8-NEXT:    v_mul_hi_u32 v3, s0, v2
; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
; GFX8-NEXT:    v_mul_lo_u32 v1, s0, v2
; GFX8-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v0
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v1
; GFX8-NEXT:    s_mov_b32 s6, -1
; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v2, vcc
; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i64_uniform:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b64 s[8:9], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT:    s_cbranch_execz BB10_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s12, s6
; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
; GFX9-NEXT:    s_mov_b32 s13, s7
; GFX9-NEXT:    s_mul_i32 s7, s3, s6
; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
; GFX9-NEXT:    s_add_i32 s8, s8, s7
; GFX9-NEXT:    s_mul_i32 s6, s2, s6
; GFX9-NEXT:    s_mov_b32 s15, 0xf000
; GFX9-NEXT:    s_mov_b32 s14, -1
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s8
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:  BB10_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v2
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: sub_i64_uniform:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_clause 0x1
; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT:    s_cbranch_execz BB10_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    s_mul_i32 s9, s3, s8
; GFX1064-NEXT:    s_mul_hi_u32 s10, s2, s8
; GFX1064-NEXT:    s_mul_i32 s8, s2, s8
; GFX1064-NEXT:    s_add_i32 s10, s10, s9
; GFX1064-NEXT:    v_mov_b32_e32 v0, s8
; GFX1064-NEXT:    v_mov_b32_e32 v1, s10
; GFX1064-NEXT:    s_mov_b32 s10, -1
; GFX1064-NEXT:    s_mov_b32 s8, s6
; GFX1064-NEXT:    s_mov_b32 s9, s7
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:    buffer_gl1_inv
; GFX1064-NEXT:  BB10_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v2
; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v2
; GFX1064-NEXT:    v_mul_lo_u32 v2, s2, v2
; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
; GFX1064-NEXT:    s_mov_b32 s6, -1
; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s0, v2
; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: sub_i64_uniform:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_clause 0x1
; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s8, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB10_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s8
; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    s_mul_i32 s8, s3, s1
; GFX1032-NEXT:    s_mul_hi_u32 s9, s2, s1
; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
; GFX1032-NEXT:    s_add_i32 s9, s9, s8
; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
; GFX1032-NEXT:    v_mov_b32_e32 v1, s9
; GFX1032-NEXT:    s_mov_b32 s10, -1
; GFX1032-NEXT:    s_mov_b32 s8, s6
; GFX1032-NEXT:    s_mov_b32 s9, s7
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:    buffer_gl1_inv
; GFX1032-NEXT:  BB10_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v2
; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v2
; GFX1032-NEXT:    v_mul_lo_u32 v2, s2, v2
; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
; GFX1032-NEXT:    s_mov_b32 s6, -1
; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v2
; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT:    s_endpgm
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; 64-bit atomic sub whose operand is the zero-extended workitem id. The
; expected output for every checked target is a plain buffer_atomic_sub_x2 on
; v[0:1] (v1 set to 0 for the zero extension) with no mbcnt / saveexec
; sequence, i.e. the assertions record that this case is left unoptimized.
define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
; GFX7LESS-LABEL: sub_i64_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s6, -1
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 s10, s6
; GFX7LESS-NEXT:    s_mov_b32 s11, s7
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s8, s2
; GFX7LESS-NEXT:    s_mov_b32 s9, s3
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
; GFX7LESS-NEXT:    buffer_wbinvl1
; GFX7LESS-NEXT:    s_mov_b32 s4, s0
; GFX7LESS-NEXT:    s_mov_b32 s5, s1
; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX89-LABEL: sub_i64_varying:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT:    s_mov_b32 s3, 0xf000
; GFX89-NEXT:    s_mov_b32 s2, -1
; GFX89-NEXT:    v_mov_b32_e32 v1, 0
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s0, s4
; GFX89-NEXT:    s_mov_b32 s1, s5
; GFX89-NEXT:    s_mov_b32 s4, s6
; GFX89-NEXT:    s_mov_b32 s5, s7
; GFX89-NEXT:    s_mov_b32 s6, s2
; GFX89-NEXT:    s_mov_b32 s7, s3
; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX89-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    buffer_wbinvl1_vol
; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX89-NEXT:    s_endpgm
;
; GFX10-LABEL: sub_i64_varying:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_mov_b32 s11, s7
; GFX10-NEXT:    s_mov_b32 s10, s6
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s8, s2
; GFX10-NEXT:    s_mov_b32 s9, s3
; GFX10-NEXT:    s_mov_b32 s4, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    s_mov_b32 s5, s1
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}